diff --git a/build.bat b/build.bat index d021e5a1..f5d27629 100644 --- a/build.bat +++ b/build.bat @@ -104,6 +104,7 @@ if "%rdi_from_pdb%"=="1" set didbuild=1 && %compile% ..\src\rdi_fr if "%rdi_from_dwarf%"=="1" set didbuild=1 && %compile% ..\src\rdi_from_dwarf\rdi_from_dwarf.c %compile_link% %out%rdi_from_dwarf.exe || exit /b 1 if "%rdi_dump%"=="1" set didbuild=1 && %compile% ..\src\rdi_dump\rdi_dump_main.c %compile_link% %out%rdi_dump.exe || exit /b 1 if "%rdi_breakpad_from_pdb%"=="1" set didbuild=1 && %compile% ..\src\rdi_breakpad_from_pdb\rdi_breakpad_from_pdb_main.c %compile_link% %out%rdi_breakpad_from_pdb.exe || exit /b 1 +if "%radlink%"=="1" set didbuild=1 && %compile% ..\src\linker\lnk.c %compile_link% %out%radlink.exe || exit /b 1 if "%tester%"=="1" set didbuild=1 && %compile% ..\src\tester\tester_main.c %compile_link% %out%tester.exe || exit /b 1 if "%ryan_scratch%"=="1" set didbuild=1 && %compile% ..\src\scratch\ryan_scratch.c %compile_link% %out%ryan_scratch.exe || exit /b 1 if "%mule_main%"=="1" set didbuild=1 && del vc*.pdb mule*.pdb && %compile_release% %only_compile% ..\src\mule\mule_inline.cpp && %compile_release% %only_compile% ..\src\mule\mule_o2.cpp && %compile_debug% %EHsc% ..\src\mule\mule_main.cpp ..\src\mule\mule_c.c mule_inline.obj mule_o2.obj %compile_link% %no_aslr% %out%mule_main.exe || exit /b 1 diff --git a/src/linker/base_ext/base_arena.c b/src/linker/base_ext/base_arena.c new file mode 100644 index 00000000..5b6ca528 --- /dev/null +++ b/src/linker/base_ext/base_arena.c @@ -0,0 +1,88 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal String8 +push_cstr(Arena *arena, String8 str) +{ + U64 buffer_size = str.size + 1; + U8 *buffer = push_array_no_zero(arena, U8, buffer_size); + MemoryCopy(buffer, str.str, str.size); + buffer[str.size] = 0; + String8 result = str8(buffer, buffer_size); + return result; +} + +internal U32 * +push_u32(Arena *arena, U32 value) +{ 
+  U32 *result = push_array_no_zero(arena, U32, 1);
+  *result = value;
+  return result;
+}
+
+internal U64 *
+push_u64(Arena *arena, U64 value)
+{
+  U64 *result = push_array_no_zero(arena, U64, 1);
+  *result = value;
+  return result;
+}
+
+internal U32 *
+push_array_copy_u32(Arena *arena, U32 *v, U64 count)
+{
+  U32 *result = push_array_no_zero(arena, U32, count);
+  MemoryCopyTyped(result, v, count);
+  return result;
+}
+
+internal U64 *
+push_array_copy_u64(Arena *arena, U64 *v, U64 count)
+{
+  U64 *result = push_array_no_zero(arena, U64, count);
+  MemoryCopyTyped(result, v, count);
+  return result;
+}
+
+internal U64 **
+push_matrix_u64(Arena *arena, U64 rows, U64 columns)
+{
+  U64 **result = push_array_no_zero(arena, U64 *, rows);
+  for (U64 row_idx = 0; row_idx < rows; row_idx += 1) {
+    result[row_idx] = push_array(arena, U64, columns);
+  }
+  return result;
+}
+
+// Allocates `count` arenas, each created with reserve size `res` and commit
+// size `cmt`, and returns an array of pointers to them. The array lives in
+// `arena` and is preceded by a U64 holding `count`; release_arena_array reads
+// that hidden header to know how many arenas to release.
+internal Arena **
+alloc_fixed_size_arena_array(Arena *arena, U64 count, U64 res, U64 cmt)
+{
+  // layout: [U64 count][Arena *arr[count]]
+  U64 data_size = sizeof(count) + sizeof(Arena *) * count;
+  U8 *data = push_array_no_zero(arena, U8, data_size);
+  U64 *count_ptr = (U64 *)data;
+  Arena **arr = (Arena **)(count_ptr + 1);
+  *count_ptr = count;
+
+  ArenaParams params = {0};
+  params.reserve_size = res;
+  params.commit_size = cmt;
+
+  for (U64 i = 0; i < count; i += 1) {
+    Arena *fixed_arena = arena_alloc_(&params);
+    arr[i] = fixed_arena;
+  }
+
+  return arr;
+}
+
+internal void
+release_arena_array(Arena **arr)
+{
+  U64 *count_ptr = (U64 *)arr - 1;
+  U64 count = *count_ptr;
+  for (U64 i = 0; i < count; i += 1) {
+    arena_release(arr[i]);
+    arr[i] = 0;
+  }
+}
+
diff --git a/src/linker/base_ext/base_arena.h b/src/linker/base_ext/base_arena.h
new file mode 100644
index 00000000..dbafb624
--- /dev/null
+++ b/src/linker/base_ext/base_arena.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+internal U32 * push_u32(Arena *arena, U32 value);
+internal U64 * 
push_u64(Arena *arena, U64 value); +internal U32 * push_array_copy_u32(Arena *arena, U32 *v, U64 count); +internal U64 * push_array_copy_u64(Arena *arena, U64 *v, U64 count); +internal U64 ** push_matrix_u64(Arena *arena, U64 rows, U64 columns); +internal String8 push_cstr(Arena *arena, String8 str); + +internal Arena ** alloc_fixed_size_arena_array(Arena *arena, U64 count, U64 res, U64 cmt); +internal void release_arena_array(Arena **arr); + diff --git a/src/linker/base_ext/base_arrays.c b/src/linker/base_ext/base_arrays.c new file mode 100644 index 00000000..9cc83a8e --- /dev/null +++ b/src/linker/base_ext/base_arrays.c @@ -0,0 +1,230 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal U64 +void_list_count_nodes(VoidNode *head) +{ + U64 node_count = 0; + for (VoidNode *curr = head; curr != 0; curr = curr->next) { + ++node_count; + } + return node_count; +} + +internal void +void_node_concat(VoidNode **head, VoidNode *node) +{ + Assert(*head != node); + node->next = *head; + *head = node; +} + +internal void +void_node_concat_atomic(VoidNode **head, VoidNode *node) +{ + Assert(*head != node); + node->next = ins_atomic_ptr_eval_assign(head, node); +} + +internal U64Node * +u64_list_push(Arena *arena, U64List *list, U64 data) +{ + U64Node *n = push_array(arena, U64Node, 1); + n->next = 0; + n->data = data; + + SLLQueuePush(list->first, list->last, n); + ++list->count; + + return n; +} + +internal void +u64_list_concat_in_place(U64List *list, U64List *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal U64Array +u64_array_from_list(Arena *arena, U64List *list) +{ + U64Array result; + result.count = 0; + result.v = push_array(arena, U64, list->count); + for (U64Node *n = list->first; n != NULL; n = n->next) { + result.v[result.count++] = n->data; + } + return result; +} + +internal void +u32_array_sort(U64 count, U32 *v) +{ + radsort(v, count, u32_is_before); +} + +internal void 
+u32_pair_radix_sort(U64 count, PairU32 *arr)
+{
+  // Sorts `arr` in place by ascending v0 using a three-pass LSD radix sort.
+  // The 32-bit key is split into digits of 11, 11 and 10 bits (low to high);
+  // each pass is a stable counting-sort scatter between `arr` and a scratch
+  // buffer, so the final pass must use the topmost digit (bits 22..31).
+  Temp scratch = scratch_begin(0,0);
+
+  PairU32 *temp = push_array(scratch.arena, PairU32, count);
+
+  const U64 bit_count0 = 11;
+  const U64 bit_count1 = 11;
+  const U64 bit_count2 = 10;
+
+  U32 *count0 = push_array(scratch.arena, U32, (1 << bit_count0));
+  U32 *count1 = push_array(scratch.arena, U32, (1 << bit_count1));
+  U32 *count2 = push_array(scratch.arena, U32, (1 << bit_count2));
+
+  // histogram all three digits in a single sweep over the input
+  for (U64 i = 0; i < count; ++i) {
+    U32 digit0 = (arr[i].v0 >> 0 ) % (1 << bit_count0);
+    U32 digit1 = (arr[i].v0 >> bit_count0) % (1 << bit_count1);
+    U32 digit2 = (arr[i].v0 >> (bit_count0 + bit_count1)) % (1 << bit_count2);
+
+    ++count0[digit0];
+    ++count1[digit1];
+    ++count2[digit2];
+  }
+
+  counts_to_offsets_array_u32((1 << bit_count0), count0);
+  counts_to_offsets_array_u32((1 << bit_count1), count1);
+  counts_to_offsets_array_u32((1 << bit_count2), count2);
+
+  // pass 0: bits 0..10, arr -> temp
+  for (U64 i = 0; i < count; ++i) {
+    U32 digit0 = (arr[i].v0 >> 0) % (1 << bit_count0);
+    temp[count0[digit0]++] = arr[i];
+  }
+
+  // pass 1: bits 11..21, temp -> arr
+  for (U64 i = 0; i < count; ++i) {
+    U32 digit1 = (temp[i].v0 >> bit_count0) % (1 << bit_count1);
+    arr[count1[digit1]++] = temp[i];
+  }
+
+  // pass 2: bits 22..31, arr -> temp (shift past both lower digits)
+  for (U64 i = 0; i < count; ++i) {
+    U32 digit2 = (arr[i].v0 >> (bit_count0 + bit_count1)) % (1 << bit_count2);
+    temp[count2[digit2]++] = arr[i];
+  }
+
+  MemoryCopyTyped(arr, temp, count);
+
+  scratch_end(scratch);
+}
+
+internal B32
+u32_array_compare(U32Array a, U32Array b)
+{
+  B32 are_equal = 0;
+  if (a.count == b.count) {
+    int cmp = MemoryCompare(a.v, b.v, sizeof(a.v[0]) * a.count);
+    are_equal = (cmp == 0);
+  }
+  return are_equal;
+}
+
+internal U64Array
+u64_array_remove_duplicates(Arena *arena, U64Array in)
+{
+  U64Array result;
+  result.count = 0;
+  result.v = push_array(arena, U64, in.count);
+
+  for (U64 i = 1; i < in.count; ++i) {
+    B32 is_unique = in.v[i - 1] != in.v[i];
+    if (is_unique) {
+      result.v[result.count++] = in.v[i - 1];
+    }
+  }
+
+  if (in.count > 0 && result.count > 0) {
+    B32 is_unique = 
result.v[result.count - 1] != in.v[in.count - 1];
+    if (is_unique) {
+      result.v[result.count++] = in.v[in.count - 1];
+    }
+  }
+
+  // The pair-wise loop only emits the last value of a run when a different
+  // value follows it, so an input made of one single run (a lone element, or
+  // every element equal) has produced nothing so far and the step above was
+  // skipped. Emit that run's value here so such inputs are not returned empty.
+  if (in.count > 0 && result.count == 0) {
+    result.v[result.count++] = in.v[in.count - 1];
+  }
+
+  // give the unused tail of the in.count-sized buffer back to the arena
+  U64 slack_size = (in.count - result.count) * sizeof(result.v[0]);
+  arena_pop(arena, slack_size);
+
+  return result;
+}
+
+internal U64
+sum_array_u64(U64 count, U64 *v)
+{
+  U64 result = 0;
+  for (U64 i = 0; i < count; i += 1) {
+    result += v[i];
+  }
+  return result;
+}
+
+internal U64
+sum_matrix_u64(U64 rows, U64 cols, U64 **v)
+{
+  U64 result = 0;
+  for (U64 i = 0; i < rows; ++i) {
+    result += sum_array_u64(cols, v[i]);
+  }
+  return result;
+}
+
+internal U64
+max_array_u64(U64 count, U64 *v)
+{
+  U64 result = 0;
+  for (U64 i = 0; i < count; i += 1) {
+    result = Max(v[i], result);
+  }
+  return result;
+}
+
+internal U64
+min_array_u64(U64 count, U64 *v)
+{
+  U64 result = max_U64;
+  for (U64 i = 0; i < count; i += 1) {
+    result = Min(v[i], result);
+  }
+  return result;
+}
+
+internal void
+counts_to_offsets_array_u32(U64 count, U32 *arr)
+{
+  U32 next_offset = 0;
+  for (U64 i = 0; i < count; i += 1) {
+    U32 current_offset = next_offset;
+    next_offset += arr[i];
+    arr[i] = current_offset;
+  }
+}
+
+internal void
+counts_to_offsets_array_u64(U64 count, U64 *arr)
+{
+  U64 next_offset = 0;
+  for (U64 i = 0; i < count; i += 1) {
+    U64 current_offset = next_offset;
+    next_offset += arr[i];
+    arr[i] = current_offset;
+  }
+}
+
+internal U32 *
+offsets_from_counts_array_u32(Arena *arena, U32 *v, U64 count)
+{
+  U32 *result = push_array_copy_u32(arena, v, count);
+  counts_to_offsets_array_u32(count, result);
+  return result;
+}
+
+internal U64 *
+offsets_from_counts_array_u64(Arena *arena, U64 *v, U64 count)
+{
+  U64 *result = push_array_copy_u64(arena, v, count);
+  counts_to_offsets_array_u64(count, result);
+  return result;
+}
+
diff --git a/src/linker/base_ext/base_arrays.h b/src/linker/base_ext/base_arrays.h
new file mode 100644
index 00000000..f6b27dd4
--- /dev/null
+++ b/src/linker/base_ext/base_arrays.h
@@ -0,0 +1,63 @@
+// Copyright (c) 
2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef struct U32Array +{ + U64 count; + U32 *v; +} U32Array; + +typedef struct U64Array +{ + U64 count; + U64 *v; +} U64Array; + +typedef struct +{ + U64 count; + U128 *v; +} U128Array; + +typedef struct U64Node +{ + struct U64Node *next; + U64 data; +} U64Node; + +typedef struct U64List +{ + U64 count; + U64Node *first; + U64Node *last; +} U64List; + +typedef struct VoidNode +{ + struct VoidNode *next; + void *v; +} VoidNode; + +//////////////////////////////// + +internal U64Node * u64_list_push(Arena *arena, U64List *list, U64 data); +internal void u64_list_concat_in_place(U64List *list, U64List *to_concat); +internal U64Array u64_array_from_list(Arena *arena, U64List *list); + +internal U64Array u64_array_remove_duplicates(Arena *arena, U64Array in); + +internal void u32_array_sort(U64 count, U32 *v); +internal B32 u32_array_compare(U32Array a, U32Array b); + +internal U64 sum_array_u64(U64 count, U64 *v); +internal U64 max_array_u64(U64 count, U64 *v); +internal U64 min_array_u64(U64 count, U64 *v); + +internal void counts_to_offsets_array_u32(U64 count, U32 *arr); +internal void counts_to_offsets_array_u64(U64 count, U64 *arr); + +internal U32 * offsets_from_counts_array_u32(Arena *arena, U32 *v, U64 count); +internal U64 * offsets_from_counts_array_u64(Arena *arena, U64 *v, U64 count); + diff --git a/src/linker/base_ext/base_bit_array.c b/src/linker/base_ext/base_bit_array.c new file mode 100644 index 00000000..2e213294 --- /dev/null +++ b/src/linker/base_ext/base_bit_array.c @@ -0,0 +1,276 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal U32Array +bit_array_init32(Arena *arena, U64 word_count) +{ + U32Array result; + result.count = CeilIntegerDiv(word_count, 32); + result.v = push_array(arena, U32, word_count); + return result; +} + +internal U64 
+bit_array_scan_left_to_right32(U32Array bit_array, U64 lo, U64 hi, B32 state)
+{
+  // Returns the index of the lowest bit in the half-open range [lo, hi) whose
+  // value equals `state` (0 or 1), or max_U32 when the range holds no such bit.
+  //
+  // Every word is handled the same way, so ranges that are shorter than 32
+  // bits, cross a word boundary, or end exactly on one need no special case
+  // and no shift ever reaches the full word width.
+  Assert(lo < bit_array.count*32);
+  Assert(hi <= bit_array.count*32);
+  Assert(lo <= hi);
+  Assert(state == 0 || state == 1);
+
+  U64 result = max_U32;
+
+  for (U64 word_idx = lo / 32; word_idx * 32 < hi; word_idx += 1) {
+    U64 word_base = word_idx * 32;
+
+    // flip the word when searching for cleared bits so a hit is always a set bit
+    U32 word = bit_array.v[word_idx];
+    if (state == 0) {
+      word = ~word;
+    }
+
+    // discard bits below lo (can only happen in the first word)
+    if (word_base < lo) {
+      word &= ~0u << (lo - word_base);
+    }
+
+    // discard bits at or above hi (can only happen in the last word)
+    if (hi - word_base < 32) {
+      word &= (1u << (hi - word_base)) - 1;
+    }
+
+    if (word != 0) {
+      result = word_base + ctz32(word);
+      break;
+    }
+  }
+
+  return result;
+}
+
+internal U64
+bit_array_scan_right_to_left32(U32Array bit_array, U64 lo, U64 hi, B32 state)
+{
+  Assert(lo <= hi);
+  Assert(state == 0 || state == 1);
+
+  S64 word_lo = lo / 32;
+  S64 word_hi = CeilIntegerDiv(hi, 32) - 1;
+
+  S64 word_idx = word_hi;
+  S64 bit_idx = -1;
+
+  U64 scan_count = hi - lo;
+  if (scan_count < 32) {
+    S64 bit_lo = lo % 32;
+    S64 bit_hi = bit_lo + scan_count;
+    U32 word = bit_array.v[word_idx];
+    for (bit_idx = bit_hi; bit_idx >= bit_lo; bit_idx -= 1) {
+      U32 bit = ExtractBit(word, bit_idx);
+      if (bit == state) {
+        goto exit;
+      }
+    }
+  } else {
+    U32 last_word = bit_array.v[word_idx];
+    S64 bit_hi = hi % 32;
+    for (bit_idx = bit_hi; bit_idx >= 0; bit_idx -= 1) {
+      U32 bit = 
ExtractBit(last_word, bit_idx);
+      if (bit == state) {
+        goto exit;
+      }
+    }
+
+    for (word_idx -= 1; word_idx > word_lo; word_idx -= 1) {
+      U32 word = bit_array.v[word_idx];
+      for (bit_idx = 32 - 1; bit_idx >= 0; bit_idx -= 1) {
+        U32 bit = ExtractBit(word, bit_idx);
+        if (bit == state) {
+          goto exit;
+        }
+      }
+    }
+
+    U32 first_word = bit_array.v[word_idx];
+    S64 bit_lo = lo % 32;
+    for (bit_idx = 32 - 1; bit_idx >= bit_lo; bit_idx -= 1) {
+      U32 bit = ExtractBit(first_word, bit_idx);
+      if (bit == state) {
+        goto exit;
+      }
+    }
+  }
+
+  word_idx = 0;
+  bit_idx = max_U32;
+
+  exit:;
+
+  S64 result_s64 = word_idx * 32 + bit_idx;
+  U64 result_u64 = (U64)result_s64;
+  return result_u64;
+}
+
+// Finds the lowest run of `in_row_count` consecutive bits equal to `state`
+// (0 or 1) inside the half-open bit range [lo, hi).
+//
+// Returns the run as the half-open range [first, first + in_row_count), the
+// same convention bit_array_set_bit_range32 iterates over, or
+// (max_U64, max_U64) when no such run exists or in_row_count is zero.
+//
+// NOTE(review): the previous implementation decremented an unsigned counter
+// that started at zero and never moved its scan position past a hit, so it
+// reported failure for every input. Whether callers expect a half-open or an
+// inclusive range could not be determined from this file; confirm at call sites.
+internal Rng1U64
+bit_array_scan_left_to_right32_contiguous(U32Array bit_array, U64 lo, U64 hi, B32 state, U64 in_row_count)
+{
+  Assert(lo <= hi);
+  Assert(hi <= bit_array.count*32);
+  Assert(state == 0 || state == 1);
+
+  Rng1U64 result = rng_1u64(max_U64, max_U64);
+
+  U64 run_first = 0;
+  U64 run_count = 0;
+  for (U64 idx = lo; idx < hi && run_count < in_row_count; idx += 1) {
+    U32 bit = (bit_array.v[idx / 32] >> (idx % 32)) & 1;
+    if (bit == (U32)state) {
+      if (run_count == 0) {
+        run_first = idx;
+      }
+      run_count += 1;
+    } else {
+      // run broken, start counting again after this bit
+      run_count = 0;
+    }
+  }
+
+  // did we find enough bits?
+  if (in_row_count > 0 && run_count == in_row_count) {
+    result = rng_1u64(run_first, run_first + in_row_count);
+  }
+
+  return result;
+}
+
+// Mirror of the left-to-right variant: walks from bit hi - 1 down to bit lo
+// and returns the highest run of `in_row_count` consecutive bits equal to
+// `state` as the half-open range [first, first + in_row_count), or
+// (max_U64, max_U64) when no such run exists or in_row_count is zero.
+//
+// NOTE(review): the previous implementation started its scan at `lo` instead
+// of `hi` and decremented an unsigned counter from zero, so it could not
+// succeed; range convention to be confirmed at call sites.
+internal Rng1U64
+bit_array_scan_right_to_left32_contiguous(U32Array bit_array, U64 lo, U64 hi, B32 state, U64 in_row_count)
+{
+  Assert(lo <= hi);
+  Assert(hi <= bit_array.count*32);
+  Assert(state == 0 || state == 1);
+
+  Rng1U64 result = rng_1u64(max_U64, max_U64);
+
+  U64 run_first = 0;
+  U64 run_count = 0;
+  for (U64 idx = hi; idx > lo && run_count < in_row_count; idx -= 1) {
+    // idx is one past the bit being tested so the loop can stop at lo == 0
+    U64 bit_pos = idx - 1;
+    U32 bit = (bit_array.v[bit_pos / 32] >> (bit_pos % 32)) & 1;
+    if (bit == (U32)state) {
+      run_first = bit_pos;
+      run_count += 1;
+    } else {
+      // run broken, start counting again below this bit
+      run_count = 0;
+    }
+  }
+
+  // did we find enough bits?
+  if (in_row_count > 0 && run_count == in_row_count) {
+    result = rng_1u64(run_first, run_first + in_row_count);
+  }
+
+  return result;
+}
+
+internal U64
+bit_array_find_next_unset_bit32(U32Array bit_array)
+{
+  U64 result = bit_array_scan_left_to_right32(bit_array, 0, bit_array.count*32, 0);
+  return result;
+}
+
+internal U64
+bit_array_find_next_set_bit32(U32Array bit_array)
+{
+  U64 result = bit_array_scan_left_to_right32(bit_array, 0, bit_array.count*32, 1);
+  return result;
+}
+
+internal void
+bit_array_set_bit32(U32Array bit_array, U64 idx, B32 state)
+{
+  Assert(idx < bit_array.count*32);
+  U64 word_idx = idx / 32;
+  U64 bit_idx = idx % 32;
+  if (state) {
+    bit_array.v[word_idx] |= (1 << bit_idx);
+  } else {
+    bit_array.v[word_idx] &= ~(1 << bit_idx);
+  }
+}
+
+internal void
+bit_array_set_bit_range32(U32Array bit_array, Rng1U64 range, B32 state)
+{
+  for (U64 idx = range.min ; idx < range.max; idx += 1) {
+    bit_array_set_bit32(bit_array, idx, state);
+  }
+}
+
+internal U32
+bit_array_get_bit32(U32Array bit_array, U64 idx)
+{
+  Assert(idx < bit_array.count*32);
+  U64 word_idx = idx / 32;
+  U64 bit_idx = idx % 32;
+ U32 bit = (bit_array.v[word_idx] & (1 << bit_idx)) >> bit_idx; + return bit; +} + +internal B32 +bit_array_is_bit_set(U32Array bit_arr, U64 bit_pos) +{ + U64 word_idx = bit_pos / 32; + Assert(word_idx < bit_arr.count); + U32 word = bit_arr.v[word_idx]; + U64 bit_idx = bit_pos % 32; + B32 is_set = !!(word & (1 << bit_idx)); + return is_set; +} + + diff --git a/src/linker/base_ext/base_bit_array.h b/src/linker/base_ext/base_bit_array.h new file mode 100644 index 00000000..c9428957 --- /dev/null +++ b/src/linker/base_ext/base_bit_array.h @@ -0,0 +1,18 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +internal U32Array bit_array_init32(Arena *arena, U64 word_count); +internal U64 bit_array_scan_left_to_right32(U32Array bit_array, U64 lo, U64 hi, B32 state); +internal U64 bit_array_scan_right_to_left32(U32Array bit_array, U64 lo, U64 hi, B32 state); +internal Rng1U64 bit_array_scan_left_to_right32_contiguous(U32Array bit_array, U64 lo, U64 hi, B32 state, U64 in_row_count); +internal Rng1U64 bit_array_scan_right_to_left32_contiguous(U32Array bit_array, U64 lo, U64 hi, B32 state, U64 in_row_count); +internal B32 byte_scan_right_to_left(U8 *start, U8 *opl, U8 byte, U64 *offset_out); +internal U64 bit_array_find_next_unset_bit32(U32Array bit_array); +internal U64 bit_array_find_next_set_bit32(U32Array bit_array); +internal void bit_array_set_bit32(U32Array bit_array, U64 idx, B32 state); +internal void bit_array_set_bit_range32(U32Array bit_array, Rng1U64 range, B32 state); +internal U32 bit_array_get_bit32(U32Array bit_array, U64 idx); + + diff --git a/src/linker/base_ext/base_blake3.c b/src/linker/base_ext/base_blake3.c new file mode 100644 index 00000000..256e600c --- /dev/null +++ b/src/linker/base_ext/base_blake3.c @@ -0,0 +1,102 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#if defined(__clang__) +#pragma clang 
diagnostic push +#pragma clang diagnostic ignored "-Wmacro-redefined" +#elif defined(_MSC_VER) +#pragma warning (push, 0) +#endif + +#include "../third_party_ext/blake3/c/blake3_portable.c" + +#if defined(_M_AMD64) || defined(__x86_64__) + +#define round_fn sse2_round_fn +#define compress_pre sse2_compress_pre + +#include "../third_party_ext/blake3/c/blake3_sse2.c" + +#define loadu sse41_loadu +#define storeu sse41_storeu +#define addv sse41_addv +#define xorv sse41_xorv +#define set1 sse41_set1 +#define set4 sse41_set4 +#define rot16 sse41_rot16 +#define rot12 sse41_rot12 +#define rot8 sse41_rot8 +#define rot7 sse41_rot7 +#define g1 sse41_g1 +#define g2 sse41_g2 +#define diagonalize sse41_diagonalize +#define undiagonalize sse41_undiagonalize +#define compress_pre sse41_compress_pre +#define round_fn sse41_round_fn +#define transpose_vecs sse41_transpose_vecs +#define transpose_msg_vecs sse41_transpose_msg_vecs +#define load_counters sse41_load_counters + +#if defined(__clang__) +#pragma clang attribute push(__attribute__((target("sse4.1"))), apply_to=function) +#endif +#include "../third_party_ext/blake3/c/blake3_sse41.c" +#if defined(__clang__) +#pragma clang attribute pop +#endif + +#define loadu avx2_loadu +#define storeu avx2_storeu +#define addv avx2_addv +#define xorv avx2_xorv +#define set1 avx2_set1 +#define rot7 avx2_rot7 +#define rot8 avx2_rot8 +#define rot12 avx2_rot12 +#define rot16 avx2_rot16 +#define round_fn avx2_round_fn +#define transpose_vecs avx2_transpose_vecs +#define transpose_msg_vecs avx2_transpose_msg_vecs +#define load_counters avx2_load_counters + +#if defined(__clang__) +#pragma clang attribute push(__attribute__((target("avx2"))), apply_to=function) +#endif +#include "../third_party_ext/blake3/c/blake3_avx2.c" +#if defined(__clang__) +#pragma clang attribute pop +#endif + +#define set4 avx512_set4 +#define g1 avx512_g1 +#define g2 avx512_g2 +#define diagonalize avx512_diagonalize +#define undiagonalize avx512_undiagonalize +#define 
compress_pre avx512_compress_pre
+#define transpose_vecs avx512_transpose_vecs
+#define transpose_msg_vecs avx512_transpose_msg_vecs
+#define load_counters avx512_load_counters
+
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((target("avx512f,avx512vl"))), apply_to=function)
+#endif
+#include "../third_party_ext/blake3/c/blake3_avx512.c"
+#if defined(__clang__)
+#pragma clang attribute pop
+#endif
+
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "../third_party_ext/blake3/c/blake3_neon.c"
+#endif
+
+#include "../third_party_ext/blake3/c/blake3_dispatch.c"
+#include "../third_party_ext/blake3/c/blake3.c"
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#elif defined(_MSC_VER)
+#pragma warning (pop, 0)
+#endif
+
diff --git a/src/linker/base_ext/base_blake3.h b/src/linker/base_ext/base_blake3.h
new file mode 100644
index 00000000..bec40c60
--- /dev/null
+++ b/src/linker/base_ext/base_blake3.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+// On clang/x86-64 the intrinsics header is included here with the AVX, AVX2,
+// SSE4.1 and AVX-512 feature macros temporarily defined (and restored right
+// after), which is why this header must come before any other include of it.
+// NOTE(review): presumably this makes the intrinsics for those instruction
+// sets visible regardless of the compiler's target flags - confirm.
+#if defined(__clang__) && defined(__x86_64__)
+# if defined(__IMMINTRIN_H)
+# error "include this header before immintrin.h / x86intrin.h / intrin.h"
+# endif
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+# pragma push_macro("__AVX__")
+# pragma push_macro("__AVX2__")
+# pragma push_macro("__SSE4_1__")
+# pragma push_macro("__AVX512F__")
+# pragma push_macro("__AVX512VL__")
+# define __AVX__ 1
+# define __AVX2__ 1
+# define __SSE4_1__ 1
+# define __AVX512F__ 1
+# define __AVX512VL__ 1
+# include <immintrin.h>
+# pragma pop_macro("__AVX512VL__")
+# pragma pop_macro("__AVX512F__")
+# pragma pop_macro("__SSE4_1__")
+# pragma pop_macro("__AVX2__")
+# pragma pop_macro("__AVX__")
+# pragma clang diagnostic pop
+#endif
+
+#include "../third_party_ext/blake3/c/blake3.h"
+
+static void
+blake3(void* out, size_t outlen, void* in, size_t inlen)
+{
+  
blake3_hasher hasher; + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, in, inlen); + blake3_hasher_finalize(&hasher, (uint8_t*)out, outlen); +} + diff --git a/src/linker/base_ext/base_blake3_asm.c b/src/linker/base_ext/base_blake3_asm.c new file mode 100644 index 00000000..d02a4380 --- /dev/null +++ b/src/linker/base_ext/base_blake3_asm.c @@ -0,0 +1,13 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#include "../third_party_ext/blake3/blake3_portable.c" + +#if defined(__aarch64__) || defined(_M_ARM64) +#include "../third_party_ext/blake3/blake3_neon.c" +#endif + +#include "../third_party_ext/blake3/blake3_dispatch.c" +#include "../third_party_ext/blake3/blake3.c" + +#pragma comment (lib, "blake3") diff --git a/src/linker/base_ext/base_blake3_asm.h b/src/linker/base_ext/base_blake3_asm.h new file mode 100644 index 00000000..94cdf267 --- /dev/null +++ b/src/linker/base_ext/base_blake3_asm.h @@ -0,0 +1,18 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#ifndef BASE_BLAKE3_H +#define BASE_BLAKE3_H + +#include "../third_party_ext/blake3/blake3.h" + +static void +blake3(void* out, size_t outlen, void* in, size_t inlen) +{ + blake3_hasher hasher; + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, in, inlen); + blake3_hasher_finalize(&hasher, (uint8_t*)out, outlen); +} + +#endif // BASE_BLAKE3_H diff --git a/src/linker/base_ext/base_core.c b/src/linker/base_ext/base_core.c new file mode 100644 index 00000000..2f44ddd5 --- /dev/null +++ b/src/linker/base_ext/base_core.c @@ -0,0 +1,227 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal U16 +safe_cast_u16x(U64 x) +{ + AssertAlways(x <= max_U16); + return (U16)x; +} + +//////////////////////////////// + +internal U64 +u128_mod64(U128 a, U64 b) +{ + return a.u64[1] % b; +} + 
+//////////////////////////////// + +internal Version +make_version(U64 major, U64 minor) +{ + Version version; + version.major = major; + version.minor = minor; + return version; +} + +internal int +version_compar(Version a, Version b) +{ + int cmp = 0; + if (a.major < b.major) { + cmp = -1; + } else if (a.major > b.major) { + cmp = +1; + } else if (a.major == b.major) { + if (a.minor < b.minor) { + cmp = -1; + } else if (a.minor > b.minor) { + cmp = +1; + } + } + return cmp; +} + +//////////////////////////////// + +internal ISectOff +isect_off(U32 isect, U32 off) +{ + ISectOff result = { isect, off }; + return result; +} + +//////////////////////////////// + +internal int +u16_compar(const void *raw_a, const void *raw_b) +{ + U16 a = *(U16*)raw_a; + U16 b = *(U16*)raw_b; + int result = a < b ? -1 : + a > b ? +1 : + 0; + return result; +} + +internal int +u32_compar(const void *raw_a, const void *raw_b) +{ + U32 a = *(U32*)raw_a; + U32 b = *(U32*)raw_b; + int result = a < b ? -1 : + a > b ? +1 : + 0; + return result; +} + +internal int +u64_compar(const void *raw_a, const void *raw_b) +{ + U64 a = *(const U64*)raw_a; + U64 b = *(const U64*)raw_b; + int result = a < b ? -1 : a > b ? +1 : 0; + return result; +} + +internal int +u64_compar_inv(const void *raw_a, const void *raw_b) +{ + U64 a = *(const U64*)raw_a; + U64 b = *(const U64*)raw_b; + int result = a < b ? +1 : a > b ? 
-1 : 0;
+  return result;
+}
+
+internal int
+u16_compar_is_before(void *raw_a, void *raw_b)
+{
+  U16 *a = (U16 *)raw_a;
+  U16 *b = (U16 *)raw_b;
+  int is_before = *a < *b;
+  return is_before;
+}
+
+internal int
+u32_compar_is_before(void *raw_a, void *raw_b)
+{
+  U32 *a = (U32 *)raw_a;
+  U32 *b = (U32 *)raw_b;
+  int is_before = *a < *b;
+  return is_before;
+}
+
+internal int
+u64_compar_is_before(void *raw_a, void *raw_b)
+{
+  U64 *a = (U64 *)raw_a;
+  U64 *b = (U64 *)raw_b;
+  int is_before = *a < *b;
+  return is_before;
+}
+
+
+internal int
+u8_is_before(void *raw_a, void *raw_b)
+{
+  U8 *a = (U8 *) raw_a;
+  U8 *b = (U8 *) raw_b;
+  return *a < *b;
+}
+
+internal int
+u16_is_before(void *raw_a, void *raw_b)
+{
+  U16 *a = (U16 *) raw_a;
+  U16 *b = (U16 *) raw_b;
+  return *a < *b;
+}
+
+internal int
+u32_is_before(void *raw_a, void *raw_b)
+{
+  U32 *a = (U32 *) raw_a;
+  U32 *b = (U32 *) raw_b;
+  return *a < *b;
+}
+
+internal int
+u64_is_before(void *raw_a, void *raw_b)
+{
+  U64 *a = (U64 *) raw_a;
+  U64 *b = (U64 *) raw_b;
+  return *a < *b;
+}
+
+internal int
+pair_u32_is_before_v0(void *raw_a, void *raw_b)
+{
+  PairU32 *a = raw_a;
+  PairU32 *b = raw_b;
+  return a->v0 < b->v0;
+}
+
+internal int // orders by v1 (counterpart of pair_u32_is_before_v0)
+pair_u32_is_before(void *raw_a, void *raw_b)
+{
+  PairU32 *a = raw_a;
+  PairU32 *b = raw_b;
+  return a->v1 < b->v1;
+}
+
+internal int
+pair_u64_is_before_v0(void *raw_a, void *raw_b)
+{
+  PairU64 *a = raw_a;
+  PairU64 *b = raw_b;
+  return a->v0 < b->v0;
+}
+
+internal int
+pair_u64_is_before_v1(void *raw_a, void *raw_b)
+{
+  PairU64 *a = raw_a;
+  PairU64 *b = raw_b;
+  return a->v1 < b->v1;
+}
+
+internal int
+pair_u32_compar_v0(const void *raw_a, const void *raw_b)
+{
+  const PairU32 *a = raw_a;
+  const PairU32 *b = raw_b;
+  return u32_compar(&a->v0, &b->v0);
+}
+
+internal int
+pair_u64_compar_v0(const void *raw_a, const void *raw_b)
+{
+  const PairU64 *a = raw_a;
+  const PairU64 *b = raw_b;
+  return u64_compar(&a->v0, &b->v0);
+}
+
+internal int
+pair_u64_compar_v1(const void *raw_a, const void *raw_b)
+{
+  const PairU64 *a = raw_a;
+  const PairU64 *b = raw_b;
+  return u64_compar(&a->v1, &b->v1);
+}
+
+
+////////////////////////////////
+
+internal void
+str8_list_concat_in_place_array(String8List *list, String8List *arr, U64 count)
+{
+  for (U64 i = 0; i < count; ++i) {
+    str8_list_concat_in_place(list, &arr[i]);
+  }
+}
+
+
+
diff --git a/src/linker/base_ext/base_core.h b/src/linker/base_ext/base_core.h
new file mode 100644
index 00000000..d931bcfb
--- /dev/null
+++ b/src/linker/base_ext/base_core.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+#if COMPILER_MSVC
+# define COMPILER_STRING "MSVC"
+#elif COMPILER_CLANG
+# define COMPILER_STRING "Clang"
+#elif COMPILER_GCC
+# define COMPILER_STRING "GCC"
+#else
+# error "undefined compiler string"
+#endif
+
+#if BUILD_DEBUG
+# define BUILD_MODE_STRING "Debug"
+#else
+# define BUILD_MODE_STRING "Release"
+#endif
+
+////////////////////////////////
+
+// Extracts `count` bits of `x` starting at bit `shift`. The mask is built from an
+// unsigned 1 so that count == 31 does not shift into the sign bit of an int.
+#define BitExtract(x, count, shift) (((x) >> (shift)) & ((1u << (count)) - 1))
+
+////////////////////////////////
+
+// Atomic helpers; every macro argument is parenthesized so callers may pass expressions.
+#if OS_WINDOWS
+# define ins_atomic_ptr_eval_cond_assign(x,k,c) InterlockedCompareExchangePointer((volatile PVOID *)(x),(k),(c))
+# define ins_atomic_u32_add_eval(x,c) InterlockedAdd((volatile LONG *)(x), (c))
+# define ins_atomic_u32_inc_eval(x) InterlockedIncrement((volatile LONG *)(x))
+#else
+# error "atomics are not defined for this system"
+#endif
+
+////////////////////////////////
+// Linked List Helpers
+
+// Splices `to_concat` onto the tail of `list` (doubly linked; both track first/last/count)
+// and zeroes `to_concat`. Does nothing when `to_concat` is empty.
+#define DLLConcatInPlace(list, to_concat) do { \
+  if ((to_concat)->count) { \
+    if ((list)->count) { \
+      (list)->last->next = (to_concat)->first; \
+      (to_concat)->first->prev = (list)->last; \
+      (list)->last = (to_concat)->last; \
+    } else { \
+      (list)->first = (to_concat)->first; \
+      (list)->last = (to_concat)->last; \
+    } \
+    (list)->count += (to_concat)->count; \
+    MemoryZeroStruct(to_concat); \
+  } \
+} while (0)
+#define DLLConcatInPlaceArray(list, to_concat_arr, count) for (U64 i = 0; i < (count); i += 1) { DLLConcatInPlace(list, &(to_concat_arr)[i]); }
+
+// SLLQueuePush plus bookkeeping of (list)->count.
+#define SLLQueuePushCount(list, node) do { \
+  SLLQueuePush((list)->first, (list)->last, node); \
+  ++(list)->count; \
+} while (0)
+
+// Same splice for singly linked lists without a count field; emptiness is judged by `first`.
+#define SLLConcatInPlaceNoCount(list, to_concat) do { \
+  if ((to_concat)->first) { \
+    if ((list)->first) { \
+      (list)->last->next = (to_concat)->first; \
+      (list)->last = (to_concat)->last; \
+    } else { \
+      (list)->first = (to_concat)->first; \
+      (list)->last = (to_concat)->last; \
+    } \
+    MemoryZeroStruct(to_concat); \
+  } \
+} while (0)
+
+// Singly linked splice for lists that track count; zeroes `to_concat` afterwards.
+#define SLLConcatInPlace(list, to_concat) do { \
+  if ((to_concat)->count) { \
+    if ((list)->count) { \
+      (list)->last->next = (to_concat)->first; \
+      (list)->last = (to_concat)->last; \
+    } else { \
+      (list)->first = (to_concat)->first; \
+      (list)->last = (to_concat)->last; \
+    } \
+    (list)->count += (to_concat)->count; \
+    MemoryZeroStruct(to_concat); \
+  } \
+} while (0)
+#define SLLConcatInPlaceArray(list, to_concat_arr, count) for (U64 i = 0; i < (count); ++i) { SLLConcatInPlace(list, &(to_concat_arr)[i]); }
+
+// Splices chunk lists: when `list` is non-empty, every chunk of `to_concat` is re-based so
+// that numbering continues after the last chunk of `list` (last->base + last->count).
+#define SLLConcatInPlaceChunkList(list, to_concat, chunk_type) do { \
+  if ((list)->last != 0) { \
+    U64 base_cursor = (list)->last->base + (list)->last->count; \
+    for (chunk_type *c = (to_concat)->first; c != 0; c = c->next) { \
+      c->base = base_cursor; \
+      base_cursor += c->count; \
+    } \
+  } \
+  SLLConcatInPlace(list, to_concat); \
+  } while (0)
+
+#define SLLConcatInPlaceChunkListArray(list, to_concat_arr, type, count) for (U64 i = 0; i < (count); ++i) { SLLConcatInPlaceChunkList(list, &(to_concat_arr)[i], type); }
+
+// Reserves the next item slot in the last chunk, first appending a new chunk of `_cap`
+// items when there is no chunk yet or the last one is full; sets the slot's chunk pointer.
+#define SLLChunkListPush(_arena, _list, _cap, _value_type) do { \
+  if ((_list)->last == 0 || (_list)->last->count >= (_list)->last->cap) { \
+    _value_type##Chunk *new_chunk = push_array(_arena, _value_type##Chunk, 1); \
+    new_chunk->v = push_array(_arena, _value_type, _cap); \
+    new_chunk->cap = _cap; \
+    new_chunk->base = (_list)->last ? (_list)->last->base + (_list)->last->cap : 0; \
+    SLLQueuePushCount(_list, new_chunk); \
+  } \
+  _value_type *v = &(_list)->last->v[(_list)->last->count++]; \
+  v->chunk = (_list)->last; \
+} while (0)
+
+#define SLLChunkListPushZero(_arena, _list, _cap, _value_type) do { \
+  SLLChunkListPush(_arena, _list, _cap, _value_type); \
+  MemoryZeroStruct(SLLChunkListLastItem(_list)); \
+  SLLChunkListLastItem(_list)->chunk = (_list)->last; \
+} while(0)
+
+#define SLLChunkListLastItem(_list) (&(_list)->last->v[(_list)->last->count - 1])
+
+////////////////////////////////
+
+#define MemoryIsZeroStruct(p) memory_is_zero((p), sizeof(*(p)))
+
+////////////////////////////////
+
+#if ARCH_LITTLE_ENDIAN
+# define BE_U32(x) bswap_u32(x)
+#else
+# define BE_U32(x) (x)
+#endif
+
+////////////////////////////////
+
+typedef struct
+{
+  U64 major;
+  U64 minor;
+} Version;
+
+////////////////////////////////
+
+typedef struct ISectOff
+{
+  U32 isect;
+  U32 off;
+} ISectOff;
+
+////////////////////////////////
+
+typedef struct PairU32
+{
+  U32 v0;
+  U32 v1;
+} PairU32;
+
+typedef struct PairU64
+{
+  U64 v0;
+  U64 v1;
+} PairU64;
+
+////////////////////////////////
+
+internal U16 safe_cast_u16x(U64 x);
+
+////////////////////////////////
+
+internal U64 u128_mod64(U128 a, U64 b);
+
+////////////////////////////////
+
+internal Version make_version(U64 major, U64 minor);
+internal int version_compar(Version a, Version b);
+
+////////////////////////////////
+
+internal ISectOff isect_off(U32 isect, U32 off);
+
+////////////////////////////////
+
+internal int u16_compar(const void *raw_a, const void *raw_b);
+internal int u32_compar(const void *raw_a, const void *raw_b);
+internal int u64_compar(const void *raw_a, const void *raw_b);
+
+internal int u8_is_before(void *raw_a, void *raw_b);
+internal int u16_is_before(void *raw_a, void *raw_b);
+internal int u32_is_before(void *raw_a, void *raw_b);
+internal int u64_is_before(void *raw_a, void *raw_b);
+
+internal int pair_u32_is_before_v0(void *raw_a, void *raw_b);
+internal int pair_u32_is_before(void *raw_a, void *raw_b); // orders by v1; declared under the name base_core.c actually defines
+internal int pair_u64_is_before_v0(void *raw_a, void *raw_b);
+internal int pair_u64_is_before_v1(void *raw_a, void *raw_b);
+
+////////////////////////////////
+
+internal void str8_list_concat_in_place_array(String8List *list, String8List *arr, U64 count);
+
diff --git a/src/linker/base_ext/base_crc32.c b/src/linker/base_ext/base_crc32.c
new file mode 100644
index 00000000..ee5a95aa
--- /dev/null
+++ b/src/linker/base_ext/base_crc32.c
@@ -0,0 +1,77 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+// Feeds `size` bytes at `ptr` into a table-driven CRC-32 (the disabled block below shows
+// how the table is generated from 0xedb88320). `crc` is the value returned by a previous
+// call, or 0 to begin a new checksum; the inversion before and after is done here.
+internal U32
+update_crc32(U32 crc, U8 *ptr, U64 size)
+{
+  // CRC-32 algo borrowed from stb.h
+
+  local_persist U32 crc_table[256] = {
+    0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535,
+    0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
+    0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d,
+    0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+    0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
+    0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
+    0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac,
+    0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+    0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab,
+    0xb6662d3d, 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
+    0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb,
+    0x086d3d2d, 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+    0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea,
+    0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 0x4db26158, 0x3ab551ce,
+    0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
+    0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+    0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409,
+    0xce61e49f, 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+    0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739,
+    0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+    0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, 0x8708a3d2, 0x1e01f268,
+    0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0,
+    0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8,
+    0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+    0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
+    0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
+    0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7,
+    0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+    0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae,
+    0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+    0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, 0x88085ae6,
+    0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+    0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d,
+    0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
+    0x47b2cf7f, 0x30b5ffe9, 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
+    0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+    0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+  };
+
+#if 0
+  for (U32 i = 0; i < 256; ++i) {
+    U32 s = i;
+    for (U32 j = 0; j < 8; ++j) {
+      s = (s >> 1) ^ (s & 1 ? 0xedb88320 : 0);
+    }
+    crc_table[i] = s;
+  }
+#endif
+
+  crc = ~crc;
+  // the index must be as wide as `size`: a U32 counter wraps at 4 GiB and the loop never ends
+  for (U64 i = 0; i < size; ++i) {
+    crc = (crc >> 8) ^ crc_table[(ptr[i] ^ crc) & 0xff];
+  }
+
+  return ~crc;
+}
+
+// CRC-32 of a whole string, starting from a zero checksum.
+internal U32
+crc32_from_string(String8 string)
+{
+  return update_crc32(0, string.str, string.size);
+}
+
diff --git a/src/linker/base_ext/base_crc32.h b/src/linker/base_ext/base_crc32.h
new file mode 100644
index 00000000..a7bb209d
--- /dev/null
+++ b/src/linker/base_ext/base_crc32.h
@@ -0,0 +1,8 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+internal U32 update_crc32(U32 crc, U8 *ptr, U64 size);
+internal U32 crc32_from_string(String8 string);
+
diff --git a/src/linker/base_ext/base_inc.c b/src/linker/base_ext/base_inc.c
new file mode 100644
index 00000000..94acc94c
--- /dev/null
+++ b/src/linker/base_ext/base_inc.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#include "base_core.c"
+#include "base_strings.c"
+#include "base_arena.c"
+#include "base_math.c"
+#include "base_arrays.c"
+#include "base_bit_array.c"
+#include "base_crc32.c"
+#include "base_md5.c"
+
diff --git a/src/linker/base_ext/base_inc.h b/src/linker/base_ext/base_inc.h
new file mode 100644
index 00000000..8d1c09e1
--- /dev/null
+++ b/src/linker/base_ext/base_inc.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+#include "base_core.h"
+#include "base_strings.h"
+#include "base_arena.h"
+#include "base_math.h"
+#include "base_arrays.h"
+#include "base_blake3.h"
+#include "base_bit_array.h"
+#include "base_crc32.h"
+#include "base_md5.h"
+
diff --git a/src/linker/base_ext/base_math.c b/src/linker/base_ext/base_math.c
new file mode
100644
index 00000000..e5b78e6c
--- /dev/null
+++ b/src/linker/base_ext/base_math.c
@@ -0,0 +1,19 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+internal void // links an already allocated node into `list` and bumps its count
+rng_1u64_list_push_node(Rng1U64List *list, Rng1U64Node *node)
+{
+  SLLQueuePush(list->first, list->last, node);
+  ++list->count; // keep the stored node count in step with the links
+}
+
+internal Rng1U64Node * // allocates a node on `arena`, stores `range` in it, pushes it, returns it
+rng_1u64_list_push(Arena *arena, Rng1U64List *list, Rng1U64 range)
+{
+  Rng1U64Node *node = push_array(arena, Rng1U64Node, 1);
+  node->v = range;
+  rng_1u64_list_push_node(list, node);
+  return node;
+}
+
diff --git a/src/linker/base_ext/base_math.h b/src/linker/base_ext/base_math.h
new file mode 100644
index 00000000..a6efcffc
--- /dev/null
+++ b/src/linker/base_ext/base_math.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+typedef struct Rng1U64Node // singly linked list node carrying one Rng1U64
+{
+  struct Rng1U64Node *next;
+  Rng1U64 v;
+} Rng1U64Node;
+
+typedef struct Rng1U64List // list head: node count plus first/last pointers
+{
+  U64 count; // incremented by rng_1u64_list_push_node
+  Rng1U64Node *first;
+  Rng1U64Node *last;
+} Rng1U64List;
+
+////////////////////////////////
+
+internal void rng_1u64_list_push_node(Rng1U64List *list, Rng1U64Node *node);
+internal Rng1U64Node * rng_1u64_list_push(Arena *arena, Rng1U64List *list, Rng1U64 range);
+
diff --git a/src/linker/base_ext/base_md5.c b/src/linker/base_ext/base_md5.c
new file mode 100644
index 00000000..326c466b
--- /dev/null
+++ b/src/linker/base_ext/base_md5.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+internal MD5Hash // one-shot MD5 of `data`: init, single update, final
+md5_hash_from_string(String8 data)
+{
+  MD5_CTX ctx; MD5_Init(&ctx);
+  MD5_Update(&ctx, (void*)data.str, safe_cast_u32(data.size)); // NOTE(review): size is narrowed to 32 bits; presumably safe_cast_u32 rejects inputs of 4 GiB or more - confirm
+  MD5Hash hash; MD5_Final((unsigned char*)&hash, &ctx); // the digest is written directly over the MD5Hash struct
+  return hash;
+}
+
diff --git a/src/linker/base_ext/base_md5.h b/src/linker/base_ext/base_md5.h
new file mode 100644
index 00000000..b4593dd4
--- /dev/null
+++ 
b/src/linker/base_ext/base_md5.h
@@ -0,0 +1,12 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+typedef struct MD5Hash
+{
+  U8 value[16];
+} MD5Hash;
+
+internal MD5Hash md5_hash_from_string(String8 data);
+
diff --git a/src/linker/base_ext/base_strings.c b/src/linker/base_ext/base_strings.c
new file mode 100644
index 00000000..05943699
--- /dev/null
+++ b/src/linker/base_ext/base_strings.c
@@ -0,0 +1,269 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+global read_only String8 g_null_string;
+
+// Scans backwards from cap for a NUL byte and returns [start, NUL); empty when none is found.
+internal String8
+str8_cstring_capped_reverse(void *start, void *cap)
+{
+  char *ptr = cap;
+  while (ptr > (char *)start) {
+    --ptr;
+    if (*ptr == '\0') break;
+  }
+  U64 null_offset = (U64)(ptr - (char *) start);
+  String8 result = str8((U8 *) start, null_offset);
+  return result;
+}
+
+// Searches for needle from the back of string, ignoring its last start_pos bytes.
+// Returns the offset one past the match, or 0 when nothing matches.
+internal U64
+str8_find_needle_reverse(String8 string, U64 start_pos, String8 needle, StringMatchFlags flags)
+{
+  for (S64 i = string.size - start_pos - needle.size; i >= 0; --i) {
+    String8 haystack = str8_substr(string, rng_1u64(i, i + needle.size));
+    if (str8_match(haystack, needle, flags)) {
+      return (U64)i + needle.size;
+    }
+  }
+  return 0;
+}
+
+// Three-way comparison returning -1/0/+1, byte-wise or via char_to_lower when ignore_case
+// is set; if one string is a prefix of the other, the shorter one orders first.
+internal int
+str8_compar(String8 a, String8 b, B32 ignore_case)
+{
+  int cmp = 0;
+  U64 size = Min(a.size, b.size);
+  if (ignore_case) {
+    for (U64 i = 0; i < size; ++i) {
+      U8 la = char_to_lower(a.str[i]);
+      U8 lb = char_to_lower(b.str[i]);
+      if (la < lb) {
+        cmp = -1;
+        break;
+      } else if (la > lb) {
+        cmp = +1;
+        break;
+      }
+    }
+  } else {
+    for (U64 i = 0; i < size; ++i) {
+      if (a.str[i] < b.str[i]) {
+        cmp = -1;
+        break;
+      } else if (a.str[i] > b.str[i]) {
+        cmp = +1;
+        break;
+      }
+    }
+  }
+
+  if (cmp == 0) {
+    // shorter prefix must precede longer prefixes
+    if (a.size > b.size) {
+      cmp = +1;
+    } else if (b.size > a.size) {
+      cmp = -1;
+    }
+  }
+
+  return cmp;
+}
+
+// Comparator over String8 pointers (const void * signature), ignoring case.
+internal int
+str8_compar_ignore_case(const void *a, const void *b)
+{
+  return str8_compar(*(String8*)a, *(String8*)b, 1);
+}
+
+// Comparator over String8 pointers (const void * signature), case sensitive.
+internal int
+str8_compar_case_sensetive(const void *a, const void *b)
+{
+  return str8_compar(*(String8*)a, *(String8*)b, 0);
+}
+
+// Non-zero when *a orders strictly before *b, case sensitive.
+internal int
+str8_is_before_case_sensetive(const void *a, const void *b)
+{
+  int cmp = str8_compar_case_sensetive(a, b);
+  return cmp < 0;
+}
+
+// Wraps data_ptr/data_size in a String8 and pushes it with str8_list_push.
+internal String8Node *
+str8_list_push_raw(Arena *arena, String8List *list, void *data_ptr, U64 data_size)
+{
+  String8 data = str8((U8 *)data_ptr, data_size);
+  String8Node *node = str8_list_push(arena, list, data);
+  return node;
+}
+
+// Pushes the zero bytes needed to round offset up to align (AlignPow2); returns how many.
+internal U64
+str8_list_push_pad(Arena *arena, String8List *list, U64 offset, U64 align)
+{
+  U64 pad_size = AlignPow2(offset, align) - offset;
+  U8 *pad = push_array(arena, U8, pad_size);
+  MemorySet(pad, 0, pad_size);
+  str8_list_push(arena, list, str8(pad, pad_size));
+  return pad_size;
+}
+
+// Same as str8_list_push_pad, but the padding is pushed at the front of list.
+internal U64
+str8_list_push_pad_front(Arena *arena, String8List *list, U64 offset, U64 align)
+{
+  U64 pad_size = AlignPow2(offset, align) - offset;
+  U8 *pad = push_array(arena, U8, pad_size);
+  MemorySet(pad, 0, pad_size);
+  str8_list_push_front(arena, list, str8(pad, pad_size));
+  return pad_size;
+}
+
+// Concatenates the count lists in v, in order, into a single list.
+internal String8List
+str8_list_arr_concat(String8List *v, U64 count)
+{
+  String8List result = {0};
+  for (U64 i = 0; i < count; i += 1) {
+    str8_list_concat_in_place(&result, &v[i]);
+  }
+  return result;
+}
+
+// Allocates count nodes as one array, pushes each onto list and returns the array.
+internal String8Node *
+str8_list_push_many(Arena *arena, String8List *list, U64 count)
+{
+  String8Node *arr = push_array(arena, String8Node, count);
+  for (U64 i = 0; i < count; ++i) {
+    str8_list_push_node(list, arr + i);
+  }
+  return arr;
+}
+
+// Renders x as 8 characters, most significant nibble first, each 'a' + nibble ('a'..'p').
+internal String8
+str8_from_bits_u32(Arena *arena, U32 x)
+{
+  U8 c0 = 'a' + ((x >> 28) & 0xf);
+  U8 c1 = 'a' + ((x >> 24) & 0xf);
+  U8 c2 = 'a' + ((x >> 20) & 0xf);
+  U8 c3 = 'a' + ((x >> 16) & 0xf);
+  U8 c4 = 'a' + ((x >> 12) & 0xf);
+  U8 c5 = 'a' + ((x >> 8) & 0xf);
+  U8 c6 = 'a' + ((x >> 4) & 0xf);
+  U8 c7 = 'a' + ((x >> 0) & 0xf);
+  String8 result = push_str8f(arena, "%c%c%c%c%c%c%c%c", c0, c1, c2, c3, c4, c5, c6, c7);
+  return result;
+}
+
+// 64-bit variant of str8_from_bits_u32: 16 characters, same 'a' + nibble alphabet.
+internal String8
+str8_from_bits_u64(Arena *arena, U64 x)
+{
+  U8 c0 = 'a' + ((x >> 60) & 0xf);
+  U8 c1 = 'a' + ((x >> 56) & 0xf);
+  U8 c2 = 'a' + ((x >> 52) & 0xf);
+  U8 c3 = 'a' + ((x >> 48) & 0xf);
+  U8 c4 = 'a' + ((x >> 44) & 0xf);
+  U8 c5 = 'a' + ((x >> 40) & 0xf);
+  U8 c6 = 'a' + ((x >> 36) & 0xf);
+  U8 c7 = 'a' + ((x >> 32) & 0xf);
+  U8 c8 = 'a' + ((x >> 28) & 0xf);
+  U8 c9 = 'a' + ((x >> 24) & 0xf);
+  U8 ca = 'a' + ((x >> 20) & 0xf);
+  U8 cb = 'a' + ((x >> 16) & 0xf);
+  U8 cc = 'a' + ((x >> 12) & 0xf);
+  U8 cd = 'a' + ((x >> 8) & 0xf);
+  U8 ce = 'a' + ((x >> 4) & 0xf);
+  U8 cf = 'a' + ((x >> 0) & 0xf);
+  String8 result = push_str8f(arena,
+                              "%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c",
+                              c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ca, cb, cc, cd, ce, cf);
+  return result;
+}
+
+// Unlinks and returns the first node (0 when the list is empty), updating node_count and total_size.
+internal String8Node *
+str8_list_pop_front(String8List *list)
+{
+  String8Node *node = 0;
+  if (list->node_count) {
+    node = list->first;
+    Assert(list->total_size >= list->first->string.size);
+    list->node_count -= 1;
+    list->total_size -= list->first->string.size;
+    SLLQueuePop(list->first, list->last);
+  }
+  return node;
+}
+
+// Formats a byte count in the largest fitting unit; KiB and above show two decimals.
+internal String8
+str8_from_memory_size2(Arena *arena, U64 size)
+{
+  String8 result;
+  if (size < KB(1)) {
+    result = push_str8f(arena, "%llu Bytes", size);
+  } else if (size < MB(1)) {
+    result = push_str8f(arena, "%llu.%02llu KiB", size / KB(1), ((size * 100) / KB(1)) % 100);
+  } else if (size < GB(1)) {
+    result = push_str8f(arena, "%llu.%02llu MiB", size / MB(1), ((size * 100) / MB(1)) % 100);
+  } else if (size < TB(1)) {
+    result = push_str8f(arena, "%llu.%02llu GiB", size / GB(1), ((size * 100) / GB(1)) % 100);
+  } else {
+    result = push_str8f(arena, "%llu.%02llu TiB", size / TB(1), ((size * 100) / TB(1)) % 100);
+  }
+  return result;
+}
+
+// Formats count with a K/M/B suffix (powers of 1000); two decimals are shown only when
+// they are non-zero, and values below 1000 are printed as-is.
+internal String8
+str8_from_count(Arena *arena, U64 count)
+{
+  String8 result;
+  if (count < 1000) {
+    result = push_str8f(arena, "%llu", count);
+  } else if (count < 1000000) {
+    U64 frac = ((count * 100) / 1000) % 100;
+    if (frac) {
+      result = push_str8f(arena, "%llu.%02lluK", count / 1000, frac);
+    } else {
+      result = push_str8f(arena, "%lluK", count / 1000);
+    }
+  } else if (count < 1000000000) {
+    U64 frac = ((count * 100) / 1000000) % 100;
+    if (frac) {
+      result = push_str8f(arena, "%llu.%02lluM", count / 1000000, frac);
+    } else {
+      result = push_str8f(arena, "%lluM", count / 1000000);
+    }
+  } else {
+    // equals ((count * 100) / 1000000000) % 100 without the multiply, which could overflow
+    U64 frac = (count % 1000000000) / 10000000;
+    if (frac) {
+      result = push_str8f(arena, "%llu.%02lluB", count / 1000000000, frac);
+    } else {
+      result = push_str8f(arena, "%lluB", count / 1000000000);
+    }
+  }
+  return result;
+}
+
+// XXH3 64-bit hash of the string's bytes.
+internal U64
+hash_from_str8(String8 string)
+{
+  XXH64_hash_t hash64 = XXH3_64bits(string.str, string.size);
+  return hash64;
+}
+
diff --git a/src/linker/base_ext/base_strings.h b/src/linker/base_ext/base_strings.h
new file mode 100644
index 00000000..093c4efe
--- /dev/null
+++ b/src/linker/base_ext/base_strings.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+#define MemoryCopyStr8(dst, s) MemoryCopy(dst, (s).str, (s).size)
+
+internal String8 str8_cstring_capped_reverse(void *start, void *cap);
+
+internal U64 str8_find_needle_reverse(String8 string, U64 start_pos, String8 needle, StringMatchFlags flags);
+
+internal int str8_compar(String8 a, String8 b, B32 ignore_case);
+internal int str8_compar_ignore_case(const void *a, const void *b);
+internal int str8_compar_case_sensetive(const void *a, const void *b);
+
+#define str8_list_push_struct(a,l,d) str8_list_push_raw(a, l, d, sizeof(*(d)))
+internal String8Node * str8_list_push_raw(Arena *arena, String8List *list, void *data_ptr, U64 data_size);
+internal U64 str8_list_push_pad(Arena *arena, String8List *list, U64 offset, U64 align);
+internal U64 str8_list_push_pad_front(Arena *arena, String8List *list, U64 offset, U64 align);
+internal String8List str8_list_arr_concat(String8List *v, U64 count);
+internal String8Node * str8_list_push_many(Arena *arena, String8List *list, U64 count);
+
+internal String8 str8_from_bits_u32(Arena *arena, U32 x);
+internal String8 str8_from_bits_u64(Arena *arena, U64 x);
+
+// TODO: remove
+internal String8Node * str8_list_pop_front(String8List *list);
+
+internal String8 str8_from_memory_size2(Arena *arena, U64 size);
+internal String8 str8_from_count(Arena *arena, U64 count);
+
+internal U64 hash_from_str8(String8 string);
+
diff --git a/src/linker/codeview_ext/codeview.c b/src/linker/codeview_ext/codeview.c
new file mode 100644
index 00000000..9e73e245
--- /dev/null
+++ b/src/linker/codeview_ext/codeview.c
@@ -0,0 +1,3246 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+////////////////////////////////
+
+// Hashes a symbol's kind, data size and data bytes with streaming XXH3-64.
+internal U64
+hash_from_cv_symbol(CV_Symbol *symbol)
+{
+  XXH3_state_t hasher;
+  XXH3_64bits_reset(&hasher);
+  XXH3_64bits_update(&hasher, &symbol->kind, sizeof(symbol->kind));
+  XXH3_64bits_update(&hasher, &symbol->data.size, sizeof(symbol->data.size));
+  XXH3_64bits_update(&hasher, symbol->data.str, symbol->data.size);
+  XXH64_hash_t hash = XXH3_64bits_digest(&hasher);
+  return hash;
+}
+
+////////////////////////////////
+// Type Index Helpers
+
+// Allocates an info node recording that a type index belonging to `source` lives at
+// `offset`, appends it to `list` and returns it.
+internal CV_TypeIndexInfo *
+cv_symbol_type_index_info_push(Arena *arena, CV_TypeIndexInfoList *list, CV_TypeIndexSource source, U64 offset)
+{
+  CV_TypeIndexInfo *info = push_array_no_zero(arena, CV_TypeIndexInfo, 1);
+  info->next = 0;
+  info->offset = offset;
+  info->source = source;
+
+  SLLQueuePush(list->first, list->last, info);
+  list->count += 1;
+
+  return info;
+}
+
+// Lists, for a symbol record of the given kind, the offsets inside `data` at which type
+// indices are stored and which source (TPI or IPI) each one refers to. Kinds without
+// a case produce an empty list.
+internal CV_TypeIndexInfoList
+cv_get_symbol_type_index_offsets(Arena *arena, CV_SymKind kind, String8 data)
+{
+  CV_TypeIndexInfoList list = {0};
+  switch (kind) {
+  case CV_SymKind_BUILDINFO: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, OffsetOf(CV_SymBuildInfo, id));
+  } break;
+  case CV_SymKind_GDATA32:
+  case CV_SymKind_LDATA32: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymData32, itype));
+  } break;
+  case CV_SymKind_LPROC32_ID:
+  case CV_SymKind_GPROC32_ID:
+  case CV_SymKind_LPROC32_DPC_ID: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, OffsetOf(CV_SymProc32, itype));
+  } break;
+  case CV_SymKind_GPROC32:
+  case CV_SymKind_LPROC32:
+  case CV_SymKind_LPROC32_DPC: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymProc32, itype));
+  } break;
+  case CV_SymKind_UDT: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymUDT, itype));
+  } break;
+  case CV_SymKind_GTHREAD32: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymThread32, itype));
+  } break;
+  case CV_SymKind_FILESTATIC: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymFileStatic, itype));
+  } break;
+  case CV_SymKind_LOCAL: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymLocal, itype));
+  } break;
+  case CV_SymKind_REGREL32:
+  case CV_SymKind_BPREL32: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymRegrel32, itype));
+  } break;
+  case CV_SymKind_REGISTER: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymRegister, itype));
+  } break;
+  case CV_SymKind_CONSTANT: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymConstant, itype));
+  } break;
+  case CV_SymKind_CALLSITEINFO: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymCallSiteInfo, itype));
+  } break;
+  case CV_SymKind_CALLERS:
+  case CV_SymKind_CALLEES:
+  case CV_SymKind_INLINEES: {
+    // `count` is read from the record itself: clamp it to the number of indices that fit
+    // in `data`, so that every offset produced here stays inside the record.
+    Assert(data.size >= sizeof(CV_SymFunctionList));
+    if (data.size >= sizeof(CV_SymFunctionList)) {
+      CV_SymFunctionList *func_list = (CV_SymFunctionList*)data.str;
+      U64 max_count = (data.size - sizeof(CV_SymFunctionList)) / sizeof(CV_TypeIndex);
+      U64 count = Min(func_list->count, max_count);
+      for (U64 i = 0; i < count; ++i) {
+        cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, sizeof(CV_SymFunctionList) + i * sizeof(CV_TypeIndex));
+      }
+    }
+  } break;
+  case CV_SymKind_INLINESITE: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, OffsetOf(CV_SymInlineSite, inlinee));
+  } break;
+  case CV_SymKind_HEAPALLOCSITE: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_SymHeapAllocSite, itype));
+  } break;
+  }
+  return list;
+}
+
+internal CV_TypeIndexInfoList
+cv_get_leaf_type_index_offsets(Arena *arena, CV_LeafKind leaf_kind, String8 data)
+{
+  CV_TypeIndexInfoList list = {0};
+  switch (leaf_kind) {
+  case CV_LeafKind_NOTYPE:
+  case CV_LeafKind_VTSHAPE:
+  case CV_LeafKind_LABEL:
+  case CV_LeafKind_NULL:
+  case CV_LeafKind_NOTTRAN: {
+    // no type indices
+  } break;
+  case CV_LeafKind_MODIFIER: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafModifier, itype));
+  } break;
+  case CV_LeafKind_POINTER: {
+    cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafPointer, itype));
+    CV_LeafPointer *ptr = (CV_LeafPointer *)data.str;
+    CV_PointerKind ptr_kind = CV_PointerAttribs_ExtractKind(ptr->attribs);
+    if (ptr_kind == CV_PointerKind_BaseType) {
+      // TODO: add CV_LeafPointerBaseType
+      cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, sizeof(CV_LeafPointer) + 0);
+    } else {
+      CV_PointerMode ptr_mode = CV_PointerAttribs_ExtractMode(ptr->attribs);
+      if (ptr_mode == CV_PointerMode_PtrMem || ptr_mode == CV_PointerMode_PtrMethod) {
+        // TODO: add type for the CvLeafPointerMember to syms_cv.mc
+        cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, sizeof(CV_LeafPointer) + 0);
+      }
+    }
+  } break;
+  case CV_LeafKind_ARRAY: {
+    cv_symbol_type_index_info_push(arena, &list,
CV_TypeIndexSource_TPI, OffsetOf(CV_LeafArray, entry_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafArray, index_itype)); + } break; + case CV_LeafKind_CLASS: + case CV_LeafKind_STRUCTURE: + case CV_LeafKind_INTERFACE: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafStruct, field_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafStruct, derived_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafStruct, vshape_itype)); + } break; + case CV_LeafKind_CLASS2: + case CV_LeafKind_STRUCT2: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafStruct2, field_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafStruct2, derived_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafStruct2, vshape_itype)); + } break; + case CV_LeafKind_UNION: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafUnion, field_itype)); + } break; + case CV_LeafKind_ALIAS: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafAlias, itype)); + } break; + case CV_LeafKind_FUNC_ID: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, OffsetOf(CV_LeafFuncId, scope_string_id)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafFuncId, itype)); + } break; + case CV_LeafKind_MFUNC_ID: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMFuncId, owner_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMFuncId, itype)); + } break; + case CV_LeafKind_STRING_ID: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, OffsetOf(CV_LeafStringId, substr_list_id)); + } break; + case 
CV_LeafKind_UDT_SRC_LINE: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafUDTSrcLine, udt_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, OffsetOf(CV_LeafUDTSrcLine, src_string_id)); + } break; + case CV_LeafKind_UDT_MOD_SRC_LINE: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafUDTModSrcLine, udt_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, OffsetOf(CV_LeafUDTModSrcLine, src_string_id)); + } break; + case CV_LeafKind_BUILDINFO: { + Assert(data.size >= sizeof(CV_LeafBuildInfo)); + CV_LeafBuildInfo *build_info = (CV_LeafBuildInfo *)data.str; + for (U16 i = 0; i < build_info->count; ++i) { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, sizeof(CV_LeafBuildInfo) + i * sizeof(CV_ItemId)); + } + } break; + case CV_LeafKind_ENUM: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafEnum, base_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafEnum, field_itype)); + } break; + case CV_LeafKind_PROCEDURE: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafProcedure, ret_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafProcedure, arg_itype)); + } break; + case CV_LeafKind_MFUNCTION: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMFunction, ret_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMFunction, class_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMFunction, this_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMFunction, arg_itype)); + } break; + case CV_LeafKind_VFTABLE: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, 
OffsetOf(CV_LeafVFTable, owner_itype)); + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafVFTable, base_table_itype)); + } break; + case CV_LeafKind_VFTPATH: { + Assert(sizeof(CV_LeafVFPath) <= data.size); + CV_LeafVFPath *vfpath = (CV_LeafVFPath *)data.str; + for (U32 i = 0; i < vfpath->count; ++i) { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, sizeof(CV_LeafVFPath) + i * sizeof(CV_TypeId)); + } + } break; + case CV_LeafKind_TYPESERVER: + case CV_LeafKind_TYPESERVER2: + case CV_LeafKind_TYPESERVER_ST: { + // no type indices + } break; + case CV_LeafKind_SKIP: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafSkip, itype)); + } break; + case CV_LeafKind_SUBSTR_LIST: { + Assert(sizeof(CV_LeafArgList) <= data.size); + CV_LeafArgList *arg_list = (CV_LeafArgList*)data.str; + for (U32 i = 0; i < arg_list->count; ++i) { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, sizeof(CV_LeafArgList) + i * sizeof(CV_TypeIndex)); + } + } break; + case CV_LeafKind_ARGLIST: { + Assert(sizeof(CV_LeafArgList) <= data.size); + CV_LeafArgList *arg_list = (CV_LeafArgList*)data.str; + for (U32 i = 0; i < arg_list->count; ++i) { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, sizeof(CV_LeafArgList) + i * sizeof(CV_TypeIndex)); + } + } break; + case CV_LeafKind_LIST: + case CV_LeafKind_FIELDLIST: { + for (U64 cursor = 0; cursor < data.size; ) { + CV_LeafKind list_member_kind = 0; + U64 read_size = str8_deserial_read_struct(data, cursor, &list_member_kind); + + if(read_size != sizeof(list_member_kind)) { + Assert(!"malformed LF_FIELDLIST"); + break; + } + cursor += read_size; + + switch (list_member_kind) { + default: Assert(!"TODO: handle malformed field member"); break; + case CV_LeafKind_INDEX: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafIndex, itype)); + cursor += sizeof(CV_LeafIndex); + } 
break; + case CV_LeafKind_MEMBER: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafMember, itype)); + cursor += sizeof(CV_LeafMember); + + CV_NumericParsed size; + cursor += cv_read_numeric(data, cursor, &size); + + String8 name; + cursor += str8_deserial_read_cstr(data, cursor, &name); + } break; + case CV_LeafKind_STMEMBER: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafStMember, itype)); + cursor += sizeof(CV_LeafStMember); + + String8 name; + cursor += str8_deserial_read_cstr(data, cursor, &name); + } break; + case CV_LeafKind_METHOD: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafMethod, list_itype)); + cursor += sizeof(CV_LeafMethod); + + String8 name; + cursor += str8_deserial_read_cstr(data, cursor, &name); + } break; + case CV_LeafKind_ONEMETHOD: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafOneMethod, itype)); + + CV_LeafOneMethod onemethod; + cursor += str8_deserial_read_struct(data, cursor, &onemethod); + + CV_MethodProp prop = CV_FieldAttribs_ExtractMethodProp(onemethod.attribs); + if(prop == CV_MethodProp_PureIntro || prop == CV_MethodProp_Intro) + { + cursor += sizeof(U32); // virtoff + } + + String8 name; + cursor += str8_deserial_read_cstr(data, cursor, &name); + } break; + case CV_LeafKind_ENUMERATE: { + // no type index + cursor += sizeof(CV_LeafEnumerate); + CV_NumericParsed value; + cursor += cv_read_numeric(data, cursor, &value); + String8 name; + cursor += str8_deserial_read_cstr(data, cursor, &name); + } break; + case CV_LeafKind_NESTTYPE: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafNestType, itype)); + cursor += sizeof(CV_LeafNestType); + + String8 name; + cursor += str8_deserial_read_cstr(data, cursor, &name); + } break; + case CV_LeafKind_NESTTYPEEX: { + 
cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafNestTypeEx, itype)); + + cursor += sizeof(CV_LeafNestTypeEx); + String8 name; + cursor += str8_deserial_read_cstr(data, cursor, &name); + } break; + case CV_LeafKind_BCLASS: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafBClass, itype)); + + cursor += sizeof(CV_LeafBClass); + CV_NumericParsed offset; + cursor += cv_read_numeric(data, cursor, &offset); + } break; + case CV_LeafKind_VBCLASS: + case CV_LeafKind_IVBCLASS: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafVBClass, itype)); + cursor += sizeof(CV_LeafVBClass); + + CV_NumericParsed virtual_base_pointer; + cursor += cv_read_numeric(data, cursor, &virtual_base_pointer); + + CV_NumericParsed virtual_base_offset; + cursor += cv_read_numeric(data, cursor, &virtual_base_offset); + } break; + case CV_LeafKind_VFUNCTAB: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafVFuncTab, itype)); + cursor += sizeof(CV_LeafVFuncTab); + } break; + case CV_LeafKind_VFUNCOFF: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafVFuncOff, itype)); + cursor += sizeof(CV_LeafVFuncOff); + } break; + } + cursor = AlignPow2(cursor, 4); + } + } break; + case CV_LeafKind_METHOD: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMethod, list_itype)); + } break; + case CV_LeafKind_METHODLIST: { + for (U64 cursor = 0; cursor < data.size; ) { + // read method + CV_LeafMethodListMember method; + U64 read_size = str8_deserial_read_struct(data, cursor, &method); + + // error check read + if (read_size != sizeof(method)) { + Assert(!"malformed LF_METHODLIST"); + break; + } + + // push type index offset + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, cursor + OffsetOf(CV_LeafMethodListMember, itype)); 
+ + // take into account intro virtual offset + CV_MethodProp mprop = CV_FieldAttribs_ExtractMethodProp(method.attribs); + if (mprop == CV_MethodProp_Intro || mprop == CV_MethodProp_PureIntro) { + read_size += sizeof(U32); + } + + // advance + cursor += read_size; + } + } break; + case CV_LeafKind_ONEMETHOD: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafOneMethod, itype)); + } break; + case CV_LeafKind_BITFIELD: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafBitField, itype)); + } break; + case CV_LeafKind_PRECOMP: + case CV_LeafKind_REFSYM: { + // no type indices + } break; + case CV_LeafKind_INDEX: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafIndex, itype)); + } break; + case CV_LeafKind_MEMBER: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafMember, itype)); + } break; + case CV_LeafKind_VFUNCTAB: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafVFuncTab, itype)); + } break; + case CV_LeafKind_VFUNCOFF: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafVFuncOff, itype)); + } break; + case CV_LeafKind_NESTTYPE: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafNestType, itype)); + } break; + case CV_LeafKind_NESTTYPEEX: { + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_TPI, OffsetOf(CV_LeafNestTypeEx, itype)); + } break; + default: { + NotImplemented; + } break; + } + return list; +} + +internal CV_TypeIndexInfoList +cv_get_inlinee_type_index_offsets(Arena *arena, String8 raw_data) +{ + CV_TypeIndexInfoList list = {0}; + + U64 cursor = 0; + + // first four bytes are always signature + CV_C13InlineeLinesSig sig = max_U32; + cursor += str8_deserial_read_struct(raw_data, cursor, &sig); + + while (cursor < raw_data.size) { + // read header + 
CV_C13InlineeSourceLineHeader *header = (CV_C13InlineeSourceLineHeader *) str8_deserial_get_raw_ptr(raw_data, cursor, sizeof(CV_C13InlineeSourceLineHeader)); + + // store type index offset + cv_symbol_type_index_info_push(arena, &list, CV_TypeIndexSource_IPI, cursor + OffsetOf(CV_C13InlineeSourceLineHeader, inlinee)); + + // advance past header + cursor += sizeof(*header); + + // skip extra files + B32 has_extra_files = (sig == CV_C13InlineeLinesSig_EXTRA_FILES); + if (has_extra_files) { + U32 file_count = 0; + cursor += str8_deserial_read_struct(raw_data, cursor, &file_count); + cursor += /* file id: */ sizeof(U32) * file_count; + } + } + + return list; +} + +internal String8Array +cv_get_data_around_type_indices(Arena *arena, CV_TypeIndexInfoList ti_list, String8 data) +{ + String8Array result; + if (ti_list.count > 0) { + result.count = ti_list.count + 1; + result.v = push_array_no_zero(arena, String8, result.count); + + U64 cursor = 0; + U64 ti_idx = 0; + + for (CV_TypeIndexInfo *ti_info = ti_list.first; ti_info != 0; ti_info = ti_info->next, ++ti_idx) { + result.v[ti_idx].size = ti_info->offset - cursor; + result.v[ti_idx].str = data.str + cursor; + cursor = ti_info->offset + sizeof(CV_TypeIndex); + } + + result.v[result.count-1].size = data.size - cursor; + result.v[result.count-1].str = data.str + cursor; + } else { + result.count = 1; + result.v = push_array_no_zero(arena, String8, 1); + result.v[0] = data; + } + return result; +} + +internal CV_TypeIndexSource +cv_type_index_source_from_leaf_kind(CV_LeafKind leaf_kind) +{ + CV_TypeIndexSource source; + if (leaf_kind == CV_LeafKind_FUNC_ID || + leaf_kind == CV_LeafKind_MFUNC_ID || + leaf_kind == CV_LeafKind_BUILDINFO || + leaf_kind == CV_LeafKind_SUBSTR_LIST || + leaf_kind == CV_LeafKind_STRING_ID || + leaf_kind == CV_LeafKind_UDT_SRC_LINE || + leaf_kind == CV_LeafKind_UDT_MOD_SRC_LINE) { + source = CV_TypeIndexSource_IPI; + } else if (leaf_kind == CV_LeafKind_NOTYPE) { + source = CV_TypeIndexSource_NULL; + 
} else { + source = CV_TypeIndexSource_TPI; + } + return source; +} + +//////////////////////////////// + +internal U64 +cv_name_offset_from_symbol(CV_SymKind kind, String8 data) +{ + U64 offset = data.size; + switch (kind) { + case CV_SymKind_COMPILE: break; + case CV_SymKind_OBJNAME: break; + case CV_SymKind_THUNK32: { + offset = sizeof(CV_SymThunk32); + } break; + case CV_SymKind_LABEL32: { + offset = sizeof(CV_SymLabel32); + } break; + case CV_SymKind_REGISTER: { + offset = sizeof(CV_SymRegister); + } break; + case CV_SymKind_CONSTANT: { + offset = sizeof(CV_SymConstant); + CV_NumericParsed size; + offset += cv_read_numeric(data, offset, &size); + } break; + case CV_SymKind_UDT: { + offset = sizeof(CV_SymUDT); + } break; + case CV_SymKind_BPREL32: { + offset = sizeof(CV_SymBPRel32); + } break; + case CV_SymKind_LDATA32: + case CV_SymKind_GDATA32: { + offset = sizeof(CV_SymData32); + } break; + case CV_SymKind_PUB32: { + offset = sizeof(CV_SymPub32); + } break; + case CV_SymKind_LPROC32: + case CV_SymKind_GPROC32: + case CV_SymKind_LPROC32_ID: + case CV_SymKind_GPROC32_ID: { + offset = sizeof(CV_SymProc32); + } break; + case CV_SymKind_REGREL32: { + offset = sizeof(CV_SymRegrel32); + } break; + case CV_SymKind_LTHREAD32: + case CV_SymKind_GTHREAD32: { + offset = sizeof(CV_SymData32); + } break; + case CV_SymKind_COMPILE2: break; + case CV_SymKind_LOCALSLOT: { + offset = sizeof(CV_SymSlot); + } break; + case CV_SymKind_PROCREF: + case CV_SymKind_LPROCREF: + case CV_SymKind_DATAREF: { + offset = sizeof(CV_SymRef2); + } break; + case CV_SymKind_TRAMPOLINE: break; + case CV_SymKind_LOCAL: { + offset = sizeof(CV_SymLocal); + } break; + default: InvalidPath; + } + return offset; +} + +internal String8 +cv_name_from_symbol(CV_SymKind kind, String8 data) +{ + U64 buf_off = cv_name_offset_from_symbol(kind, data); + U8 *buf_ptr = data.str + buf_off; + U8 *buf_opl = data.str + data.size; + String8 name = str8_cstring_capped(buf_ptr, buf_opl); + return name; +} + +internal 
CV_UDTInfo +cv_get_udt_info(CV_LeafKind kind, String8 data) +{ + String8 name = str8_zero(); + String8 unique_name = str8_zero(); + CV_TypeProps props = 0; + + switch(kind) { + case CV_LeafKind_CLASS: + case CV_LeafKind_STRUCTURE: + case CV_LeafKind_INTERFACE: { + U64 cursor = 0; + + CV_LeafStruct udt; + cursor += str8_deserial_read_struct(data, cursor, &udt); + + props = udt.props; + + CV_NumericParsed size; + cursor += cv_read_numeric(data, cursor, &size); + + cursor += str8_deserial_read_cstr(data, cursor, &name); + + if (udt.props & CV_TypeProp_HasUniqueName) { + cursor += str8_deserial_read_cstr(data, cursor, &unique_name); + } + } break; + + case CV_LeafKind_CLASS2: + case CV_LeafKind_STRUCT2: { + U64 cursor = 0; + + CV_LeafStruct2 udt; + cursor += str8_deserial_read_struct(data, cursor, &udt); + + props = udt.props; + + CV_NumericParsed size; + cursor += cv_read_numeric(data, cursor, &size); + + cursor += str8_deserial_read_cstr(data, cursor, &name); + + if (udt.props & CV_TypeProp_HasUniqueName) { + cursor += str8_deserial_read_cstr(data, cursor, &unique_name); + } + } break; + + case CV_LeafKind_UNION: { + U64 cursor = 0; + + CV_LeafUnion udt; + cursor += str8_deserial_read_struct(data, cursor, &udt); + + CV_NumericParsed size; + cursor += cv_read_numeric(data, cursor, &size); + + props = udt.props; + + cursor += str8_deserial_read_cstr(data, cursor, &name); + + if(udt.props & CV_TypeProp_HasUniqueName) { + cursor += str8_deserial_read_cstr(data, cursor, &unique_name); + } + } break; + + case CV_LeafKind_ENUM: { + U64 cursor = 0; + + CV_LeafEnum udt; + cursor += str8_deserial_read_struct(data, cursor, &udt); + + props = udt.props; + + cursor += str8_deserial_read_cstr(data, cursor, &name); + + if(udt.props & CV_TypeProp_HasUniqueName) { + cursor += str8_deserial_read_cstr(data, cursor, &unique_name); + } + } break; + + // dbi/tpi.cpp:1332 + case CV_LeafKind_UDT_SRC_LINE: { + CV_LeafUDTSrcLine *src_line = str8_deserial_get_raw_ptr(data, 0, 
sizeof(CV_LeafUDTSrcLine)); + name = str8_struct(&src_line->udt_itype); + } break; + case CV_LeafKind_UDT_MOD_SRC_LINE: { + CV_LeafUDTModSrcLine *mod_src_line = str8_deserial_get_raw_ptr(data, 0, sizeof(CV_LeafUDTModSrcLine)); + name = str8_struct(&mod_src_line->udt_itype); + } break; + + case CV_LeafKind_ALIAS: { + str8_deserial_read_cstr(data, 0, &name); + } break; + + default: { + InvalidPath; + } break; + } + + CV_UDTInfo info = {0}; + info.name = name; + info.unique_name = unique_name; + info.props = props; + return info; +} + +internal String8 +cv_name_from_udt_info(CV_UDTInfo udt_info) +{ + if (udt_info.props & CV_TypeProp_HasUniqueName) { + return udt_info.unique_name; + } + return udt_info.name; +} + +internal B32 +cv_is_udt_name_anon(String8 name) +{ + // corresponds to fUDTAnon from dbi/tm.cpp:817 + B32 is_anon = str8_match(str8_lit(""), name, 0) || + str8_match(str8_lit("__unnamed"), name, 0) || + str8_match(str8_lit("::"), name, StringMatchFlag_RightSideSloppy) || + str8_match(str8_lit("::__unnamed"), name, StringMatchFlag_RightSideSloppy); + return is_anon; +} + +internal B32 +cv_is_udt(CV_LeafKind kind) +{ + B32 is_udt = kind == CV_LeafKind_CLASS || + kind == CV_LeafKind_STRUCTURE || + kind == CV_LeafKind_CLASS2 || + kind == CV_LeafKind_STRUCT2 || + kind == CV_LeafKind_INTERFACE || + kind == CV_LeafKind_UNION || + kind == CV_LeafKind_ENUM || + kind == CV_LeafKind_UDT_MOD_SRC_LINE || + kind == CV_LeafKind_UDT_SRC_LINE || + kind == CV_LeafKind_ALIAS; + return is_udt; +} + +internal B32 +cv_is_global_symbol(CV_SymKind kind) +{ + B32 is_global_symbol = kind == CV_SymKind_CONSTANT || + kind == CV_SymKind_GDATA16 || + kind == CV_SymKind_GDATA32_16t || + kind == CV_SymKind_GDATA32_ST || + kind == CV_SymKind_GDATA32 || + kind == CV_SymKind_GTHREAD32_16t || + kind == CV_SymKind_GTHREAD32_ST || + kind == CV_SymKind_GTHREAD32; + return is_global_symbol; +} + +internal B32 +cv_is_typedef(CV_SymKind kind) +{ + B32 is_typedef = kind == CV_SymKind_UDT_16t || + kind 
== CV_SymKind_UDT_ST || + kind == CV_SymKind_UDT; + return is_typedef; +} + +internal B32 +cv_is_scope_symbol(CV_SymKind kind) +{ + B32 is_scope = kind == CV_SymKind_GPROC32 || + kind == CV_SymKind_LPROC32 || + kind == CV_SymKind_BLOCK32 || + kind == CV_SymKind_THUNK32 || + kind == CV_SymKind_INLINESITE || + kind == CV_SymKind_INLINESITE2 || + kind == CV_SymKind_WITH32 || + kind == CV_SymKind_SEPCODE || + kind == CV_SymKind_GPROC32_ID || + kind == CV_SymKind_LPROC32_ID; + return is_scope; +} + +internal B32 +cv_is_end_symbol(CV_SymKind kind) +{ + B32 is_end = kind == CV_SymKind_END || + kind == CV_SymKind_PROC_ID_END || + kind == CV_SymKind_INLINESITE_END; + return is_end; +} + +internal B32 +cv_is_leaf_type_server(CV_LeafKind kind) +{ + B32 is_type_server = kind == CV_LeafKind_TYPESERVER || + kind == CV_LeafKind_TYPESERVER2 || + kind == CV_LeafKind_TYPESERVER_ST; + return is_type_server; +} + +internal B32 +cv_is_leaf_pch(CV_LeafKind kind) +{ + B32 is_pch = kind == CV_LeafKind_PRECOMP || + kind == CV_LeafKind_PRECOMP_ST || + kind == CV_LeafKind_PRECOMP_16t; + return is_pch; +} + +internal CV_ObjInfo +cv_obj_info_from_symbol(CV_Symbol symbol) +{ + CV_ObjInfo result; MemoryZeroStruct(&result); + switch (symbol.kind) { + case CV_SymKind_OBJNAME: { + CV_SymObjName *obj_name = (CV_SymObjName *) symbol.data.str; + result.sig = obj_name->sig; + str8_deserial_read_cstr(symbol.data, sizeof(CV_SymObjName), &result.name); + } break; + case CV_SymKind_OBJNAME_ST: { + NotImplemented; + } break; + default: { + InvalidPath; + } break; + } + return result; +} + +internal CV_TypeServerInfo +cv_type_server_info_from_leaf(CV_Leaf leaf) +{ + CV_TypeServerInfo result = {0}; + switch (leaf.kind) { + case CV_LeafKind_TYPESERVER: { + CV_LeafTypeServer *ts = (CV_LeafTypeServer *) leaf.data.str; + + result.name = str8_cstring_capped_reverse(ts + 1, leaf.data.str + leaf.data.size); + result.sig.data1 = ts->sig; + result.age = ts->age; + } break; + case CV_LeafKind_TYPESERVER2: { + 
CV_LeafTypeServer2 *ts = (CV_LeafTypeServer2 *) leaf.data.str; + + Assert(sizeof(result.sig) == sizeof(ts->sig70)); + MemoryCopy(&result.sig, &ts->sig70, sizeof(ts->sig70)); + result.name = str8_cstring_capped_reverse(ts + 1, leaf.data.str + leaf.data.size); + result.age = ts->age; + } break; + case CV_LeafKind_TYPESERVER_ST: { + Assert("TODO: LF_TYPESERVER_ST"); + } break; + default: InvalidPath; + } + return result; +} + +internal CV_PrecompInfo +cv_precomp_info_from_leaf(CV_Leaf leaf) +{ + CV_PrecompInfo result = {0}; + switch (leaf.kind) { + case CV_LeafKind_PRECOMP: { + CV_LeafPreComp *precomp = (CV_LeafPreComp*)leaf.data.str; + result.start_index = precomp->start_index; + result.sig = precomp->sig; + result.leaf_count = precomp->count; + str8_deserial_read_cstr(leaf.data, sizeof(CV_LeafPreComp), &result.obj_name); + } break; + case CV_LeafKind_PRECOMP_16t: { + NotImplemented; + } break; + case CV_LeafKind_PRECOMP_ST: { + NotImplemented; + } break; + default: { + InvalidPath; + } break; + } + return result; +} + +internal B32 +cv_is_reg_sp(CV_Arch arch, CV_Reg reg) +{ + switch (arch) { + case CV_Arch_8086: return reg == CV_Regx86_ESP; + case CV_Arch_X64: return reg == CV_Regx64_RSP; + default: NotImplemented; + } + return 0; +} + +//////////////////////////////// +//~ Leaf Helpers + +internal U64 +cv_compute_leaf_record_size(String8 data, U64 align) +{ + U64 size = 0; + size += sizeof(CV_LeafSize); + size += sizeof(CV_LeafKind); + size += data.size; + size = AlignPow2(size, align); + return size; +} + +internal U64 +cv_serialize_leaf_to_buffer(U8 *buffer, U64 buffer_cursor, U64 buffer_size, CV_LeafKind kind, String8 data, U64 align) +{ + U64 buffer_cursor_start = buffer_cursor; + + // compute record size + U64 record_size = sizeof(kind) + data.size; + Assert(record_size <= CV_LeafSize_Max); + CV_LeafSize record_size16 = (CV_LeafSize)record_size; + + // compute pad + static U8 LEAF_PAD_ARR[] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 
0xfb, 0xfc, 0xfd, 0xfe, 0xff }; + U64 pad_size = AlignPadPow2(data.size, align); + Assert(pad_size <= ArrayCount(LEAF_PAD_ARR)); + + // write header + CV_LeafHeader *header_ptr = (CV_LeafHeader *)(buffer + buffer_cursor); + header_ptr->size = record_size16; + header_ptr->kind = kind; + buffer_cursor += sizeof(*header_ptr); + + // write body + U8 *leaf_data_ptr = buffer + buffer_cursor; + MemoryCopy(leaf_data_ptr, data.str, data.size); + buffer_cursor += data.size; + + // write pad + U8 *pad_data_ptr = buffer + buffer_cursor; + MemoryCopy(pad_data_ptr, &LEAF_PAD_ARR[0], pad_size); + buffer_cursor += pad_size; + + U64 write_size = buffer_cursor - buffer_cursor_start; + return write_size; +} + +internal String8 +cv_serialize_leaf_ex(Arena *arena, CV_LeafKind kind, String8 data, U64 align) +{ + U64 buffer_size = cv_compute_leaf_record_size(data, align); + U8 *buffer = push_array_no_zero(arena, U8, buffer_size); + U64 size = cv_serialize_leaf_to_buffer(buffer, 0, buffer_size, kind, data, align); + String8 raw_leaf = str8(buffer, size); + return raw_leaf; +} + +internal String8 +cv_serialize_leaf(Arena *arena, CV_Leaf *leaf, U64 align) +{ + return cv_serialize_leaf_ex(arena, leaf->kind, leaf->data, align); +} + +internal CV_Leaf +cv_make_leaf(Arena *arena, CV_LeafKind kind, String8 data) +{ + CV_Leaf result = {0}; + String8 raw_leaf = cv_serialize_leaf_ex(arena, kind, data, 1); + cv_deserial_leaf(raw_leaf, 0, 1, &result); + return result; +} + +internal U64 +cv_deserial_leaf(String8 raw_data, U64 off, U64 align, CV_Leaf *leaf_out) +{ + // do we have enough bytes to read header? + Assert(raw_data.size >= sizeof(CV_LeafHeader)); + + CV_LeafHeader *header = (CV_LeafHeader*)(raw_data.str + off); + + // leaf size must have enough bytes for the kind enum + Assert(header->size >= sizeof(CV_LeafKind)); + + // do we have enough bytes to read leaf data? 
+ Assert(sizeof(CV_LeafSize) + header->size <= raw_data.size); + + // fill out leaf + leaf_out->kind = header->kind; + leaf_out->data = str8(raw_data.str + sizeof(CV_LeafHeader), header->size - sizeof(CV_LeafKind)); + + U64 leaf_size = AlignPow2(sizeof(CV_LeafHeader) + leaf_out->data.size, align); + Assert(leaf_size <= raw_data.size); + return leaf_size; +} + +internal CV_Leaf +cv_leaf_from_string(String8 raw_data) +{ + CV_Leaf result; + cv_deserial_leaf(raw_data, 0, 1, &result); + return result; +} + +//////////////////////////////// +//~ Symbol Helpers + +internal U64 +cv_compute_symbol_record_size(CV_Symbol *symbol, U64 align) +{ + U64 size = 0; + size += sizeof(CV_SymSize); + size += sizeof(CV_SymKind); + size += AlignPow2(symbol->data.size, align); + return size; +} + +internal U64 +cv_serialize_symbol_to_buffer(U8 *buffer, U64 buffer_cursor, U64 buffer_size, CV_Symbol *symbol, U64 align) +{ + U64 write_size = cv_compute_symbol_record_size(symbol, align); + Assert(buffer_cursor + write_size <= buffer_size); + + U64 record_size = 0; + record_size += sizeof(symbol->kind); + record_size += AlignPow2(symbol->data.size, align); + + Assert(record_size <= CV_SymSize_Max); + CV_SymSize record_size16 = (CV_SymSize)record_size; + + // init header + CV_SymbolHeader *header = (CV_SymbolHeader *)(buffer + buffer_cursor); + header->size = record_size16; + header->kind = symbol->kind; + + // copy symbol data + U8 *data_dst = (U8 *)(header + 1); + MemoryCopy(data_dst, symbol->data.str, symbol->data.size); + + // set pad bytes + U64 pad_size = AlignPadPow2(symbol->data.size, align); + U8 *pad_dst = data_dst + symbol->data.size; + MemorySet(&pad_dst[0], 0, pad_size); + + return write_size; +} + +internal String8 +cv_serialize_symbol(Arena *arena, CV_Symbol *symbol, U64 align) +{ + U64 buffer_size = cv_compute_symbol_record_size(symbol, align); + U8 *buffer = push_array(arena, U8, buffer_size); + cv_serialize_symbol_to_buffer(buffer, 0, buffer_size, symbol, align); + String8 
result = str8(buffer, buffer_size); + return result; +} + +internal String8 +cv_make_symbol(Arena *arena, CV_SymKind kind, String8 data) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + AssertAlways((data.size + sizeof(kind)) <= CV_SymSize_Max); + CV_SymSize symbol_size = (CV_SymSize)data.size + sizeof(kind); + String8List srl = {0}; + str8_serial_begin(scratch.arena, &srl); + str8_serial_push_struct(scratch.arena, &srl, &symbol_size); + str8_serial_push_struct(scratch.arena, &srl, &kind); + str8_serial_push_string(scratch.arena, &srl, data); + String8 symbol = str8_serial_end(arena, &srl); + scratch_end(scratch); + ProfEnd(); + return symbol; +} + +internal String8 +cv_make_obj_name(Arena *arena, String8 obj_path, U32 sig) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + CV_SymObjName obj = {0}; + obj.sig = sig; + + String8List serial = {0}; + str8_serial_begin(scratch.arena, &serial); + str8_serial_push_struct(scratch.arena, &serial, &obj); + str8_serial_push_cstr(scratch.arena, &serial, obj_path); + String8 result = str8_serial_end(arena, &serial); + + scratch_end(scratch); + ProfEnd(); + return result; +} + +internal String8 +cv_make_comp3(Arena *arena, + CV_Compile3Flags flags, CV_Language lang, CV_Arch arch, + U16 ver_fe_major, U16 ver_fe_minor, U16 ver_fe_build, U16 ver_feqfe, + U16 ver_major, U16 ver_minor, U16 ver_build, U16 ver_qfe, + String8 version_string) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + CV_SymCompile3 comp = {0}; + comp.flags = flags | lang; + comp.machine = arch; + comp.ver_fe_major = ver_fe_major; + comp.ver_fe_minor = ver_fe_minor; + comp.ver_fe_build = ver_fe_build; + comp.ver_feqfe = ver_feqfe; + comp.ver_major = ver_major; + comp.ver_minor = ver_minor; + comp.ver_build = ver_build; + comp.ver_qfe = ver_qfe; + + String8List serial = {0}; + str8_serial_begin(scratch.arena, &serial); + str8_serial_push_struct(scratch.arena, &serial, &comp); + 
str8_serial_push_cstr(scratch.arena, &serial, version_string); + String8 result = str8_serial_end(arena, &serial); + + scratch_end(scratch); + ProfEnd(); + return result; +} + +internal String8 +cv_make_envblock(Arena *arena, String8List string_list) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + String8List serial = {0}; + str8_serial_begin(scratch.arena, &serial); + CV_SymEnvBlock envblock = {0}; + str8_serial_push_struct(scratch.arena, &serial, &envblock); + for (String8Node *n = string_list.first; n != NULL; n = n->next) { + str8_serial_push_cstr(scratch.arena, &serial, n->string); + } + String8 result = str8_serial_end(arena, &serial); + scratch_end(scratch); + ProfEnd(); + return result; +} + +internal CV_Symbol +cv_make_proc_ref(Arena *arena, CV_ModIndex imod, U32 stream_offset, String8 name, B32 is_local) +{ + U64 buffer_size = sizeof(CV_SymRef2) + name.size + 1; + U8 *buffer = push_array_no_zero(arena, U8, buffer_size); + + CV_SymRef2 *ref = (CV_SymRef2*)buffer; + ref->suc_name = 0; + ref->sym_off = stream_offset; + ref->imod = imod + 1; // MSVC adds one + + U8 *name_ptr = (U8*)(ref + 1); + MemoryCopy(name_ptr, name.str, name.size); + name_ptr[name.size] = '\0'; + + CV_Symbol symbol; + symbol.kind = is_local ? 
CV_SymKind_LPROCREF : CV_SymKind_PROCREF; + symbol.data = str8(buffer, buffer_size); + symbol.offset = max_U64; + + return symbol; +} + +internal CV_Symbol +cv_make_pub32(Arena *arena, CV_Pub32Flags flags, U32 off, U16 isect, String8 name) +{ + U64 buffer_size = sizeof(CV_SymPub32) + name.size + 1; + U8 *buffer = push_array_no_zero(arena, U8, buffer_size); + + CV_SymPub32 *pub = (CV_SymPub32 *)buffer; + pub->flags = flags; + pub->off = off; + pub->sec = isect; + + U8 *name_ptr = (U8*)(pub + 1); + MemoryCopy(name_ptr, name.str, name.size); + name_ptr[name.size] = '\0'; + + CV_Symbol symbol; + symbol.kind = CV_SymKind_PUB32; + symbol.data = str8(buffer, buffer_size); + + return symbol; +} + +internal CV_SymbolList +cv_make_proc_refs(Arena *arena, CV_ModIndex imod, CV_SymbolList symbol_list) +{ + CV_SymbolList proc_ref_list = {0}; + for (CV_SymbolNode *symbol_node = symbol_list.first; symbol_node != 0; symbol_node = symbol_node->next) { + CV_Symbol *symbol = &symbol_node->data; + if (symbol->kind == CV_SymKind_GPROC32) { + String8 name = cv_name_from_symbol(symbol->kind, symbol->data); + CV_Symbol ref = cv_make_proc_ref(arena, imod, safe_cast_u32(symbol->offset), name, /* is_local: */ 0); + CV_SymbolNode *proc_ref_node = cv_symbol_list_push(arena, &proc_ref_list); + proc_ref_node->data = ref; + } else if (symbol->kind == CV_SymKind_LPROC32) { + String8 name = cv_name_from_symbol(symbol->kind, symbol->data); + CV_Symbol ref = cv_make_proc_ref(arena, imod, safe_cast_u32(symbol->offset), name, /* is_local */ 1); + CV_SymbolNode *proc_ref_node = cv_symbol_list_push(arena, &proc_ref_list); + proc_ref_node->data = ref; + } + } + return proc_ref_list; +} + +//////////////////////////////// +//~ .debug$S helpers + +internal void +cv_parse_debug_s_c13_(Arena *arena, CV_DebugS *debug_s, String8 raw_debug_s) +{ + for (U64 cursor = 0; cursor + sizeof(CV_C13SubSectionHeader) <= raw_debug_s.size; ) { + // read header + CV_C13SubSectionHeader header = {0}; + cursor += 
str8_deserial_read_struct(raw_debug_s, cursor, &header); + + if (~header.kind & CV_C13SubSectionKind_IgnoreFlag) { + // pick sub-section list + U64 sub_sect_idx = cv_c13_sub_section_idx_from_kind(header.kind); + String8List *sub_sect_list = debug_s->data_list + sub_sect_idx; + + // push data to sub-section + Rng1U64 sub_sect_range = r1u64(cursor, cursor + header.size); + String8 sub_sect_data = str8_substr(raw_debug_s, sub_sect_range); + str8_list_push(arena, sub_sect_list, sub_sect_data); + } + + // advance + cursor += header.size; + cursor = AlignPow2(cursor, CV_C13SubSectionAlign); + } +} + +internal CV_DebugS +cv_parse_debug_s_c13(Arena *arena, String8 raw_debug_s) +{ + CV_DebugS debug_s = {0}; + cv_parse_debug_s_c13_(arena, &debug_s, raw_debug_s); + return debug_s; +} + +internal CV_DebugS +cv_parse_debug_s_c13_list(Arena *arena, String8List raw_debug_s) +{ + CV_DebugS debug_s = {0}; + for (String8Node *node = raw_debug_s.first; node != 0; node = node->next) { + cv_parse_debug_s_c13_(arena, &debug_s, node->string); + } + return debug_s; +} + +internal CV_DebugS +cv_parse_debug_s(Arena *arena, String8 raw_debug_s) +{ + CV_DebugS result; MemoryZeroStruct(&result); + if (raw_debug_s.size >= sizeof(CV_Signature)) { + CV_Signature sig = *(CV_Signature *)raw_debug_s.str; + switch (sig) { + case CV_Signature_C13: { + String8 raw_debug_s_past_sig = str8_substr(raw_debug_s, r1u64(sizeof(sig), raw_debug_s.size)); + result = cv_parse_debug_s_c13(arena, raw_debug_s_past_sig); + } break; + case CV_Signature_C6: { + Assert(!"TODO: handle C6"); + } break; + case CV_Signature_C7: { + Assert(!"TODO: handle C7"); + } break; + case CV_Signature_C11: { + Assert(!"TODO: handle C11"); + } break; + default: Assert(!"invalid signature"); break; + } + } + return result; +} + +internal void +cv_debug_s_concat_in_place(CV_DebugS *dst, CV_DebugS *src) +{ + for (U64 sub_sect_idx = 0; sub_sect_idx < ArrayCount(dst->data_list); sub_sect_idx += 1) { + 
str8_list_concat_in_place(&dst->data_list[sub_sect_idx], &src->data_list[sub_sect_idx]); + } +} + +internal String8List +cv_data_c13_from_debug_s(Arena *arena, CV_DebugS *debug_s, B32 write_sig) +{ + String8List srl = {0}; + str8_serial_begin(arena, &srl); + + if (write_sig) { + CV_Signature sig = CV_Signature_C13; + str8_serial_push_struct(arena, &srl, &sig); + } + + static CV_C13SubSectionKind layout_arr[] = { + CV_C13SubSectionKind_Symbols, + //CV_C13SubSectionKind_Lines, + CV_C13SubSectionKind_FileChksms, + CV_C13SubSectionKind_FrameData, + CV_C13SubSectionKind_InlineeLines, + CV_C13SubSectionKind_IlLines, + CV_C13SubSectionKind_CrossScopeImports, + CV_C13SubSectionKind_CrossScopeExports, + CV_C13SubSectionKind_FuncMDTokenMap, + CV_C13SubSectionKind_TypeMDTokenMap, + CV_C13SubSectionKind_MergedAssemblyInput, + CV_C13SubSectionKind_CoffSymbolRVA, + CV_C13SubSectionKind_XfgHashType, + CV_C13SubSectionKind_XfgHashVirtual, + }; + + for (U64 layout_idx = 0; layout_idx < ArrayCount(layout_arr); layout_idx += 1) { + CV_C13SubSectionKind kind = layout_arr[layout_idx]; + String8List *data = cv_sub_section_ptr_from_debug_s(debug_s, kind); + if (data->total_size > 0) { + U32 size32 = safe_cast_u32(data->total_size); + str8_serial_push_u32(arena, &srl, kind); + str8_serial_push_u32(arena, &srl, size32); + str8_serial_push_data_list(arena, &srl, data->first); + str8_serial_push_align(arena, &srl, 4); + } + } + + String8List *line_data = cv_sub_section_ptr_from_debug_s(debug_s, CV_C13SubSectionKind_Lines); + for (String8Node *line_node = line_data->first; line_node != 0; line_node = line_node->next) { + str8_serial_push_u32(arena, &srl, CV_C13SubSectionKind_Lines); + str8_serial_push_u32(arena, &srl, safe_cast_u32(line_node->string.size)); + str8_serial_push_string(arena, &srl, line_node->string); + str8_serial_push_align(arena, &srl, 4); + } + + return srl; +} + +internal CV_C13SubSectionIdxKind +cv_c13_sub_section_idx_from_kind(CV_C13SubSectionKind kind) +{ + switch (kind) 
{ +#define X(n,c) case CV_C13SubSectionKind_##n: return CV_C13SubSectionIdxKind_##n; + CV_C13SubSectionKindXList(X) +#undef X + } + return CV_C13SubSectionIdxKind_NULL; +} + +internal String8List * +cv_sub_section_ptr_from_debug_s(CV_DebugS *debug_s, CV_C13SubSectionKind kind) +{ + CV_C13SubSectionIdxKind idx = cv_c13_sub_section_idx_from_kind(kind); + return &debug_s->data_list[idx]; +} + +internal String8List +cv_sub_section_from_debug_s(CV_DebugS debug_s, CV_C13SubSectionKind kind) +{ + String8List *list_ptr = cv_sub_section_ptr_from_debug_s(&debug_s, kind); + return *list_ptr; +} + +internal String8 +cv_string_table_from_debug_s(CV_DebugS debug_s) +{ + String8List data_list = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_StringTable); + String8 string_data = str8(0,0); + if (data_list.node_count > 0) { + string_data = data_list.first->string; + } + return string_data; +} + +internal String8 +cv_file_chksms_from_debug_s(CV_DebugS debug_s) +{ + String8List data_list = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_FileChksms); + String8 file_chksms = str8(0,0); + if (data_list.node_count > 0) { + file_chksms = data_list.first->string; + } + return file_chksms; +} + +//////////////////////////////// +//~ String Table Deduper + +internal U64 +cv_string_hash_table_hash(String8 string) +{ + return hash_from_str8(string); +} + +internal int +cv_string_bucket_is_before(void *raw_a, void *raw_b) +{ + CV_StringBucket **a = raw_a; + CV_StringBucket **b = raw_b; + + int is_before; + + if ((*a)->u.idx0 == (*b)->u.idx0) { + is_before = (*a)->u.idx1 < (*b)->u.idx1; + } else { + is_before = (*a)->u.idx0 < (*b)->u.idx0; + } + + return is_before; +} + +internal CV_StringBucket * +cv_string_hash_table_insert_or_update(CV_StringBucket **buckets, U64 cap, U64 hash, CV_StringBucket *new_bucket) +{ + CV_StringBucket *result = 0; + B32 was_bucket_inserted_or_updated = 0; + + U64 best_idx = hash % cap; + U64 idx = best_idx; + + do { + retry:; + CV_StringBucket 
*curr_bucket = buckets[idx]; + + if (curr_bucket == 0) { + CV_StringBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was inserted + was_bucket_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } else if (str8_match(curr_bucket->string, new_bucket->string, 0)) { + if (cv_string_bucket_is_before(&curr_bucket, &new_bucket)) { + // recycle bucket + result = new_bucket; + + // don't need to update, more recent leaf is in the bucket + was_bucket_inserted_or_updated = 1; + + break; + } + + CV_StringBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + + // recycle bucket + result = compare_bucket; + + // new bucket is in the hash table, exit + was_bucket_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } + + // advance + idx = (idx + 1) % cap; + } while (idx != best_idx); + + // are there enough free buckets? 
+ Assert(was_bucket_inserted_or_updated); + + return result; +} + +internal +THREAD_POOL_TASK_FUNC(cv_count_strings_in_debug_s_arr_task) +{ + ProfBeginFunction(); + CV_DedupStringTablesTask *task = raw_task; + CV_StringTableRange *range_list = task->range_lists[task_id]; + + for (CV_StringTableRange *range_n = range_list; range_n != 0; range_n = range_n->next) { + CV_DebugS debug_s = task->arr[range_n->debug_s_idx]; + String8 string_buffer = cv_string_table_from_debug_s(debug_s); + + Assert(range_n->range.min <= range_n->range.max); + Assert(range_n->range.min <= string_buffer.size); + Assert(range_n->range.max <= string_buffer.size); + + U64 count = 0; + for (U64 i = range_n->range.min; i < range_n->range.max; ++i) { + U8 b = string_buffer.str[i]; + if (b == '\0') { + count += 1; + } + } + + ins_atomic_u64_add_eval(&task->string_counts[range_n->debug_s_idx], count); + } + + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(cv_dedup_strings_in_debug_s_arr_task) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + U64 debug_s_idx = task_id; + CV_DedupStringTablesTask *task = raw_task; + CV_DebugS debug_s = task->arr[debug_s_idx]; + + String8 string_table = cv_string_table_from_debug_s(debug_s); + String8List strings_list = str8_split_by_string_chars(scratch.arena, string_table, str8_lit("\0"), 0); + + CV_StringBucket *bucket = 0; + + U64 total_string_size = 0; + U64 total_insert_count = 0; + + U64 string_idx = 0; + + + for (String8Node *string_n = strings_list.first; string_n != 0; string_n = string_n->next, ++string_idx) { + if (bucket == 0) { + bucket = push_array_no_zero(arena, CV_StringBucket, 1); + } + + bucket->u.idx0 = debug_s_idx; + bucket->u.idx1 = string_idx; + bucket->string = string_n->string; + + U64 hash = cv_string_hash_table_hash(string_n->string); + CV_StringBucket *insert_or_update = cv_string_hash_table_insert_or_update(task->buckets, task->bucket_cap, hash, bucket); + + if (insert_or_update == 0) { + total_string_size += 
string_n->string.size; + total_insert_count += 1; + } + + if (insert_or_update != bucket) { + bucket = 0; + } + } + + ins_atomic_u64_add_eval(&task->total_string_size, total_string_size); + ins_atomic_u64_add_eval(&task->total_insert_count, total_insert_count); + + scratch_end(scratch); + ProfEnd(); +} + +internal CV_StringHashTable +cv_dedup_string_tables(TP_Arena *arena, TP_Context *tp, U64 count, CV_DebugS *arr) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + ProfBegin("Compute Total Weight"); + U64 total_weight = 0; + for (U64 i = 0; i < count; ++i) { + String8 string_table = cv_string_table_from_debug_s(arr[i]); + total_weight += string_table.size; + } + ProfEnd(); + + U64 per_task_weight = CeilIntegerDiv(total_weight, tp->worker_count); + U64 task_weight = 0; + U64 task_id = 0; + CV_StringTableRange **range_lists = push_array(scratch.arena, CV_StringTableRange *, tp->worker_count); + + ProfBegin("Divide Work"); + for (U64 debug_s_idx = 0; debug_s_idx < count; ++debug_s_idx) { + String8 string_table = cv_string_table_from_debug_s(arr[debug_s_idx]); + + for (U64 cursor = 0; cursor < string_table.size; cursor += per_task_weight) { + if (task_weight >= per_task_weight) { + task_id = (task_id + 1) % tp->worker_count; + task_weight = 0; + } + + U64 max_range_weight = Min(per_task_weight, string_table.size - cursor); + + CV_StringTableRange *node = push_array(scratch.arena, CV_StringTableRange, 1); + node->range = rng_1u64(cursor, cursor + max_range_weight); + node->debug_s_idx = debug_s_idx; + + SLLStackPush(range_lists[task_id], node); + task_weight += max_range_weight; + } + } + ProfEnd(); + + ProfBegin("Count"); + CV_DedupStringTablesTask task = {0}; + task.arr = arr; + task.range_lists = range_lists; + task.string_counts = push_array(scratch.arena, U64, count); + tp_for_parallel(tp, 0, tp->worker_count, cv_count_strings_in_debug_s_arr_task, &task); + ProfEnd(); + + ProfBegin("Dedup"); + U64 total_string_count = 
sum_array_u64(count, task.string_counts); + task.bucket_cap = (U64)((F64)total_string_count * 1.3); + task.buckets = push_array(arena->v[0], CV_StringBucket *, task.bucket_cap); + tp_for_parallel(tp, arena, count, cv_dedup_strings_in_debug_s_arr_task, &task); + ProfEnd(); + + CV_StringHashTable string_ht = {0}; + string_ht.total_string_size = task.total_string_size; + string_ht.total_insert_count = task.total_insert_count; + string_ht.bucket_cap = task.bucket_cap; + string_ht.buckets = task.buckets; + + scratch_end(scratch); + ProfEnd(); + return string_ht; +} + +internal void +cv_string_hash_table_assign_buffer_offsets(TP_Context *tp, CV_StringHashTable string_ht) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + ProfBegin("Count Strings"); + U64 string_count = 0; + for (U64 i = 0; i < string_ht.bucket_cap; ++i) { + if (string_ht.buckets[i] != 0) { + string_count += 1; + } + } + ProfEnd(); + + ProfBegin("Push"); + CV_StringBucket **strings = push_array_no_zero(scratch.arena, CV_StringBucket *, string_count); + ProfEnd(); + + ProfBegin("Copy Present Buckets"); + for (U64 i = 0, string_idx = 0; i < string_ht.bucket_cap; ++i) { + if (string_ht.buckets[i] != 0) { + strings[string_idx++] = string_ht.buckets[i]; + } + } + ProfEnd(); + + ProfBegin("Sort"); + radsort(strings, string_count, cv_string_bucket_is_before); + ProfEnd(); + + ProfBegin("Assign Offsets"); + for (U64 i = 0, offset_cursor = 0; i < string_count; ++i) { + CV_StringBucket *s = strings[i]; + s->u.offset = offset_cursor; + offset_cursor += s->string.size + 1; + } + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); +} + +internal CV_StringBucket * +cv_string_hash_table_lookup(CV_StringHashTable ht, String8 string) +{ + U64 hash = cv_string_hash_table_hash(string); + U64 best_idx = hash % ht.bucket_cap; + U64 idx = best_idx; + + do { + if (ht.buckets[idx] == 0) { + break; + } + + if (str8_match(ht.buckets[idx]->string, string, 0)) { + return ht.buckets[idx]; + } + + idx = (idx + 1 % 
ht.bucket_cap); + } while (idx != best_idx); + + return 0; +} + +internal +THREAD_POOL_TASK_FUNC(cv_pack_string_hash_table_task) +{ + ProfBeginFunction(); + CV_PackStringHashTableTask *task = raw_task; + Rng1U64 range = task->ranges[task_id]; + for (U64 bucket_idx = range.min; bucket_idx < range.max; ++bucket_idx) { + CV_StringBucket *bucket = task->buckets[bucket_idx]; + if (bucket) { + MemoryCopy(task->buffer + bucket->u.offset, bucket->string.str, bucket->string.size); + task->buffer[bucket->u.offset + bucket->string.size] = '\0'; + } + } + ProfEnd(); +} + +internal String8 +cv_pack_string_hash_table(Arena *arena, TP_Context *tp, CV_StringHashTable string_ht) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + U64 buffer_size = string_ht.total_string_size + /* nulls: */ string_ht.total_insert_count; + U8 *buffer = push_array_no_zero(arena, U8, buffer_size); + + CV_PackStringHashTableTask task = {0}; + task.buckets = string_ht.buckets; + task.buffer = buffer; + task.ranges = tp_divide_work(scratch.arena, string_ht.bucket_cap, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, cv_pack_string_hash_table_task, &task); + + String8 result = str8(buffer, buffer_size); + scratch_end(scratch); + ProfEnd(); + return result; +} + +//////////////////////////////// +//~ Symbol Deduper + +internal int +cv_symbol_deduper_is_before(void *raw_a, void *raw_b) +{ + return raw_a < raw_b; +} + +internal CV_SymbolNode ** +cv_symbol_deduper_insert_or_update(CV_SymbolNode ***buckets, U64 cap, U64 hash, CV_SymbolNode **new_bucket) +{ + CV_SymbolNode **result = 0; + B32 is_inserted_or_updated = 0; + + U64 best_idx = hash % cap; + U64 idx = best_idx; + + do { + retry:; + CV_SymbolNode **curr_bucket = buckets[idx]; + + Assert(curr_bucket != new_bucket); + + if (curr_bucket == 0) { + CV_SymbolNode **compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was 
inserted + is_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } else if ((*curr_bucket)->data.kind == (*new_bucket)->data.kind && + (*curr_bucket)->data.data.size == (*new_bucket)->data.data.size && + MemoryMatch((*curr_bucket)->data.data.str, (*new_bucket)->data.data.str, (*new_bucket)->data.data.size)) { + if (cv_symbol_deduper_is_before(curr_bucket, new_bucket)) { + result = new_bucket; + + is_inserted_or_updated = 1; + + // don't need to update, more recent leaf is in the bucket + break; + } + + CV_SymbolNode **compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + if (compare_bucket == curr_bucket) { + result = compare_bucket; + + is_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } + + // advance + idx = (idx + 1) % cap; + } while (idx != best_idx); + + Assert(is_inserted_or_updated); + + return result; +} + +internal +THREAD_POOL_TASK_FUNC(cv_symbol_deduper_insert_task) +{ + ProfBeginFunction(); + CV_SymbolDeduperTask *task = raw_task; + Rng1U64 range = task->ranges[task_id]; + for (U64 symbol_idx = range.min; symbol_idx < range.max; ++symbol_idx) { + CV_SymbolNode **symbol_node = &task->symbols[symbol_idx]; + U64 hash = hash_from_cv_symbol(&(*symbol_node)->data); + cv_symbol_deduper_insert_or_update(task->u.buckets, task->cap, hash, symbol_node); + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(cv_symbol_deduper_deref_buckets_task) +{ + ProfBeginFunction(); + CV_SymbolDeduperTask *task = raw_task; + Rng1U64 range = task->ranges[task_id]; + for (U64 bucket_idx = range.min; bucket_idx < range.max; ++bucket_idx) { + CV_SymbolNode **bucket = task->u.buckets[bucket_idx]; + if (bucket) { + task->u.deref_buckets[bucket_idx] = *bucket; + } + } + ProfEnd(); +} + +internal void +cv_dedup_symbol_ptr_array(TP_Context *tp, CV_SymbolPtrArray *symbols) +{ + ProfBeginDynamic("Dedup Symbols [Count %llu]", symbols->count); + Temp scratch = 
scratch_begin(0, 0); + + ProfBegin("Setup Task"); + CV_SymbolDeduperTask task = {0}; + task.symbols = symbols->v; + task.cap = (U64)((F64)symbols->count * 1.3); + task.u.buckets = push_array(scratch.arena, CV_SymbolNode **, task.cap); + ProfEnd(); + + ProfBegin("Dedup"); + task.ranges = tp_divide_work(scratch.arena, symbols->count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, cv_symbol_deduper_insert_task, &task); + ProfEnd(); + + ProfBegin("Deref Buckets"); + task.ranges = tp_divide_work(scratch.arena, task.cap, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, cv_symbol_deduper_deref_buckets_task, &task); + ProfEnd(); + + ProfBegin("Copy Extant Buckets"); + U64 unique_symbol_count = 0; + for (U64 bucket_idx = 0; bucket_idx < task.cap; ++bucket_idx) { + CV_SymbolNode *bucket = task.u.deref_buckets[bucket_idx]; + if (bucket) { + symbols->v[unique_symbol_count++] = bucket; + } + } + ProfEnd(); + + Assert(unique_symbol_count < symbols->count); + symbols->count = unique_symbol_count; + + ProfBeginDynamic("Sort [Count %llu]", symbols->count); + radsort(symbols->v, symbols->count, cv_symbol_deduper_is_before); + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); +} + +//////////////////////////////// +//~ .debug$T helpers + +internal CV_DebugT +cv_debug_t_from_data_arr(Arena *arena, String8Array data_arr, U64 align) +{ + ProfBegin("Upfront parse"); + U64 max_leaf_count = 0; + for (U64 data_idx = 0; data_idx < data_arr.count; data_idx += 1) { + String8 data = data_arr.v[data_idx]; + for (U64 cursor = 0; cursor < data.size; ) { + CV_Leaf leaf; + cursor += cv_deserial_leaf(data, cursor, align, &leaf); + max_leaf_count += 1; + } + } + ProfEnd(); + + U8 **leaf_arr = push_array_no_zero(arena, U8 *, max_leaf_count); + U64 leaf_count = 0; + for (U64 data_idx = 0; data_idx < data_arr.count; data_idx += 1) { + String8 data = data_arr.v[data_idx]; + + U64 cursor = 0; + while (cursor < data.size) { + CV_Leaf leaf; + U64 read_size = cv_deserial_leaf(data, 
cursor, align, &leaf); + + Assert(leaf_count < max_leaf_count); + leaf_arr[leaf_count] = str8_deserial_get_raw_ptr(data, cursor, read_size); + leaf_count += 1; + + // advance cursor + cursor += read_size; + } + } + + CV_DebugT debug_t = {0}; + debug_t.count = leaf_count; + debug_t.v = leaf_arr; + return debug_t; +} + +internal CV_DebugT +cv_debug_t_from_data(Arena *arena, String8 data, U64 align) +{ + String8Array arr = {0}; + arr.count = 1; + arr.v = &data; + return cv_debug_t_from_data_arr(arena, arr, align); +} + +internal CV_Leaf +cv_debug_t_get_leaf(CV_DebugT debug_t, U64 leaf_idx) +{ + Assert(leaf_idx < debug_t.count); + + U8 *ptr = debug_t.v[leaf_idx]; + String8 data = str8(ptr, max_U64); + + CV_Leaf leaf; + cv_deserial_leaf(data, 0, 1, &leaf); + + U64 size = cv_header_struct_size_from_leaf_kind(leaf.kind); + Assert(size <= leaf.data.size); + + return leaf; +} + +internal String8 +cv_debug_t_get_raw_leaf(CV_DebugT debug_t, U64 leaf_idx) +{ + Assert(leaf_idx < debug_t.count); + U8 *leaf_ptr = debug_t.v[leaf_idx]; + CV_LeafSize *size_ptr = (CV_LeafSize *)leaf_ptr; + CV_LeafSize total_size = sizeof(*size_ptr) + *size_ptr; + String8 raw_leaf = str8(leaf_ptr, total_size); + return raw_leaf; +} + +internal CV_LeafHeader * +cv_debug_t_get_leaf_header(CV_DebugT debug_t, U64 leaf_idx) +{ + Assert(leaf_idx < debug_t.count); + CV_LeafHeader *leaf_header = (CV_LeafHeader *) debug_t.v[leaf_idx]; + return leaf_header; +} + +internal B32 +cv_debug_t_is_pch(CV_DebugT debug_t) +{ + if (debug_t.count > 0) { + CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, 0); + return cv_is_leaf_pch(leaf.kind); + } + return 0; +} + +internal B32 +cv_debug_t_is_type_server(CV_DebugT debug_t) +{ + if (debug_t.count > 0) { + CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, 0); + return cv_is_leaf_type_server(leaf.kind); + } + return 0; +} + +internal U64 +cv_debug_t_array_count_leaves(U64 count, CV_DebugT *arr) +{ + U64 total_leaf_count = 0; + for (U64 i = 0; i < count; i += 1) { + total_leaf_count += 
arr[i].count; + } + return total_leaf_count; +} + +THREAD_POOL_TASK_FUNC(cv_str8_list_from_debug_t_task) +{ + CV_Str8ListFromDebugT *task = raw_task; + for (U64 leaf_idx = task->ranges[task_id].min; leaf_idx < task->ranges[task_id].max; ++leaf_idx) { + String8Node *node = &task->nodes[leaf_idx]; + node->string = cv_debug_t_get_raw_leaf(task->debug_t, leaf_idx); + str8_list_push_node(&task->lists[task_id], node); + } +} + +internal String8List +cv_str8_list_from_debug_t_parallel(TP_Context *tp, Arena *arena, CV_DebugT debug_t) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + // build lists in parallel + CV_Str8ListFromDebugT task = {0}; + task.debug_t = debug_t; + task.ranges = tp_divide_work(scratch.arena, debug_t.count, tp->worker_count); + task.lists = push_array(scratch.arena, String8List, tp->worker_count); + task.nodes = push_array_no_zero(arena, String8Node, debug_t.count); + tp_for_parallel(tp, 0, tp->worker_count, cv_str8_list_from_debug_t_task, &task); + + // concat output lists + String8List list = {0}; + for (U64 task_id = 0; task_id < tp->worker_count; ++task_id) { + str8_list_concat_in_place(&list, &task.lists[task_id]); + } + + scratch_end(scratch); + ProfEnd(); + return list; +} + +// $$Symbols + +internal void +cv_parse_symbol_sub_section(Arena *arena, CV_SymbolList *list, U64 offset_base, String8 data, U64 align) +{ + for (U64 cursor = 0, opl = data.size; cursor < opl; ) { + // read symbol header + CV_SymbolHeader header; + cursor += str8_deserial_read_struct(data, cursor, &header); + + // size from header has to be larger than 2 bytes + if (header.size < sizeof(header.kind)) { + Assert(!"TODO: error handle invalid symbol data"); + break; + } + + // is there enough bytes in the range? 
+ U64 symbol_opl = cursor + (header.size - sizeof(header.kind)); + if (symbol_opl > opl) { + Assert(!"TODO: error handle corrupted symbol data"); + break; + } + + // get symbol data + Rng1U64 symbol_data_range = r1u64(cursor, symbol_opl); + String8 symbol_data = str8_substr(data, symbol_data_range); + + // init symbol + CV_SymbolNode *node = cv_symbol_list_push(arena, list); + node->data.offset = offset_base + cursor; + node->data.kind = header.kind; + node->data.data = symbol_data; + + // advance cursor + cursor = symbol_opl; + cursor = AlignPow2(cursor, align); + } +} + +internal CV_SymbolList +cv_symbol_list_from_data_list(Arena *arena, String8List data_list, U64 align) +{ + CV_SymbolList symbol_list = {0}; + U64 cursor = 0; + for (String8Node *sect = data_list.first; sect != 0; cursor += sect->string.size, sect = sect->next) { + cv_parse_symbol_sub_section(arena, &symbol_list, cursor, sect->string, align); + } + return symbol_list; +} + +internal void +cv_symbol_list_push_node(CV_SymbolList *list, CV_SymbolNode *node) +{ + node->prev = 0; + node->next = 0; + DLLPushBack(list->first, list->last, node); + list->count += 1; +} + +internal CV_SymbolNode * +cv_symbol_list_push(Arena *arena, CV_SymbolList *list) +{ + CV_SymbolNode *node = push_array(arena, CV_SymbolNode, 1); + cv_symbol_list_push_node(list, node); + return node; +} + +internal CV_SymbolNode * +cv_symbol_list_push_data(Arena *arena, CV_SymbolList *list, CV_SymKind kind, String8 data) +{ + CV_SymbolNode *node = cv_symbol_list_push(arena, list); + node->data.kind = kind; + node->data.data = data; + return node; +} + +internal CV_SymbolNode * +cv_symbol_list_push_many(Arena *arena, CV_SymbolList *list, U64 count) +{ + CV_SymbolNode *node_arr = push_array_no_zero(arena, CV_SymbolNode, 1); + for (U64 node_idx = 0; node_idx < count; node_idx += 1) { + cv_symbol_list_push_node(list, &node_arr[node_idx]); + } + return node_arr; +} + +internal void +cv_symbol_list_remove_node(CV_SymbolList *list, CV_SymbolNode 
*node) +{ + Assert(list->count > 0); + list->count -= 1; + DLLRemove(list->first, list->last, node); +} + +internal void +cv_symbol_list_concat_in_place(CV_SymbolList *list, CV_SymbolList *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal void +cv_symbol_list_concat_in_place_arr(CV_SymbolList *list, U64 count, CV_SymbolList *to_concat) +{ + SLLConcatInPlaceArray(list, to_concat, count); +} + +internal U64 +cv_symbol_list_arr_get_count(U64 count, CV_SymbolList *list_arr) +{ + U64 result = 0; + for (U64 idx = 0; idx < count; idx += 1) { + result += list_arr[idx].count; + } + return result; +} + +internal String8List +cv_data_from_symbol_list(Arena *arena, CV_SymbolList symbol_list, U64 align) +{ + String8List data_list = {0}; + for (CV_SymbolNode *node = symbol_list.first; node != 0; node = node->next) { + String8 data = cv_serialize_symbol(arena, &node->data, align); + str8_list_push(arena, &data_list, data); + } + return data_list; +} + +internal +THREAD_POOL_TASK_FUNC(cv_symbol_list_syncer) +{ + ProfBeginFunction(); + + CV_SymbolListSyncer *task = raw_task; + + // context shortcuts + Rng1U64 list_range = task->list_range_arr[task_id]; + U64 symbol_base = task->symbol_base_arr[task_id]; + + for (U64 list_idx = list_range.min, symbol_idx = symbol_base; list_idx < list_range.max; list_idx += 1) { + // pick up assigned list + CV_SymbolList list = task->list_arr[list_idx]; + + // fill out assigned range in the symbol array + for (CV_SymbolNode *node = list.first; node != 0; node = node->next, symbol_idx += 1) { + task->symbol_arr[symbol_idx] = node; + } + } + + ProfEnd(); +} + +internal CV_SymbolPtrArray +cv_symbol_ptr_array_from_list(Arena *arena, TP_Context *tp, U64 count, CV_SymbolList *list_arr) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + U64 total_count = cv_symbol_list_arr_get_count(count, list_arr); + + CV_SymbolListSyncer task = {0}; + task.list_arr = list_arr; + task.symbol_arr = push_array_no_zero(arena, 
CV_SymbolNode *, total_count); + task.symbol_base_arr = push_array_no_zero(scratch.arena, U64, tp->worker_count); + task.list_range_arr = tp_divide_work(scratch.arena, count, tp->worker_count); + + for (U64 thread_idx = 0, symbol_base = 0; thread_idx < tp->worker_count; thread_idx += 1) { + task.symbol_base_arr[thread_idx] = symbol_base; + Rng1U64 range = task.list_range_arr[thread_idx]; + for (U64 list_idx = range.min; list_idx < range.max; list_idx += 1) { + symbol_base += list_arr[list_idx].count; + } + } + + tp_for_parallel(tp, 0, tp->worker_count, cv_symbol_list_syncer, &task); + + CV_SymbolPtrArray result = {0}; + result.count = total_count; + result.v = task.symbol_arr; + + scratch_end(scratch); + ProfEnd(); + return result; +} + +internal CV_Scope * +cv_scope_list_push(Arena *arena, CV_ScopeList *list) +{ + CV_Scope *node = push_array(arena, CV_Scope, 1); + SLLQueuePush(list->first, list->last, node); + return node; +} + +internal CV_SymbolList +cv_global_scope_symbols_from_list(Arena *arena, CV_SymbolList list) +{ + CV_SymbolList gsym_list = {0}; + S64 scope_depth = 0; + for (CV_SymbolNode *symbol_n = list.first; symbol_n != 0; symbol_n = symbol_n->next) { + CV_Symbol symbol = symbol_n->data; + if (cv_is_global_symbol(symbol.kind) && scope_depth == 0) { + cv_symbol_list_push_data(arena, &gsym_list, symbol.kind, symbol.data); + } else if (cv_is_scope_symbol(symbol.kind)) { + scope_depth += 1; + } else if (cv_is_end_symbol(symbol.kind)) { + scope_depth -= 1; + if (scope_depth < 0) { + break; + } + } + } + return gsym_list; +} + +internal CV_ScopeList +cv_symbol_tree_from_symbol_list(Arena *arena, CV_SymbolList list) +{ + Temp scratch = scratch_begin(&arena, 1); + + CV_ScopeList root = {0}; + + // setup root frame + CV_ScopeFrame *stack = push_array(scratch.arena, CV_ScopeFrame, 1); + stack->list = &root; + + for (CV_SymbolNode *symbol_node = list.first; symbol_node != 0; symbol_node = symbol_node->next) { + // store symbol in current scope + CV_Scope *scope 
= cv_scope_list_push(arena, stack->list); + scope->symbol = symbol_node->data; + + // does this symbol define a new scope? + if (cv_is_scope_symbol(symbol_node->data.kind)) { + CV_ScopeFrame *frame = push_array(scratch.arena, CV_ScopeFrame, 1); + frame->list = push_array(arena, CV_ScopeList, 1); + SLLStackPush(stack, frame); + } + // does this symbol end current scope? + else if (cv_is_end_symbol(symbol_node->data.kind)) { + CV_ScopeFrame *prev_stack_frame = stack->next; + if (prev_stack_frame) { + // set children in parent scope + CV_Scope *parent_scope = prev_stack_frame->list->last; + parent_scope->children = stack->list; + } + + // pop frame + SLLStackPop(stack); + } + } + + scratch_end(scratch); + return root; +} + +internal CV_SymbolList +cv_build_symbol_tree(Arena *arena, CV_ScopeList symbol_tree, U64 symbol_base, U64 align) +{ + Temp scratch = scratch_begin(&arena, 1); + + CV_SymbolList result = {0}; + + U64 cursor = symbol_base; + + // setup root frame + CV_ScopeFrame *stack = push_array(scratch.arena, CV_ScopeFrame, 1); + stack->list = &symbol_tree; + stack->curr = stack->list->first; + stack->symbol_off = cursor; + + for (; stack != 0; ) { + for (; stack->curr != 0; stack->curr = stack->curr->next) { + CV_Scope *scope = stack->curr; + CV_Symbol *symbol = &scope->symbol; + + // store symbol + CV_SymbolNode *symbol_node = cv_symbol_list_push_data(arena, &result, symbol->kind, symbol->data); + symbol_node->data.offset = cursor; + + // read & advance cursor + U64 record_size = cv_compute_symbol_record_size(symbol, align); + cursor += record_size; + + if (scope->children) { + // in every scoped symbol parent and end offsets follow record header + U32 *parent_off_ptr = (U32 *)symbol->data.str; + U32 *end_off_ptr = (U32 *)(parent_off_ptr + 1); + + // write parent symbol offset + U64 parent_off64 = stack->symbol_off; + U32 parent_off32 = safe_cast_u32(parent_off64); + *parent_off_ptr = parent_off32; + + // advance to next node so after stack pop we resume from 
correct node + stack->curr = stack->curr->next; + + // push new scope frame + CV_ScopeFrame *frame = push_array(scratch.arena, CV_ScopeFrame, 1); + frame->symbol_off = symbol_node->data.offset; + frame->list = scope->children; + frame->curr = frame->list->first; + frame->parent_off_ptr = parent_off_ptr; + frame->end_off_ptr = end_off_ptr; + SLLStackPush(stack, frame); + + break; + } + } + + if (stack->curr == 0) { + // write end symbol offset + if (stack->end_off_ptr) { + U64 end_symbol_size = sizeof(CV_SymbolHeader); + U64 end_off64 = cursor - end_symbol_size; + U32 end_off32 = safe_cast_u32(end_off64); + *stack->end_off_ptr = end_off32; + } + + // pop scope + SLLStackPop(stack); + } + } + + scratch_end(scratch); + return result; +} + +internal U64 +cv_patch_symbol_tree_offsets(CV_SymbolList list, U64 base_offset, U64 align) +{ + Temp scratch = scratch_begin(0, 0); + + struct Stack { + struct Stack *next; + CV_Symbol *symbol; + U64 offset; + }; + struct Stack *stack = 0; + struct Stack *free_list = 0; + + U64 cursor = base_offset; + + for (CV_SymbolNode *symbol_n = list.first; symbol_n != 0; symbol_n = symbol_n->next) { + CV_Symbol symbol = symbol_n->data; + if (cv_is_scope_symbol(symbol.kind)) { + // NOTE: We don't patch 'next' offset in PROC symbols because + // it's not used by visual studio and MSVC leaves the offsets + // zeroed. LLD is on the same page. 
+ Assert(symbol.data.size >= sizeof(U32)*2); + + // patch symbol parent + if (stack) { + U32 *parent_off_ptr = (U32 *)symbol.data.str; + *parent_off_ptr = stack->offset; + } + + // reuse/alloc frame + struct Stack *frame; + if (free_list) { + frame = free_list; + SLLStackPop(free_list); + } else { + frame = push_array_no_zero(scratch.arena, struct Stack, 1); + } + + // push frame to the stack + frame->symbol = &symbol_n->data; + frame->offset = cursor; + SLLStackPush(stack, frame); + } else if (cv_is_end_symbol(symbol.kind)) { + // patch symbol end + U32 *end_off_ptr = (U32 *)stack->symbol->data.str + /* skip parent off */ 1; + *end_off_ptr = cursor; + + // recycle frame + struct Stack *free_frame = stack; + SLLStackPop(stack); + SLLStackPush(free_list, free_frame); + } + + // advance cursor + cursor += cv_compute_symbol_record_size(&symbol, align); + } + + scratch_end(scratch); + U64 serial_size = cursor - base_offset; + return serial_size; +} + +// $$FileChksms + +#if 0 +internal String8 +cv_c13_file_chksms_from_sub_sections(String8 c13_data, CV_C13Parsed *ss) +{ + ProfBeginFunction(); + String8 file_chksms = str8(0,0); + CV_C13SubSectionList file_chksms_list = ss->v[CV_C13SubSectionIdxKind_FileChksms]; + if (file_chksms_list.count > 0) { + Assert(file_chksms_list.count == 1); + CV_C13SubSectionNode *file_chksms_node = file_chksms_list.first; + Assert(file_chksms_node->kind == CV_C13SubSectionKind_FileChksms); + file_chksms = str8_substr(c13_data, file_chksms_node->range); + } + ProfEnd(); + return file_chksms; +} +#endif + +internal void +cv_parse_checksum_data(Arena *arena, CV_ChecksumList *list, String8 checksum_data) +{ + for (U64 cursor = 0, cursor_opl = checksum_data.size; cursor < cursor_opl; ) { + U64 expected_cursor_after_checksum = cursor + sizeof(CV_C13Checksum); + if (expected_cursor_after_checksum > cursor_opl) { + break; + } + CV_C13Checksum *header = (CV_C13Checksum *)str8_deserial_get_raw_ptr(checksum_data, cursor, sizeof(CV_C13Checksum)); + 
cursor += sizeof(CV_C13Checksum); + + U64 expected_cursor_after_value = cursor + header->len; + if (expected_cursor_after_value > cursor_opl) { + break; + } + String8 value = str8(0,0); + cursor += str8_deserial_read_block(checksum_data, cursor, header->len, &value); + cursor = AlignPow2(cursor, 4); + + CV_ChecksumNode *node = push_array(arena, CV_ChecksumNode, 1); + node->next = 0; + + CV_Checksum *data = &node->data; + data->header = header; + data->value = value; + + SLLQueuePush(list->first, list->last, node); + list->count += 1; + } +} + +internal CV_ChecksumList +cv_c13_parse_checksum_data_list(Arena *arena, String8List checksum_data_list) +{ + CV_ChecksumList result = {0}; + for (String8Node *node = checksum_data_list.first; node != 0; node = node->next) { + cv_parse_checksum_data(arena, &result, node->string); + } + return result; +} + +internal void +cv_c13_patch_string_offsets_in_checksum_list(CV_ChecksumList checksum_list, String8 string_data, U64 string_data_base_offset, CV_StringHashTable string_ht) +{ + for (CV_ChecksumNode *node = checksum_list.first; node != 0; node = node->next) { + CV_Checksum *checksum = &node->data; + CV_C13Checksum *header = checksum->header; + String8 name = str8_cstring_capped(string_data.str + header->name_off, string_data.str + string_data.size); + CV_StringBucket *bucket = cv_string_hash_table_lookup(string_ht, name); + + U64 name_off64 = string_data_base_offset + bucket->u.offset; + header->name_off = safe_cast_u32(name_off64); + } +} + +internal String8List +cv_c13_collect_source_file_names(Arena *arena, CV_ChecksumList checksum_list, String8 string_data) +{ + String8List source_file_name_list = {0}; + for (CV_ChecksumNode *node = checksum_list.first; node != 0; node = node->next) { + CV_Checksum *checksum = &node->data; + CV_C13Checksum *header = checksum->header; + Assert(header->name_off < string_data.size); + String8 name = str8_cstring_capped(string_data.str + header->name_off, string_data.str + string_data.size); + 
str8_list_push(arena, &source_file_name_list, name); + } + return source_file_name_list; +} + +// $$Lines + +internal CV_C13LinesHeaderList +cv_c13_lines_from_sub_sections(Arena *arena, String8 c13_data, Rng1U64 ss_range) +{ + ProfBeginFunction(); + + CV_C13LinesHeaderList parsed_line_list = {0}; + + String8 sub_sect_data = str8_substr(c13_data, ss_range); + + for (U64 cursor = 0; cursor + sizeof(CV_C13SubSecLinesHeader) <= sub_sect_data.size; ) { + CV_C13SubSecLinesHeader *hdr = (CV_C13SubSecLinesHeader *)(sub_sect_data.str + cursor); + cursor += sizeof(*hdr); + + // read files + for (; cursor + sizeof(CV_C13File) <= sub_sect_data.size; ) { + // grab next file header + CV_C13File *file = (CV_C13File *)(sub_sect_data.str + cursor); + cursor += sizeof(CV_C13File); + + // parse lines and columns + // + // TODO: export columns + U64 max_line_count = (sub_sect_data.size - cursor) / sizeof(CV_C13Line); + U32 line_count = Min(file->num_lines, max_line_count); + + // TODO(allen): check order correctness here + + U64 line_array_off = cursor; + //U64 col_array_off = line_array_off + line_count * sizeof(CV_C13Line); + + // compute line entry size + U64 line_entry_size = sizeof(CV_C13Line); + if (hdr->flags & CV_C13SubSecLinesFlag_HasColumns) { + line_entry_size += sizeof(CV_C13Column); + } + + // advance past line and column entries + cursor += line_count * line_entry_size; + + // emit parsed lines + CV_C13LinesHeaderNode *lines_parsed_node = push_array_no_zero(arena, CV_C13LinesHeaderNode, 1); + lines_parsed_node->next = 0; + + CV_C13LinesHeader *lines_parsed = &lines_parsed_node->v; + lines_parsed->sec_idx = hdr->sec; + lines_parsed->sec_off_lo = hdr->sec_off; + lines_parsed->sec_off_hi = hdr->sec_off + hdr->len; + lines_parsed->file_off = file->file_off; + lines_parsed->line_count = line_count; + lines_parsed->col_count = 0; // TODO: columns + lines_parsed->line_array_off = ss_range.min + line_array_off; + lines_parsed->col_array_off = 0; // TODO: columns + + 
SLLQueuePush(parsed_line_list.first, parsed_line_list.last, lines_parsed_node);
+      parsed_line_list.count += 1;
+    }
+  }
+
+  ProfEnd();
+  return parsed_line_list;
+}
+
+// Expands one parsed lines header into a CV_LineArray allocated on `arena`.
+// voffs[i] = sec_base + sec_off_lo + line.off, line_nums[i] is extracted from
+// the line flags, and voffs[line_count] is the range ender
+// (sec_base + sec_off_hi). Column numbers are not produced (col_nums = 0).
+// NOTE(review): raw_lines is dereferenced without a null check -- confirm
+// str8_deserial_get_raw_ptr cannot fail for offsets produced by the parser.
+internal CV_LineArray
+cv_c13_line_array_from_data(Arena *arena, String8 c13_data, U64 sec_base, CV_C13LinesHeader parsed_lines)
+{
+  CV_LineArray result;
+  result.file_off = parsed_lines.file_off;
+  result.line_count = parsed_lines.line_count;
+  result.col_count = parsed_lines.col_count;
+  result.voffs = push_array_no_zero(arena, U64, parsed_lines.line_count + 1);
+  result.line_nums = push_array_no_zero(arena, U32, parsed_lines.line_count);
+  result.col_nums = 0;
+
+  CV_C13Line *raw_lines = (CV_C13Line *)str8_deserial_get_raw_ptr(c13_data, parsed_lines.line_array_off, parsed_lines.line_count * sizeof(raw_lines[0]));
+
+  for(U64 line_idx = 0; line_idx < parsed_lines.line_count; line_idx += 1)
+  {
+    CV_C13Line line = raw_lines[line_idx];
+    result.voffs[line_idx] = sec_base + parsed_lines.sec_off_lo + line.off;
+    result.line_nums[line_idx] = CV_C13LineFlags_ExtractLineNumber(line.flags);
+  }
+
+  // emit voff ender
+  result.voffs[result.line_count] = sec_base + parsed_lines.sec_off_hi;
+
+  return result;
+}
+
+// Adds `checksum_rebase` to the file_off field of the CV_C13File record that
+// directly follows the CV_C13SubSecLinesHeader in every node of `line_data`.
+// The data is patched in place. Nodes too small to hold both records are
+// reported through Assert and skipped.
+// NOTE(review): only the first file block of each node is patched, while
+// cv_c13_lines_from_sub_sections accepts several file blocks per sub-section --
+// confirm callers never pass multi-block sub-sections here.
+internal void
+cv_c13_patch_checksum_offsets_in_line_data_list(String8List line_data, U64 checksum_rebase)
+{
+  for(String8Node *node = line_data.first; node != 0; node = node->next)
+  {
+    String8 raw_data = node->string;
+    // the file record is read and written below, so it must fit as well as the header
+    if(raw_data.size < sizeof(CV_C13SubSecLinesHeader) + sizeof(CV_C13File))
+    {
+      Assert(!"unable to patch checksum in line sub seciton header");
+      continue;
+    }
+    CV_C13File *file_header = (CV_C13File *)(raw_data.str + sizeof(CV_C13SubSecLinesHeader));
+    U64 rebased_file_off = file_header->file_off + checksum_rebase;
+    file_header->file_off = safe_cast_u32(rebased_file_off);
+  }
+}
+
+// $$InlineeLines
+
+internal CV_C13InlineeLinesParsedList
+cv_c13_inlinee_lines_from_sub_sections(Arena *arena, String8List raw_inlinee_lines)
+{
+  ProfBeginFunction();
+
+  CV_C13InlineeLinesParsedList 
inlinee_lines_list = {0}; + + for (String8Node *raw_data_node = raw_inlinee_lines.first; raw_data_node != 0; raw_data_node = raw_data_node->next) { + U64 cursor = 0; + + CV_C13InlineeLinesSig sig = 0; + cursor += str8_deserial_read_struct(raw_data_node->string, cursor, &sig); + + for (; cursor + sizeof(CV_C13InlineeSourceLineHeader) <= raw_data_node->string.size; ) { + CV_C13InlineeSourceLineHeader *hdr = (CV_C13InlineeSourceLineHeader *)(raw_data_node->string.str + cursor); + cursor += sizeof(*hdr); + + CV_C13InlineeLinesParsedNode *inlinee_parsed_node = push_array_no_zero(arena, CV_C13InlineeLinesParsedNode, 1); + inlinee_parsed_node->next = 0; + SLLQueuePush(inlinee_lines_list.first, inlinee_lines_list.last, inlinee_parsed_node); + inlinee_lines_list.count += 1; + + CV_C13InlineeLinesParsed *inlinee_parsed = &inlinee_parsed_node->v; + inlinee_parsed->inlinee = hdr->inlinee; + inlinee_parsed->file_off = hdr->file_off; + inlinee_parsed->first_source_ln = hdr->first_source_ln; + inlinee_parsed->extra_file_count = 0; + inlinee_parsed->extra_files = 0; + + if (sig == CV_C13InlineeLinesSig_EXTRA_FILES) { + if (cursor + sizeof(U32) <= raw_data_node->string.size) { + U32 *extra_file_count_ptr = (U32 *)(raw_data_node->string.str + cursor); + cursor += sizeof(*extra_file_count_ptr); + + U32 max_extra_file_count = (raw_data_node->string.size - cursor) / sizeof(U32); + U32 extra_file_count = Min(*extra_file_count_ptr, max_extra_file_count); + U32 *extra_files = (U32 *)(raw_data_node->string.str + cursor); + cursor += sizeof(*extra_files) * extra_file_count; + + inlinee_parsed->extra_file_count = extra_file_count; + inlinee_parsed->extra_files = extra_files; + } + } + } + } + + ProfEnd(); + return inlinee_lines_list; +} + +// $$FrameData + +internal void +cv_c13_patch_checksum_offsets_in_frame_data_list(String8List frame_data, U32 checksum_rebase) +{ + for(String8Node *node = frame_data.first; node != 0; node = node->next) + { + String8 raw_data = node->string; + U64 count = 
raw_data.size / sizeof(CV_C13FrameData);
+    CV_C13FrameData *arr = (CV_C13FrameData *)raw_data.str;
+    CV_C13FrameData *ptr = arr;
+    CV_C13FrameData *opl = arr + count;
+    for(; ptr < opl; ptr += 1)
+    {
+      // checksum_rebase is U32: widen first so the sum is formed in 64 bits
+      // instead of possibly wrapping before the narrowing cast
+      U64 rebased_frame_func = (U64)ptr->frame_func + checksum_rebase;
+      ptr->frame_func = safe_cast_u32(rebased_frame_func);
+    }
+  }
+}
+
+////////////////////////////////
+// $$Lines Accel
+
+// qsort comparator: orders CV_Line entries by ascending voff.
+int
+cv_c13_voff_map_compar(const void *raw_a, const void *raw_b)
+{
+  CV_Line *a = (CV_Line*)raw_a;
+  CV_Line *b = (CV_Line*)raw_b;
+  int cmp = a->voff < b->voff ? -1 :
+            a->voff > b->voff ? +1 :
+            0;
+  return cmp;
+}
+
+// Flattens `lines_count` line arrays into one CV_Line map sorted by voff.
+// Every non-empty array contributes one entry per line plus a terminator
+// entry (line_num = 0) at voffs[line_count]. Arrays with line_count == 0
+// contribute nothing -- cv_c13_parse_inline_binary_annots leaves their voffs
+// and line_nums unset, so they must not be read or counted.
+// Returns an accel allocated on `arena`; col_num is always 0 (TODO: columns).
+internal CV_LinesAccel *
+cv_c13_make_lines_accel(Arena *arena, U64 lines_count, CV_LineArray *lines)
+{
+  ProfBeginFunction();
+
+  // count map entries; must mirror the fill loop below exactly, otherwise the
+  // tail of the no-zero allocation would be sorted uninitialized
+  U64 total_voff_count = 0;
+  for(U64 arr_idx = 0; arr_idx < lines_count; arr_idx += 1) {
+    if (lines[arr_idx].line_count > 0) {
+      total_voff_count += lines[arr_idx].line_count + 1;
+    }
+  }
+
+  CV_Line *map = push_array_no_zero(arena, CV_Line, total_voff_count);
+  U64 map_idx = 0;
+
+  for(U64 line_idx = 0; line_idx < lines_count; line_idx += 1) {
+    CV_LineArray *l = lines + line_idx;
+    if (l->line_count > 0) {
+      for(U64 voff_idx = 0; voff_idx < l->line_count; voff_idx += 1) {
+        map[map_idx].voff = l->voffs[voff_idx];
+        map[map_idx].file_off = l->file_off;
+        map[map_idx].line_num = l->line_nums[voff_idx];
+        map[map_idx].col_num = 0; // TODO: columns
+        map_idx += 1;
+      }
+
+      // terminator entry marking the end of the last line's range
+      map[map_idx].voff = l->voffs[l->line_count];
+      map[map_idx].file_off = l->file_off;
+      map[map_idx].line_num = 0;
+      map[map_idx].col_num = 0;
+      map_idx += 1;
+    }
+  }
+  Assert(map_idx == total_voff_count);
+
+  qsort(map, total_voff_count, sizeof(map[0]), cv_c13_voff_map_compar);
+
+  CV_LinesAccel *accel = push_array(arena, CV_LinesAccel, 1);
+  accel->map_count = total_voff_count;
+  accel->map = map;
+
+  ProfEnd();
+  return accel;
+}
+
+#if 0
+internal CV_Line *
+cv_line_from_voff(CV_LinesAccel *accel, U64 voff, U64 *out_line_count)
+{
+  ProfBeginFunction();
+
+  U64 voff_line_count = 0;
+  CV_Line *lines = 0;
+
+  U64 
map_idx = bsearch_nearest_u64(accel->map, accel->map_count, voff, sizeof(accel->map[0]), OffsetOf(CV_Line, voff));
+  if(map_idx < accel->map_count) {
+    U64 near_voff = accel->map[map_idx].voff;
+
+    for (; map_idx > 0; map_idx -= 1) {
+      if(accel->map[map_idx - 1].voff != near_voff) {
+        break;
+      }
+    }
+
+    lines = accel->map + map_idx;
+
+    for(; map_idx < (accel->map_count-1); map_idx += 1) {
+      if(accel->map[map_idx].voff != near_voff) {
+        break;
+      }
+      voff_line_count += 1;
+    }
+  }
+
+  *out_line_count = voff_line_count;
+
+  ProfEnd();
+  return lines;
+}
+#endif
+
+////////////////////////////////
+// $$InlineeLines Accel
+
+// Hashes `size` bytes at `buffer` with XXH3_64bits.
+internal U64
+cv_c13_inlinee_lines_accel_hash(void *buffer, U64 size)
+{
+  XXH64_hash_t hash64 = XXH3_64bits(buffer, size);
+  return hash64;
+}
+
+// Inserts `parsed` into the open-addressing table, keyed by parsed->inlinee,
+// using linear probing from hash % bucket_max. The pointer is stored, not
+// copied. Duplicate keys are not detected.
+// Returns 1 when stored, 0 when the table has no capacity or no free bucket.
+internal B32
+cv_c13_inlinee_lines_accel_push(CV_InlineeLinesAccel *accel, CV_C13InlineeLinesParsed *parsed)
+{
+  // a table built from an empty list has zero buckets; bail out before `% 0`
+  if(accel->bucket_max == 0) {
+    return 0;
+  }
+
+  U64 load_factor = accel->bucket_max * 2/3 + 1;
+  if(accel->bucket_count > load_factor) {
+    // negate the literal: a bare string is a non-null pointer and never trips
+    Assert(!"TODO: increase max count and rehash buckets");
+  }
+
+  B32 is_pushed = 0;
+
+  U64 hash = cv_c13_inlinee_lines_accel_hash(&parsed->inlinee, sizeof(parsed->inlinee));
+  U64 best_idx = hash % accel->bucket_max;
+  U64 idx = best_idx;
+
+  do {
+    if(accel->buckets[idx] == 0) {
+      accel->buckets[idx] = parsed;
+      accel->bucket_count += 1;
+      is_pushed = 1;
+      break;
+    }
+
+    idx = (idx + 1) % accel->bucket_max;
+  } while(idx != best_idx);
+
+  return is_pushed;
+}
+
+// Looks up the entry whose inlinee id equals `inlinee`, probing linearly from
+// hash % bucket_max. Returns the stored pointer, or 0 when there is no match
+// or the table has no buckets.
+internal CV_C13InlineeLinesParsed *
+cv_c13_inlinee_lines_accel_find(CV_InlineeLinesAccel *accel, CV_ItemId inlinee)
+{
+  CV_C13InlineeLinesParsed *match = 0;
+
+  // empty table: nothing to find, and `% 0` must be avoided
+  if(accel->bucket_max == 0) {
+    return match;
+  }
+
+  U64 hash = cv_c13_inlinee_lines_accel_hash(&inlinee, sizeof(inlinee));
+  U64 best_idx = hash % accel->bucket_max;
+  U64 idx = best_idx;
+
+  do {
+    CV_C13InlineeLinesParsed *bucket = accel->buckets[idx];
+    if(bucket == 0) {
+      // push fills the first free slot of the probe sequence, so an empty
+      // slot means the key was never inserted
+      // NOTE(review): relies on buckets never being cleared after insertion;
+      // no removal routine is visible for this table -- confirm
+      break;
+    }
+    if(bucket->inlinee == inlinee) {
+      match = bucket;
+      break;
+    }
+
+    idx = (idx + 1) % accel->bucket_max;
+  } while(idx != best_idx);
+
+  return match;
+}
+
+internal 
CV_InlineeLinesAccel * +cv_c13_make_inlinee_lines_accel(Arena *arena, CV_C13InlineeLinesParsedList inlinee_lines) +{ + ProfBeginFunction(); + + // alloc hash table + CV_InlineeLinesAccel *accel = push_array(arena, CV_InlineeLinesAccel, 1); + accel->bucket_count = 0; + accel->bucket_max = (U64)((F64)inlinee_lines.count * 2.5); + accel->buckets = push_array(arena, CV_C13InlineeLinesParsed *, accel->bucket_max); + + // push parsed inlinees + for(CV_C13InlineeLinesParsedNode *inlinee = inlinee_lines.first; inlinee != 0; inlinee = inlinee->next) { + cv_c13_inlinee_lines_accel_push(accel, &inlinee->v); + } + + ProfEnd(); + return accel; +} + +//////////////////////////////// + +internal S32 +cv_inline_annot_convert_to_signed_operand(U32 value) +{ + if (value & 1) { + value = -(value >> 1); + } else { + value = value >> 1; + } + S32 result = (S32)value; + return result; +} + +internal CV_InlineBinaryAnnotsParsed +cv_c13_parse_inline_binary_annots(Arena *arena, + U64 parent_voff, + CV_C13InlineeLinesParsed *inlinee_parsed, + String8 binary_annots) +{ + Temp scratch = scratch_begin(&arena, 1); + + struct CodeRange { + struct CodeRange *next; + Rng1U64 range; + }; + struct SourceLine { + struct SourceLine *next; + U64 voff; + U64 length; + U64 ln; + U64 cn; + CV_InlineRangeKind kind; + }; + struct SourceFile { + struct SourceFile *next; + struct SourceLine *line_first; + struct SourceLine *line_last; + U64 line_count; + U64 checksum_off; + Rng1U64 last_code_range; + }; + + Rng1U64List code_ranges = {0}; + struct SourceFile *file_first = 0; + struct SourceFile *file_last = 0; + U64 file_count = 0; + + CV_InlineRangeKind range_kind = 0; (void)range_kind; + U32 code_length = 0; + U32 code_offset = 0; + U32 file_off = inlinee_parsed->file_off; + S32 ln = (S32)inlinee_parsed->first_source_ln; + S32 cn = 1; + U64 code_offset_lo = 0; + B32 code_offset_changed = 0; + B32 code_offset_lo_changed = 0; + B32 code_length_changed = 0; + B32 ln_changed = 1; + B32 file_off_changed = 0; + + 
for (U64 cursor = 0, keep_running = 1; cursor < binary_annots.size && keep_running; ) { + U32 op = CV_InlineBinaryAnnotation_Null; + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &op); + + switch (op) { + case CV_InlineBinaryAnnotation_Null: { + keep_running = 0; + + // this is last run, append range with left over code bytes + code_length = code_offset - code_offset_lo; + code_length_changed = 1; + }break; + case CV_InlineBinaryAnnotation_CodeOffset: { + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &code_offset); + code_offset_changed = 1; + }break; + case CV_InlineBinaryAnnotation_ChangeCodeOffsetBase: { + AssertAlways(!"TODO: test case"); + // U32 delta = 0; + // cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &delta); + // code_offset_base = code_offset; + // code_offset_end = code_offset + delta; + // code_offset += delta; + }break; + case CV_InlineBinaryAnnotation_ChangeCodeOffset: { + U32 delta = 0; + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &delta); + + code_offset += delta; + + if (!code_offset_lo_changed) { + code_offset_lo = code_offset; + code_offset_lo_changed = 1; + } + code_offset_changed = 1; + }break; + case CV_InlineBinaryAnnotation_ChangeCodeLength: { + code_length = 0; + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &code_length); + code_length_changed = 1; + }break; + case CV_InlineBinaryAnnotation_ChangeFile: { + U32 old_file_off = file_off; + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &file_off); + file_off_changed = old_file_off != file_off; + // Compiler isn't obligated to terminate code sequence before chaning files, + // so we have to always force emit code range on file change. 
+ code_length_changed = file_off_changed; + }break; + case CV_InlineBinaryAnnotation_ChangeLineOffset: { + S32 delta = 0; + cursor += cv_decode_inline_annot_s32(binary_annots, cursor, &delta); + + ln += delta; + ln_changed = 1; + }break; + case CV_InlineBinaryAnnotation_ChangeLineEndDelta: { + AssertAlways(!"TODO: test case"); + // S32 end_delta = 1; + // cursor += cv_decode_inline_annot_s32(binary_annots, cursor, &end_delta); + // ln += end_delta; + }break; + case CV_InlineBinaryAnnotation_ChangeRangeKind: { + AssertAlways(!"TODO: test case"); + // cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &range_kind); + }break; + case CV_InlineBinaryAnnotation_ChangeColumnStart: { + AssertAlways(!"TODO: test case"); + // S32 delta; + // cursor += cv_decode_inline_annot_s32(binary_annots, cursor, &delta); + // cn += delta; + }break; + case CV_InlineBinaryAnnotation_ChangeColumnEndDelta: { + AssertAlways(!"TODO: test case"); + // S32 end_delta; + // cursor += cv_decode_inline_annot_s32(binary_annots, cursor, &end_delta); + // cn += end_delta; + }break; + case CV_InlineBinaryAnnotation_ChangeCodeOffsetAndLineOffset: { + U32 code_offset_and_line_offset = 0; + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &code_offset_and_line_offset); + + S32 line_delta = cv_inline_annot_convert_to_signed_operand(code_offset_and_line_offset >> 4); + U32 code_delta = (code_offset_and_line_offset & 0xf); + + code_offset += code_delta; + ln += line_delta; + + if (!code_offset_lo_changed) { + code_offset_lo = code_offset; + code_offset_lo_changed = 1; + } + + code_offset_changed = 1; + ln_changed = 1; + }break; + case CV_InlineBinaryAnnotation_ChangeCodeLengthAndCodeOffset: { + U32 offset_delta = 0; + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &code_length); + cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &offset_delta); + + code_offset += offset_delta; + + if (!code_offset_lo_changed) { + code_offset_lo = code_offset; + 
code_offset_lo_changed = 1; + } + + code_offset_changed = 1; + code_length_changed = 1; + }break; + case CV_InlineBinaryAnnotation_ChangeColumnEnd: { + AssertAlways(!"TODO: test case"); + // U32 column_end = 0; + // cursor += cv_decode_inline_annot_u32(binary_annots, cursor, &column_end); + }break; + } + + U64 line_code_offset = code_offset; + + if (code_length_changed) { + // compute upper bound of the range + U64 code_offset_hi = code_offset + code_length; + + // can last code range be extended to cover current sequence too? + if (code_ranges.last != 0 && code_ranges.last->v.max == parent_voff + code_offset_lo) { + code_ranges.last->v.max = parent_voff + code_offset_hi; + } else { + // append range + rng_1u64_list_push(arena, &code_ranges, rng_1u64(parent_voff + code_offset_lo, parent_voff + code_offset_hi)); + + // update last code range in file + if (file_last) { + file_last->last_code_range = code_ranges.last->v; + } + } + + // update low offset for next range + code_offset_lo = code_offset_hi; + + // advance code offset + code_offset += code_length; + + // reset state + code_offset_lo_changed = 0; + code_length_changed = 0; + code_length = 0; + } + + if (file_off_changed || (file_first == 0)) { + // append file + struct SourceFile *file = push_array(scratch.arena, struct SourceFile, 1); + file->checksum_off = file_off; + SLLQueuePush(file_first, file_last, file); + ++file_count; + + // update last code range in file + if (code_ranges.last) { + file->last_code_range = code_ranges.last->v; + } + + // reset state + file_off_changed = 0; + } + + if (code_offset_changed && ln_changed) { + if (file_last->line_last == 0 || file_last->line_last->ln != (U64)ln) { + // append line + struct SourceLine *line = push_array(scratch.arena, struct SourceLine, 1); + line->voff = parent_voff + line_code_offset; + line->ln = (U64)ln; + line->cn = (U64)cn; + SLLQueuePush(file_last->line_first, file_last->line_last, line); + ++file_last->line_count; + } + + // reset state + 
code_offset_changed = 0; + ln_changed = 0; + } + } + + CV_LineArray *lines = push_array(arena, CV_LineArray, file_count); + { + U64 lines_idx = 0; + for (struct SourceFile *file = file_first; file != 0; file = file->next, lines_idx += 1) { + CV_LineArray *l = lines + lines_idx; + + l->file_off = file->checksum_off; + l->line_count = file->line_count; + l->col_count = 0; + + if (file->line_count > 0) { + l->voffs = push_array_no_zero(arena, U64, file->line_count + 1); + l->line_nums = push_array_no_zero(arena, U32, file->line_count); + l->col_nums = 0; // TODO: column info + + U64 line_idx = 0; + for (struct SourceLine *line = file->line_first; line != NULL; line = line->next, ++line_idx) { + // emit line voff and line number + l->voffs[line_idx] = line->voff; + l->line_nums[line_idx] = (U32)line->ln; + } + Assert(line_idx == file->line_count); + l->voffs[line_idx] = file->last_code_range.max; + } + } + } + + // fill out result + CV_InlineBinaryAnnotsParsed result = {0}; + result.lines_count = file_count; + result.lines = lines; + result.code_ranges = code_ranges; + + scratch_end(scratch); + return result; +} + +//////////////////////////////// + +internal CV_EncodedFramePtrReg +cv_pick_fp_encoding(CV_SymFrameproc *frameproc, B32 is_local_param) +{ + CV_EncodedFramePtrReg fp_reg = 0; + if (is_local_param) { + fp_reg = CV_FrameprocFlags_ExtractParamBasePointer(frameproc->flags); + } else { + fp_reg = CV_FrameprocFlags_ExtractLocalBasePointer(frameproc->flags); + } + return fp_reg; +} + +internal CV_Reg +cv_decode_fp_reg(CV_Arch arch, CV_EncodedFramePtrReg encoded_reg) +{ + CV_Reg fp_reg = 0; + switch (arch) { + case CV_Arch_8086: { + switch (encoded_reg) { + case CV_EncodedFramePtrReg_None : break; + case CV_EncodedFramePtrReg_StackPtr: AssertAlways(!"TODO(nick): not tested, this is a guess"); + fp_reg = CV_Regx86_ESP; break; + case CV_EncodedFramePtrReg_FramePtr: fp_reg = CV_Regx86_EBP; break; + case CV_EncodedFramePtrReg_BasePtr : fp_reg = CV_Regx86_EBX; break; + 
default: InvalidPath;
+      }
+    } break;
+    case CV_Arch_X64: {
+      switch (encoded_reg) {
+        case CV_EncodedFramePtrReg_None    : break;
+        case CV_EncodedFramePtrReg_StackPtr: fp_reg = CV_Regx64_RSP; break;
+        case CV_EncodedFramePtrReg_FramePtr: fp_reg = CV_Regx64_RBP; break;
+        case CV_EncodedFramePtrReg_BasePtr : fp_reg = CV_Regx64_R13; break;
+        default: InvalidPath;
+      }
+    } break;
+    default: NotImplemented;
+  }
+  return fp_reg;
+}
+
+// Splits `defrange` into the sub-ranges that lie outside `gaps`.
+// Gap offsets are relative to defrange.min. One range is pushed in front of
+// every gap (possibly empty when a gap starts where the previous one ended),
+// followed by the remainder after the last gap when it is non-empty.
+// With gap_count == 0 the whole `defrange` is pushed. Nodes live on `arena`.
+// NOTE(review): gaps are presumably sorted by offset and non-overlapping --
+// confirm against producers of CV_LvarAddrGap.
+internal Rng1U64List
+cv_make_defined_range_list_from_gaps(Arena *arena, Rng1U64 defrange, CV_LvarAddrGap *gaps, U64 gap_count)
+{
+  Rng1U64List result = {0};
+
+  if (gap_count == 0) {
+    // no gaps, push whole range
+    rng_1u64_list_push(arena, &result, defrange);
+  } else {
+    U64 cursor = defrange.min;
+    for (U64 gap_idx = 0; gap_idx < gap_count; ++gap_idx) {
+      // gap bounds are measured from the start of the defined range, not from
+      // the end of the previous gap
+      U64 gap_lo = defrange.min + gaps[gap_idx].off;
+      U64 gap_hi = gap_lo + gaps[gap_idx].len;
+
+      // make range covering the bytes between previous gap and this one
+      Rng1U64 range = rng_1u64(cursor, gap_lo);
+      rng_1u64_list_push(arena, &result, range);
+
+      // advance
+      cursor = gap_hi;
+    }
+
+    // emit range past last gap; comparing positions avoids the unsigned
+    // underflow a size subtraction hits when the gap runs past the range
+    if (cursor < defrange.max) {
+      Rng1U64 last_range = rng_1u64(cursor, defrange.max);
+      rng_1u64_list_push(arena, &result, last_range);
+    }
+  }
+
+  return result;
+}
+
+////////////////////////////////
+
+// Returns BYTE_SIZE from the CV_Reg_X86_XList entry matching `reg`, 0 when unlisted.
+internal U64
+cv_size_from_reg_x86(CV_Reg reg)
+{
+  switch (reg) {
+#define X(NAME, CODE, RDI_NAME, BYTE_POS, BYTE_SIZE) case CV_Regx86_##NAME: return BYTE_SIZE;
+    CV_Reg_X86_XList(X)
+#undef X
+  }
+  return 0;
+}
+
+// Returns BYTE_SIZE from the CV_Reg_X64_XList entry matching `reg`, 0 when unlisted.
+internal U64
+cv_size_from_reg_x64(CV_Reg reg)
+{
+  switch (reg) {
+#define X(NAME, CODE, RDI_NAME, BYTE_POS, BYTE_SIZE) case CV_Regx64_##NAME: return BYTE_SIZE;
+    CV_Reg_X64_XList(X)
+#undef X
+  }
+  return 0;
+}
+
+internal U64
+cv_size_from_reg(CV_Arch arch, CV_Reg reg)
+{
+  switch (arch) {
+    case CV_Arch_8086: return cv_size_from_reg_x86(reg);
+ 
case CV_Arch_X64 : return cv_size_from_reg_x64(reg); + } + return 0; +} + +//////////////////////////////// + +internal CV_Arch +cv_arch_from_coff_machine(COFF_MachineType machine) +{ + CV_Arch arch = 0; + switch (machine) { + case COFF_MachineType_X64: arch = CV_Arch_X64; break; + case COFF_MachineType_X86: arch = CV_Arch_8086; break; + case COFF_MachineType_AM33: arch = CV_Arch_AM33; break; + case COFF_MachineType_ARM: NotImplemented; break; + case COFF_MachineType_ARM64: arch = CV_Arch_ARM64; break; + case COFF_MachineType_ARMNT: arch = CV_Arch_ARMNT; break; + case COFF_MachineType_EBC: arch = CV_Arch_EBC; break; + case COFF_MachineType_IA64: arch = CV_Arch_IA64; break; + case COFF_MachineType_M32R: arch = CV_Arch_M32R; break; + case COFF_MachineType_MIPS16: arch = CV_Arch_MIPS16; break; + case COFF_MachineType_MIPSFPU: NotImplemented; break; + case COFF_MachineType_MIPSFPU16: NotImplemented; break; + case COFF_MachineType_POWERPC: NotImplemented; break; + case COFF_MachineType_POWERPCFP: arch = CV_Arch_PPCFP; break; + case COFF_MachineType_R4000: NotImplemented; break; + case COFF_MachineType_RISCV32: NotImplemented; break; + case COFF_MachineType_RISCV64: NotImplemented; break; + case COFF_MachineType_RISCV128: NotImplemented; break; + case COFF_MachineType_SH3: arch = CV_Arch_SH3; break; + case COFF_MachineType_SH3DSP: arch = CV_Arch_SH3DSP; break; + case COFF_MachineType_SH4: arch = CV_Arch_SH4; break; + case COFF_MachineType_SH5: NotImplemented; break; + case COFF_MachineType_THUMB: arch = CV_Arch_THUMB; break; + case COFF_MachineType_WCEMIPSV2: NotImplemented; break; + } + return arch; +} + +internal String8 +cv_string_from_type_index_source(CV_TypeIndexSource ti_source) +{ + switch (ti_source) { + case CV_TypeIndexSource_NULL: return str8_lit(""); break; + case CV_TypeIndexSource_TPI: return str8_lit("TPI"); break; + case CV_TypeIndexSource_IPI: return str8_lit("IPI"); break; + case CV_TypeIndexSource_COUNT: break; + } + return str8_zero(); +} + diff 
--git a/src/linker/codeview_ext/codeview.h b/src/linker/codeview_ext/codeview.h new file mode 100644 index 00000000..1dfa42be --- /dev/null +++ b/src/linker/codeview_ext/codeview.h @@ -0,0 +1,571 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +#define CV_MinComplexTypeIndex 0x1000 + +//////////////////////////////// +// Aligns + +#define CV_LeafAlign 4 +#define CV_SymbolAlign 1 +#define CV_C13SubSectionAlign 4 +#define CV_FileCheckSumsAlign 4 + +//////////////////////////////// + +//- Symbol and Leaf Headers + +#define CV_LeafSize_Max max_U16 +typedef U16 CV_LeafSize; + +#define CV_SymSize_Max max_U16 +typedef U16 CV_SymSize; + +typedef struct CV_LeafHeader +{ + CV_LeafSize size; + CV_LeafKind kind; +} CV_LeafHeader; + +typedef struct CV_SymbolHeader +{ + CV_SymSize size; + CV_SymKind kind; +} CV_SymbolHeader; + +//////////////////////////////// +// Type Index Helpers + +typedef enum CV_TypeIndexSource +{ + CV_TypeIndexSource_NULL, + CV_TypeIndexSource_TPI, + CV_TypeIndexSource_IPI, + CV_TypeIndexSource_COUNT +} CV_TypeIndexSource; + +typedef struct CV_TypeIndexInfo +{ + struct CV_TypeIndexInfo *next; + U64 offset; + CV_TypeIndexSource source; +} CV_TypeIndexInfo; + +typedef struct CV_TypeIndexInfoList +{ + U64 count; + CV_TypeIndexInfo *first; + CV_TypeIndexInfo *last; +} CV_TypeIndexInfoList; + +typedef struct CV_TypeIndexArray +{ + U32 count; + CV_TypeIndex *v; +} CV_TypeIndexArray; + +//- $$Symbols + +typedef struct CV_Symbol +{ + CV_SymKind kind; + U64 offset; + String8 data; +} CV_Symbol; + +typedef struct CV_SymbolNode +{ + struct CV_SymbolNode *next; + struct CV_SymbolNode *prev; + CV_Symbol data; +} CV_SymbolNode; + +typedef struct CV_SymbolPtrNode +{ + struct CV_SymbolPtrNode *next; + CV_Symbol *data; +} CV_SymbolPtrNode; + +typedef struct CV_SymbolList +{ + U64 count; + CV_Signature signature; + CV_SymbolNode *first; + CV_SymbolNode *last; +} CV_SymbolList; + +typedef 
struct CV_SymbolListArray +{ + U64 count; + CV_SymbolList *v; +} CV_SymbolListArray; + +typedef struct CV_SymbolPtrArray +{ + U64 count; + CV_SymbolNode **v; +} CV_SymbolPtrArray; + +typedef struct CV_Scope +{ + struct CV_ScopeList *children; + struct CV_Scope *next; + struct CV_Scope *prev; + CV_Symbol symbol; +} CV_Scope; + +typedef struct CV_ScopeList +{ + CV_Scope *first; + CV_Scope *last; +} CV_ScopeList; + +typedef struct CV_ScopeFrame +{ + struct CV_ScopeFrame *next; + CV_ScopeList *list; + CV_Scope *curr; + U64 symbol_off; + U32 *parent_off_ptr; + U32 *end_off_ptr; +} CV_ScopeFrame; + +//- $$FileChksms + +typedef struct CV_Checksum +{ + CV_C13Checksum *header; + String8 value; +} CV_Checksum; + +typedef struct CV_ChecksumNode +{ + struct CV_ChecksumNode *next; + CV_Checksum data; +} CV_ChecksumNode; + +typedef struct CV_ChecksumList +{ + U64 count; + CV_ChecksumNode *first; + CV_ChecksumNode *last; +} CV_ChecksumList; + +//- $$Lines + +typedef struct CV_LineArray +{ + U32 file_off; + U64 line_count; + U64 col_count; + U64 *voffs; // [line_count + 1] + U32 *line_nums; // [line_count] + U16 *col_nums; // [line_count * 2] +} CV_LineArray; + +typedef struct CV_File +{ + U32 file_off; + CV_LineArray lines; +} CV_File; + +typedef struct CV_C13LinesHeader +{ + U64 sec_idx; + U64 sec_off_lo; + U64 sec_off_hi; + U64 file_off; + U64 line_count; + U64 col_count; + U64 line_array_off; + U64 col_array_off; +} CV_C13LinesHeader; + +typedef struct CV_C13LinesHeaderNode +{ + struct CV_C13LinesHeaderNode *next; + CV_C13LinesHeader v; +} CV_C13LinesHeaderNode; + +typedef struct CV_C13LinesHeaderList +{ + CV_C13LinesHeaderNode *first; + CV_C13LinesHeaderNode *last; + U64 count; +} CV_C13LinesHeaderList; + +//////////////////////////////// + +typedef struct CV_UDTInfo +{ + String8 name; + String8 unique_name; + CV_TypeProps props; +} CV_UDTInfo; + +typedef struct CV_TypeServerInfo +{ + String8 name; + COFF_Guid sig; + U32 age; +} CV_TypeServerInfo; + +typedef struct 
CV_PrecompInfo +{ + CV_TypeIndex start_index; + U32 sig; + U32 leaf_count; + String8 obj_name; +} CV_PrecompInfo; + +typedef struct CV_ObjInfo +{ + U32 sig; + String8 name; +} CV_ObjInfo; + +//////////////////////////////// +// Accels + +typedef struct CV_Line +{ + U64 voff; + U32 file_off; + U32 line_num; + U16 col_num; +} CV_Line; + +typedef struct CV_LinesAccel +{ + U64 map_count; + CV_Line *map; +} CV_LinesAccel; + +typedef struct CV_InlineeLinesAccel +{ + U64 bucket_count; + U64 bucket_max; + CV_C13InlineeLinesParsed **buckets; +} CV_InlineeLinesAccel; + +typedef struct CV_InlineBinaryAnnotsParsed +{ + U64 lines_count; + CV_LineArray *lines; + Rng1U64List code_ranges; +} CV_InlineBinaryAnnotsParsed; + +typedef struct CV_C13InlineeLinesParsedList +{ + CV_C13InlineeLinesParsedNode *first; + CV_C13InlineeLinesParsedNode *last; + U64 count; +} CV_C13InlineeLinesParsedList; + +//////////////////////////////// + +typedef U32 CV_C13SubSectionIdxKind; +enum +{ + CV_C13SubSectionIdxKind_NULL, +#define X(N,c) CV_C13SubSectionIdxKind_##N, + CV_C13SubSectionKindXList(X) +#undef X + CV_C13SubSectionIdxKind_COUNT +}; + +typedef struct CV_C13SubSectionList +{ + CV_C13SubSectionNode *first; + CV_C13SubSectionNode *last; + U64 count; +} CV_C13SubSectionList; + +//////////////////////////////// + +typedef struct CV_DebugS +{ + String8List data_list[CV_C13SubSectionIdxKind_COUNT]; +} CV_DebugS; + +typedef struct CV_DebugT +{ + U64 size; + U64 count; + U8 **v; +} CV_DebugT; + +//////////////////////////////// +//~ Leaf Helpers + +typedef struct CV_Leaf +{ + CV_LeafKind kind; + String8 data; +} CV_Leaf; + +typedef struct CV_LeafNode +{ + struct CV_LeafNode *next; + CV_Leaf data; +} CV_LeafNode; + +typedef struct CV_LeafList +{ + U64 count; + CV_LeafNode *first; + CV_LeafNode *last; +} CV_LeafList; + +//////////////////////////////// +//~ String Hash Table + +typedef struct CV_StringTableRange +{ + struct CV_StringTableRange *next; + Rng1U64 range; + U64 debug_s_idx; +} 
CV_StringTableRange; + +typedef struct CV_StringBucket +{ + String8 string; + union { + struct { + U32 idx0; + U32 idx1; + }; + U64 offset; + } u; +} CV_StringBucket; + +typedef struct CV_StringHashTable +{ + U64 total_string_size; + U64 total_insert_count; + U64 bucket_cap; + CV_StringBucket **buckets; +} CV_StringHashTable; + +typedef struct CV_StringHashTableResult +{ + U64 string_count; + CV_StringBucket **buckets; +} CV_StringHashTableResult; + +//////////////////////////////// +//~ Task Contexts + +typedef struct +{ + U64 cap; + union { + CV_SymbolNode ***buckets; + CV_SymbolNode **deref_buckets; + } u; + Rng1U64 *ranges; + CV_SymbolNode **symbols; +} CV_SymbolDeduperTask; + +typedef struct +{ + CV_SymbolList *list_arr; + Rng1U64 *list_range_arr; + U64 *symbol_base_arr; + CV_SymbolNode **symbol_arr; +} CV_SymbolListSyncer; + +typedef struct +{ + CV_DebugS *arr; + CV_StringTableRange **range_lists; + U64 *string_counts; + U64 bucket_cap; + CV_StringBucket **buckets; + U64 total_string_size; + U64 total_insert_count; +} CV_DedupStringTablesTask; + +typedef struct +{ + U8 *buffer; + Rng1U64 *ranges; + CV_StringBucket **buckets; +} CV_PackStringHashTableTask; + +typedef struct +{ + CV_DebugT debug_t; + Rng1U64 *ranges; + String8List *lists; + String8Node *nodes; +} CV_Str8ListFromDebugT; + +//////////////////////////////// +// Type Index Helpers + +internal CV_TypeIndexInfo * cv_symbol_type_index_info_push(Arena *arena, CV_TypeIndexInfoList *list, CV_TypeIndexSource source, U64 offset); +internal CV_TypeIndexInfoList cv_get_symbol_type_index_offsets(Arena *arena, CV_SymKind kind, String8 data); +internal CV_TypeIndexInfoList cv_get_leaf_type_index_offsets(Arena *arena, CV_LeafKind leaf_kind, String8 data); +internal CV_TypeIndexInfoList cv_get_inlinee_type_index_offsets(Arena *arena, String8 raw_data); +internal String8Array cv_get_data_around_type_indices(Arena *arena, CV_TypeIndexInfoList ti_list, String8 data); +internal CV_TypeIndexSource 
cv_type_index_source_from_leaf_kind(CV_LeafKind leaf_kind); + +//////////////////////////////// + +internal U64 cv_name_offset_from_symbol(CV_SymKind kind, String8 data); +internal String8 cv_name_from_symbol (CV_SymKind kind, String8 data); +internal String8 cv_name_from_udt_info (CV_UDTInfo udt_info); + +internal B32 cv_is_udt_name_anon (String8 name); +internal B32 cv_is_udt (CV_LeafKind kind); +internal B32 cv_is_global_symbol (CV_SymKind kind); +internal B32 cv_is_typedef (CV_SymKind kind); +internal B32 cv_is_scope_symbol (CV_SymKind kind); +internal B32 cv_is_end_symbol (CV_SymKind kind); +internal B32 cv_is_leaf_type_server(CV_LeafKind kind); +internal B32 cv_is_leaf_pch (CV_LeafKind kind); + +internal CV_ObjInfo cv_obj_info_from_symbol(CV_Symbol symbol); +internal CV_TypeServerInfo cv_type_server_info_from_leaf(CV_Leaf leaf); +internal CV_PrecompInfo cv_precomp_info_from_leaf(CV_Leaf leaf); + +internal B32 cv_is_reg_sp(CV_Arch arch, CV_Reg reg); + +//////////////////////////////// +//~ Leaf Helpers + +internal U64 cv_compute_leaf_record_size(String8 data, U64 align); +internal U64 cv_serialize_leaf_to_buffer(U8 *buffer, U64 buffer_cursor, U64 buffer_size, CV_LeafKind kind, String8 data, U64 align); +internal String8 cv_serialize_leaf_ex(Arena *arena, CV_LeafKind kind, String8 data, U64 align); +internal String8 cv_serialize_leaf(Arena *arena, CV_Leaf *leaf, U64 align); +internal CV_Leaf cv_make_leaf(Arena *arena, CV_LeafKind kind, String8 data); +internal U64 cv_deserial_leaf(String8 raw_data, U64 off, U64 align, CV_Leaf *leaf_out); +internal CV_Leaf cv_leaf_from_string(String8 raw_data); + +//////////////////////////////// +//~ Symbol Helpers + +internal U64 cv_compute_symbol_record_size(CV_Symbol *symbol, U64 align); +internal U64 cv_serialize_symbol_to_buffer(U8 *buffer, U64 buffer_cursor, U64 buffer_size, CV_Symbol *symbol, U64 align); +internal String8 cv_serialize_symbol(Arena *arena, CV_Symbol *symbol, U64 align); + +internal String8 
cv_make_symbol(Arena *arena, CV_SymKind kind, String8 data); +internal String8 cv_make_obj_name(Arena *arena, String8 obj_path, U32 sig); +internal String8 cv_make_comp3(Arena *arena, + CV_Compile3Flags flags, CV_Language lang, CV_Arch arch, + U16 ver_fe_major, U16 ver_fe_minor, U16 ver_fe_build, U16 ver_feqfe, + U16 ver_major, U16 ver_minor, U16 ver_build, U16 ver_qfe, + String8 version_string); +internal String8 cv_make_envblock(Arena *arena, String8List string_list); +internal CV_Symbol cv_make_proc_ref(Arena *arena, CV_ModIndex imod, U32 stream_offset, String8 name, B32 is_local); +internal CV_Symbol cv_make_pub32(Arena *arena, CV_Pub32Flags flags, U32 off, U16 isect, String8 name); +internal CV_SymbolList cv_make_proc_refs(Arena *arena, CV_ModIndex imod, CV_SymbolList symbol_list); + +//////////////////////////////// +// .debug$S Helpers + +internal CV_DebugS cv_parse_debug_s_c13(Arena *arena, String8 raw_debug_s); +internal CV_DebugS cv_parse_debug_s_c13_list(Arena *arena, String8List raw_debug_s); +internal CV_DebugS cv_parse_debug_s(Arena *arena, String8 raw_debug_s); +internal void cv_debug_s_concat_in_place(CV_DebugS *dst, CV_DebugS *src); +internal String8List cv_data_c13_from_debug_s(Arena *arena, CV_DebugS *debug_s, B32 write_sig); + +internal CV_C13SubSectionIdxKind cv_c13_sub_section_idx_from_kind(CV_C13SubSectionKind kind); +internal String8List * cv_sub_section_ptr_from_debug_s(CV_DebugS *debug_s, CV_C13SubSectionKind kind); +internal String8List cv_sub_section_from_debug_s(CV_DebugS debug_s, CV_C13SubSectionKind kind); +internal String8 cv_string_table_from_debug_s(CV_DebugS debug_s); +internal String8 cv_file_chksms_from_debug_s(CV_DebugS debug_s); + +//////////////////////////////// +//~ .debug$T helpers + +internal CV_DebugT cv_debug_t_from_data_arr(Arena *arena, String8Array data_arr, U64 align); +internal CV_DebugT cv_debug_t_from_data(Arena *arena, String8 data, U64 align); +internal CV_Leaf cv_debug_t_get_leaf(CV_DebugT debug_t, U64 
leaf_idx); +internal String8 cv_debug_t_get_raw_leaf(CV_DebugT debug_t, U64 leaf_idx); +internal CV_LeafHeader * cv_debug_t_get_leaf_header(CV_DebugT debug_t, U64 leaf_idx); +internal B32 cv_debug_t_is_pch(CV_DebugT debug_t); +internal B32 cv_debug_t_is_type_server(CV_DebugT debug_t); +internal U64 cv_debug_t_array_count_leaves(U64 count, CV_DebugT *arr); + +internal String8List cv_str8_list_from_debug_t_parallel(TP_Context *tp, Arena *arena, CV_DebugT types); + +// $$Symbols +internal void cv_parse_symbol_sub_section(Arena *arena, CV_SymbolList *list, U64 offset_base, String8 data, U64 align); +internal void cv_symbol_list_push_node(CV_SymbolList *list, CV_SymbolNode *node); +internal CV_SymbolNode * cv_symbol_list_push(Arena *arena, CV_SymbolList *list); +internal CV_SymbolNode * cv_symbol_list_push_data(Arena *arena, CV_SymbolList *list, CV_SymKind kind, String8 data); +internal CV_SymbolNode * cv_symbol_list_push_many(Arena *arena, CV_SymbolList *list, U64 count); +internal void cv_symbol_list_remove_node(CV_SymbolList *list, CV_SymbolNode *node); +internal void cv_symbol_list_concat_in_place(CV_SymbolList *list, CV_SymbolList *to_concat); +internal void cv_symbol_list_concat_in_place_arr(CV_SymbolList *list, U64 count, CV_SymbolList *to_concat); +internal U64 cv_symbol_list_arr_get_count(U64 count, CV_SymbolList *list_arr); +internal String8List cv_data_from_symbol_list(Arena *arena, CV_SymbolList symbol_list, U64 align); +internal CV_SymbolList cv_global_scope_symbols_from_list(Arena *arena, CV_SymbolList list); +internal CV_ScopeList cv_symbol_tree_from_symbol_list(Arena *arena, CV_SymbolList list); +internal CV_SymbolList cv_build_symbol_tree(Arena *arena, CV_ScopeList symbol_tree, U64 symbol_base, U64 align); +internal CV_SymbolPtrArray cv_symbol_ptr_array_from_list(Arena *arena, TP_Context *tp, U64 count, CV_SymbolList *symbol_list_arr); + +// $$FileChksms +#define CV_MAP_STRING_TO_OFFSET_FUNC(name) U64 name(void *ud, String8 string) +typedef 
CV_MAP_STRING_TO_OFFSET_FUNC(CV_MapStringToOffsetFunc); + +//internal String8 cv_c13_file_chksms_from_sub_sections(String8 c13_data, CV_C13Parsed *ss); +internal void cv_c13_patch_string_offsets_in_checksum_list(CV_ChecksumList checksum_list, String8 string_data, U64 string_data_base_offset, CV_StringHashTable string_ht); +internal String8List cv_c13_collect_source_file_names(Arena *arena, CV_ChecksumList checksum_list, String8 string_data); + +// $$Lines +internal CV_C13LinesHeaderList cv_c13_lines_from_sub_sections(Arena *arena, String8 c13_data, Rng1U64 ss_range); +internal CV_LineArray cv_c13_line_array_from_data(Arena *arena, String8 c13_data, U64 sec_base, CV_C13LinesHeader parsed_lines); + +// $$InlineeLines +internal CV_C13InlineeLinesParsedList cv_c13_inlinee_lines_from_sub_sections(Arena *arena, String8List raw_inlinee_lines); +internal CV_InlineBinaryAnnotsParsed cv_c13_parse_inline_binary_annots(Arena *arena, U64 parent_voff, CV_C13InlineeLinesParsed *inlinee_parsed, String8 binary_annots); + +// $$FrameData +internal void cv_c13_patch_checksum_offsets_in_frame_data_list(String8List frame_data, U32 checksum_rebase); + +//////////////////////////////// +// $$Lines Accel + +internal void cv_make_c13_files(Arena *arena, String8 c13_data, CV_C13SubSectionList lines, U64 *file_count_out, CV_C13File **files_out); +internal CV_LinesAccel * cv_make_lines_accel(Arena *arena, U64 lines_count, CV_LineArray *lines); +internal CV_Line * cv_line_from_voff(CV_LinesAccel *accel, U64 voff, U64 *out_line_count); + +//////////////////////////////// +// $$InlineeLines Accel + +internal U64 cv_c13_inlinee_lines_accel_hash(void *buffer, U64 size); +internal B32 cv_c13_inlinee_lines_accel_push(CV_InlineeLinesAccel *accel, CV_C13InlineeLinesParsed *parsed); +internal CV_C13InlineeLinesParsed * cv_c13_inlinee_lines_accel_find(CV_InlineeLinesAccel *accel, CV_ItemId inlinee); +internal CV_InlineeLinesAccel * cv_c13_make_inlinee_lines_accel(Arena *arena, 
CV_C13InlineeLinesParsedList sub_sects); + +//////////////////////////////// +// String Hash Table + +internal U64 cv_string_hash_table_hash(String8 string); +internal CV_StringHashTable cv_dedup_string_tables(TP_Arena *arena, TP_Context *tp, U64 count, CV_DebugS *arr); +internal CV_StringHashTableResult cv_serialize_string_hash_table(Arena *arena, TP_Context *tp, CV_StringHashTable string_ht); +internal String8 cv_pack_string_hash_table(Arena *arena, TP_Context *tp, CV_StringHashTable string_ht); + +//////////////////////////////// + +internal CV_EncodedFramePtrReg cv_pick_fp_encoding(CV_SymFrameproc *frameproc, B32 is_local_param); +internal CV_Reg cv_decode_fp_reg(CV_Arch arch, CV_EncodedFramePtrReg encoded_reg); +internal Rng1U64List cv_make_defined_range_list_from_gaps(Arena *arena, Rng1U64 defrange, CV_LvarAddrGap *gaps, U64 gap_count); + +//////////////////////////////// + +internal U64 cv_size_from_reg_x86(CV_Reg reg); +internal U64 cv_size_from_reg_x64(CV_Reg reg); +internal U64 cv_size_from_reg(CV_Arch arch, CV_Reg reg); + +//////////////////////////////// + +internal CV_Arch cv_arch_from_coff_machine(COFF_MachineType machine); +internal String8 cv_string_from_type_index_source(CV_TypeIndexSource ti_source); + + diff --git a/src/linker/hash_table.c b/src/linker/hash_table.c new file mode 100644 index 00000000..b48c9a10 --- /dev/null +++ b/src/linker/hash_table.c @@ -0,0 +1,270 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal void +bucket_list_concat_in_place(BucketList *list, BucketList *to_concat) +{ + SLLConcatInPlaceNoCount(list, to_concat); +} + +internal BucketNode * +bucket_list_pop(BucketList *list) +{ + BucketNode *result = list->first; + SLLQueuePop(list->first, list->last); + return result; +} + +//////////////////////////////// + +internal U64 +hash_table_hasher(String8 string) +{ + XXH64_hash_t hash64 = XXH3_64bits(string.str, string.size); + return hash64; +} + 
+internal HashTable * +hash_table_init(Arena *arena, U64 cap) +{ + HashTable *ht = push_array(arena, HashTable, 1); + ht->cap = cap; + ht->buckets = push_array(arena, BucketList, cap); + return ht; +} + +internal void +hash_table_purge(HashTable *ht) +{ + // reset key count + ht->count = 0; + + // concat buckets + for (U64 ibucket = 0; ibucket < ht->cap; ++ibucket) { + bucket_list_concat_in_place(&ht->free_buckets, &ht->buckets[ibucket]); + } +} + +internal BucketNode * +hash_table_push(Arena *arena, HashTable *ht, U64 hash, KeyValuePair v) +{ + BucketNode *node; + if (ht->free_buckets.first != 0) { + node = bucket_list_pop(&ht->free_buckets); + } else { + node = push_array(arena, BucketNode, 1); + } + node->next = 0; + node->v = v; + + U64 ibucket = hash % ht->cap; + SLLQueuePush(ht->buckets[ibucket].first, ht->buckets[ibucket].last, node); + ++ht->count; + + return node; +} + +internal BucketNode * +hash_table_push_string_string(Arena *arena, HashTable *ht, String8 key, String8 value) +{ + U64 hash = hash_table_hasher(key); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_string = key, .value_string = value }); +} + +internal BucketNode * +hash_table_push_string_raw(Arena *arena, HashTable *ht, String8 key, void *value) +{ + U64 hash = hash_table_hasher(key); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_string = key, .value_raw = value }); +} + +internal BucketNode * +hash_table_push_string_u64(Arena *arena, HashTable *ht, String8 key, U64 value) +{ + U64 hash = hash_table_hasher(key); + return hash_table_push(arena, ht, hash, (KeyValuePair){.key_string = key, .value_u64 = value }); +} + +internal BucketNode * +hash_table_push_u32_raw(Arena *arena, HashTable *ht, U32 key, void *value) +{ + U64 hash = hash_table_hasher(str8_struct(&key)); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_u32 = key, .value_raw = value }); +} + +internal BucketNode * +hash_table_push_u32_string(Arena *arena, HashTable *ht, U32 key, 
String8 value) +{ + U64 hash = hash_table_hasher(str8_struct(&key)); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_u32 = key, .value_string = value }); +} + +internal BucketNode * +hash_table_push_u64_raw(Arena *arena, HashTable *ht, U64 key, void *value) +{ + U64 hash = hash_table_hasher(str8_struct(&key)); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_u64 = key, .value_raw = value }); +} + +internal BucketNode * +hash_table_push_u64_string(Arena *arena, HashTable *ht, U64 key, String8 value) +{ + U64 hash = hash_table_hasher(str8_struct(&key)); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_u64 = key, .value_string = value }); +} + +internal BucketNode * +hash_table_push_u64_u64(Arena *arena, HashTable *ht, U64 key, U64 value) +{ + U64 hash = hash_table_hasher(str8_struct(&key)); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_u64 = key, .value_u64 = value }); +} + +internal BucketNode * +hash_table_push_path_string(Arena *arena, HashTable *ht, String8 path, String8 value) +{ + String8 path_canon = path_canon_from_regular_path(arena, path); + return hash_table_push_string_string(arena, ht, path_canon, value); +} + +internal BucketNode * +hash_table_push_path_u64(Arena *arena, HashTable *ht, String8 path, U64 value) +{ + String8 path_canon = path_canon_from_regular_path(arena, path); + U64 hash = hash_table_hasher(path_canon); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_string = path_canon, .value_u64 = value }); +} + +internal BucketNode * +hash_table_push_path_raw(Arena *arena, HashTable *ht, String8 path, void *value) +{ + String8 path_canon = path_canon_from_regular_path(arena, path); + U64 hash = hash_table_hasher(path_canon); + return hash_table_push(arena, ht, hash, (KeyValuePair){ .key_string = path_canon, .value_raw = value }); +} + +//////////////////////////////// + +internal KeyValuePair * +hash_table_search_string(HashTable *ht, String8 key_string) +{ + U64 hash = 
hash_table_hasher(key_string); + U64 ibucket = hash % ht->cap; + BucketList *bucket = ht->buckets + ibucket; + for (BucketNode *n = bucket->first; n != 0; n = n->next) { + if (str8_match(n->v.key_string, key_string, 0)) { + return &n->v; + } + } + return 0; +} + +internal KeyValuePair * +hash_table_search_u32(HashTable *ht, U32 key_u32) +{ + U64 hash = hash_table_hasher(str8_struct(&key_u32)); + U64 ibucket = hash % ht->cap; + BucketList *bucket = ht->buckets + ibucket; + for (BucketNode *n = bucket->first; n != 0; n = n->next) { + if (n->v.key_u32 == key_u32) { + return &n->v; + } + } + return 0; +} + +internal KeyValuePair * +hash_table_search_u64(HashTable *ht, U64 key_u64) +{ + U64 hash = hash_table_hasher(str8_struct(&key_u64)); + U64 ibucket = hash % ht->cap; + BucketList *bucket = ht->buckets + ibucket; + for (BucketNode *n = bucket->first; n != 0; n = n->next) { + if (n->v.key_u64 == key_u64) { + return &n->v; + } + } + return 0; +} + +internal KeyValuePair * +hash_table_search_path(HashTable *ht, String8 path) +{ + Temp scratch = scratch_begin(0,0); + String8 path_canon = path; + path_canon = lower_from_str8(scratch.arena, path_canon); + path_canon = path_convert_slashes(scratch.arena, path_canon, PathStyle_UnixAbsolute); + KeyValuePair *result = hash_table_search_string(ht, path_canon); + scratch_end(scratch); + return result; +} + +internal B32 +hash_table_search_path_u64(HashTable *ht, String8 key, U64 *value_out) +{ + KeyValuePair *result = hash_table_search_path(ht, key); + if (result != 0) { + if (value_out != 0) { + *value_out = result->value_u64; + } + return 1; + } + return 0; +} + +internal B32 +hash_table_search_string_u64(HashTable *ht, String8 key, U64 *value_out) +{ + KeyValuePair *result = hash_table_search_string(ht, key); + if (result != 0) { + if (value_out != 0) { + *value_out = result->value_u64; + } + return 1; + } + return 0; +} + +//////////////////////////////// + +internal int +key_value_pair_is_before_u32(void *raw_a, void *raw_b) 
+{ + KeyValuePair *a = raw_a; + KeyValuePair *b = raw_b; + return a->key_u32 < b->key_u32; +} + +internal int +key_value_pair_is_before_u64(void *raw_a, void *raw_b) +{ + KeyValuePair *a = raw_a; + KeyValuePair *b = raw_b; + return a->key_u64 < b->key_u64; +} + +internal KeyValuePair * +key_value_pairs_from_hash_table(Arena *arena, HashTable *ht) +{ + KeyValuePair *pairs = push_array_no_zero(arena, KeyValuePair, ht->count); + for (U64 bucket_idx = 0, cursor = 0; bucket_idx < ht->cap; ++bucket_idx) { + for (BucketNode *n = ht->buckets[bucket_idx].first; n != 0; n = n->next) { + Assert(cursor < ht->count); + pairs[cursor++] = n->v; + } + } + return pairs; +} + +internal void +sort_key_value_pairs_as_u32(KeyValuePair *pairs, U64 count) +{ + radsort(pairs, count, key_value_pair_is_before_u32); +} + +internal void +sort_key_value_pairs_as_u64(KeyValuePair *pairs, U64 count) +{ + radsort(pairs, count, key_value_pair_is_before_u64); +} + diff --git a/src/linker/hash_table.h b/src/linker/hash_table.h new file mode 100644 index 00000000..70786d5b --- /dev/null +++ b/src/linker/hash_table.h @@ -0,0 +1,81 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef struct KeyValuePair +{ + union { + String8 key_string; + U32 key_u32; + U64 key_u64; + }; + union { + String8 value_string; + void *value_raw; + U32 value_u32; + U64 value_u64; + }; +} KeyValuePair; + +typedef struct BucketNode +{ + struct BucketNode *next; + KeyValuePair v; +} BucketNode; + +typedef struct BucketList +{ + BucketNode *first; + BucketNode *last; +} BucketList; + +typedef struct HashTable +{ + U64 count; + U64 cap; + BucketList *buckets; + BucketList free_buckets; +} HashTable; + +//////////////////////////////// + +//- bucket list helpers + +internal void bucket_list_concat_in_place(BucketList *list, BucketList *to_concat); +internal BucketNode * bucket_list_pop(BucketList *list); + +//- main + +internal U64 
hash_table_hasher(String8 string); +internal HashTable * hash_table_init(Arena *arena, U64 cap); +internal void hash_table_purge(HashTable *ht); + +//- push + +internal BucketNode * hash_table_push (Arena *arena, HashTable *ht, U64 hash, KeyValuePair v); +internal BucketNode * hash_table_push_u32_string (Arena *arena, HashTable *ht, U32 key, String8 value); +internal BucketNode * hash_table_push_u64_string (Arena *arena, HashTable *ht, U64 key, String8 value); +internal BucketNode * hash_table_push_string_string(Arena *arena, HashTable *ht, String8 key, String8 value); +internal BucketNode * hash_table_push_path_string (Arena *arena, HashTable *ht, String8 key, String8 value); +internal BucketNode * hash_table_push_u32_raw (Arena *arena, HashTable *ht, U32 key, void *value); +internal BucketNode * hash_table_push_u64_raw (Arena *arena, HashTable *ht, U64 key, void *value); +internal BucketNode * hash_table_push_path_raw (Arena *arena, HashTable *ht, String8 path, void *value); +internal BucketNode * hash_table_push_path_u64 (Arena *arena, HashTable *ht, String8 path, U64 value); +internal BucketNode * hash_table_push_u64_u64 (Arena *arena, HashTable *ht, U64 key, U64 value); + +//- search + +internal KeyValuePair * hash_table_search_string (HashTable *ht, String8 string); +internal KeyValuePair * hash_table_search_u32 (HashTable *ht, U32 key); +internal KeyValuePair * hash_table_search_u64 (HashTable *ht, U64 key); +internal KeyValuePair * hash_table_search_path (HashTable *ht, String8 path); + +internal B32 hash_table_search_path_u64(HashTable *ht, String8 key, U64 *value_out); + +//- key-value helpers + +internal KeyValuePair * key_value_pairs_from_hash_table(Arena *arena, HashTable *ht); +internal void sort_key_value_pairs_as_u32(KeyValuePair *pairs, U64 count); +internal void sort_key_value_pairs_as_u64(KeyValuePair *pairs, U64 count); + diff --git a/src/linker/lnk.c b/src/linker/lnk.c new file mode 100644 index 00000000..b4e51fa2 --- /dev/null +++ 
b/src/linker/lnk.c @@ -0,0 +1,4580 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +//////////////////////////////// +// Build Options + +#define BUILD_CONSOLE_INTERFACE 1 +#define BUILD_VERSION_MAJOR 0 +#define BUILD_VERSION_MINOR 6 +#define BUILD_VERSION_PATCH 0 +#define BUILD_RELEASE_PHASE_STRING_LITERAL "ALPHA" +#define BUILD_VERSION_STRING Stringify(BUILD_VERSION_MAJOR) "." Stringify(BUILD_VERSION_MINOR) "." Stringify(BUILD_VERSION_PATCH) +#define BUILD_TITLE "Epic Games Tools (R) RAD PE/COFF Linker " BUILD_VERSION_STRING + +//////////////////////////////// + +#define ARENA_FREE_LIST 1 + +//////////////////////////////// +// Third Party + +#include "base_ext/base_blake3.h" +#include "base_ext/base_blake3.c" +#include "third_party_ext/md5/md5.c" +#include "third_party_ext/md5/md5.h" +#include "third_party_ext/xxHash/xxhash.c" +#include "third_party_ext/xxHash/xxhash.h" + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable:4789) +#endif +#include "third_party_ext/radsort/radsort.h" +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +//////////////////////////////// +// Code Base + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Winitializer-overrides" +# pragma clang diagnostic ignored "-Wswitch" +#endif + +#include "base/base_inc.h" +#include "os/os_inc.h" +#include "path/path.h" +#include "coff/coff.h" +#include "pe/pe.h" +#include "codeview/codeview.h" +#include "msf/msf.h" +#include "msf/msf_parse.h" +#include "pdb/pdb.h" + +#include "base/base_inc.c" +#include "os/os_inc.c" +#include "path/path.c" +#include "coff/coff.c" +#include "pe/pe.c" +#include "codeview/codeview.c" +#include "msf/msf_parse.c" +#include "pdb/pdb.c" + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + +//////////////////////////////// +// RDI + +#include "rdi/rdi_overrides.h" +#include "lib_rdi_format/rdi_format.h" +#include "rdi/rdi.h" 
+#include "lib_rdi_format/rdi_format.c" +#include "rdi/rdi.c" + +//////////////////////////////// +// Code Base Extensions + +#include "base_ext/base_inc.h" +#include "path_ext/path.h" +#include "hash_table.h" +#include "thread_pool/thread_pool.h" +#include "os_ext/os_inc.h" +#include "codeview_ext/codeview.h" +#include "pdb_ext/msf_builder.h" +#include "pdb_ext/pdb.h" +#include "pdb_ext/pdb_helpers.h" +#include "pdb_ext/pdb_builder.h" + +#include "base_ext/base_inc.c" +#include "path_ext/path.c" +#include "hash_table.c" +#include "thread_pool/thread_pool.c" +#include "os_ext/os_inc.c" +#include "codeview_ext/codeview.c" +#include "pdb_ext/msf_builder.c" +#include "pdb_ext/pdb.c" +#include "pdb_ext/pdb_helpers.c" +#include "pdb_ext/pdb_builder.c" + +//////////////////////////////// +// RDI Builder + +#include "rdi/rdi_builder.h" +#include "rdi/rdi_coff.h" +#include "rdi/rdi_cv.h" + +#include "rdi/rdi_builder.c" +#include "rdi/rdi_coff.c" +#include "rdi/rdi_cv.c" + +//////////////////////////////// +// Linker + +#include "lnk_error.h" +#include "lnk_log.h" +#include "lnk_timer.h" +#include "lnk_cmd_line.h" +#include "lnk_config.h" +#include "lnk_chunk.h" +#include "lnk_reloc.h" +#include "lnk_directive.h" +#include "lnk_symbol_table.h" +#include "lnk_section_table.h" +#include "lnk_obj.h" +#include "lnk_import_table.h" +#include "lnk_export_table.h" +#include "lnk_lib.h" +#include "lnk_debug_info.h" +#include "lnk.h" + +#include "lnk_error.c" +#include "lnk_log.c" +#include "lnk_timer.c" +#include "lnk_cmd_line.c" +#include "lnk_config.c" +#include "lnk_chunk.c" +#include "lnk_reloc.c" +#include "lnk_directive.c" +#include "lnk_symbol_table.c" +#include "lnk_section_table.c" +#include "lnk_obj.c" +#include "lnk_import_table.c" +#include "lnk_export_table.c" +#include "lnk_lib.c" +#include "lnk_debug_info.c" + +//////////////////////////////// + +internal LNK_InputImport * +lnk_input_import_list_push(Arena *arena, LNK_InputImportList *list) +{ + LNK_InputImport *node 
= push_array(arena, LNK_InputImport, 1); + SLLQueuePush(list->first, list->last, node); + list->count += 1; + return node; +} + +internal void +lnk_input_import_list_concat_in_place(LNK_InputImportList *list, LNK_InputImportList *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal LNK_InputImport ** +lnk_input_import_arr_from_list(Arena *arena, LNK_InputImportList list) +{ + LNK_InputImport **result = push_array_no_zero(arena, LNK_InputImport *, list.count); + U64 idx = 0; + for (LNK_InputImport *node = list.first; node != 0; node = node->next) { + Assert(idx < list.count); + result[idx++] = node; + } + return result; +} + +internal LNK_InputImportList +lnk_list_from_input_import_arr(LNK_InputImport **arr, U64 count) +{ + LNK_InputImportList list; MemoryZeroStruct(&list); + for (U64 i = 0; i < count; i += 1) { + SLLQueuePush(list.first, list.last, arr[i]); + list.count += 1; + } + return list; +} + +int +lnk_input_import_is_before(void *raw_a, void *raw_b) +{ + LNK_InputImport **a = raw_a; + LNK_InputImport **b = raw_b; + int cmp = str8_compar_ignore_case(&(*a)->import_header.dll_name, &(*b)->import_header.dll_name); + if (cmp == 0) { + cmp = str8_compar_case_sensetive(&(*a)->import_header.func_name, &(*b)->import_header.func_name); + } + return cmp < 0; +} + +int +lnk_input_import_compar(const void *raw_a, const void *raw_b) +{ + const LNK_InputImport **a = (const LNK_InputImport **) raw_a; + const LNK_InputImport **b = (const LNK_InputImport **) raw_b; + int cmp = str8_compar_ignore_case(&(*a)->import_header.dll_name, &(*b)->import_header.dll_name); + if (cmp == 0) { + cmp = str8_compar_case_sensetive(&(*a)->import_header.func_name, &(*b)->import_header.func_name); + } + return cmp; +} + +//////////////////////////////// + +internal void +lnk_write_data_list_to_file_path(String8 path, String8List data) +{ +#if PROFILE_TELEMETRY + Temp scratch = scratch_begin(0, 0); + String8 size_str = str8_from_memory_size2(scratch.arena, data.total_size); + 
ProfBeginDynamic("Write %.*s to %.*s", str8_varg(size_str), str8_varg(path)); + scratch_end(scratch); +#endif + + B32 is_written = os_write_data_list_to_file_path(path, data); + if (is_written) { + if (lnk_get_log_status(LNK_Log_IO)) { + Temp scratch = scratch_begin(0,0); + String8 size_str = str8_from_memory_size2(scratch.arena, data.total_size); + lnk_log(LNK_Log_IO, "File \"%S\" %S written", path, size_str); + scratch_end(scratch); + } + } else { + lnk_error(LNK_Error_NoAccess, "don't have access to write to %S", path); + } + ProfEnd(); +} + +internal void +lnk_write_data_to_file_path(String8 path, String8 data) +{ + Temp scratch = scratch_begin(0,0); + String8List data_list = {0}; + str8_list_push(scratch.arena, &data_list, data); + lnk_write_data_list_to_file_path(path, data_list); + scratch_end(scratch); +} + +internal String8 +lnk_make_full_path(Arena *arena, String8 work_dir, PathStyle system_path_style, String8 path) +{ + ProfBeginFunction(); + String8 result = str8(0,0); + PathStyle path_style = path_style_from_str8(path); + if (path_style == PathStyle_Relative) { + Temp scratch = scratch_begin(&arena, 1); + String8List list; MemoryZeroStruct(&list); + str8_list_push(scratch.arena, &list, work_dir); + str8_list_push(scratch.arena, &list, path); + result = str8_path_list_join_by_style(arena, &list, system_path_style); + scratch_end(scratch); + } else { + result = push_str8_copy(arena, path); + } + ProfEnd(); + return result; +} + +//////////////////////////////// + +internal String8List +lnk_make_linker_manifest(Arena *arena, + B32 manifest_uac, + String8 manifest_level, + String8 manifest_ui_access, + String8List manifest_dependency_list) +{ + String8List srl = {0}; + str8_serial_begin(arena, &srl); + str8_serial_push_string(arena, &srl, str8_lit( + "\n" + "\n")); + if (manifest_uac) { + String8 uac = push_str8f(arena, + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n", + manifest_level, + manifest_ui_access); + str8_serial_push_string(arena, &srl, 
uac); + } + for (String8Node *node = manifest_dependency_list.first; node != 0; node = node->next) { + String8 dep = push_str8f(arena, + " \n" + " \n" + " \n" + " \n" + " \n", + node->string); + str8_serial_push_string(arena, &srl, dep); + } + str8_serial_push_string(arena, &srl, str8_lit("\n")); + return srl; +} + +internal String8 +lnk_merge_manifest_files(Arena *arena, String8 mt_path, String8 manifest_name, String8List manifest_path_list) +{ + ProfBeginFunction(); + + Temp scratch = scratch_begin(&arena,1); + + String8List invoke_cmd_line = {0}; + str8_list_push(arena, &invoke_cmd_line, mt_path); + String8 work_dir = os_get_current_path(arena); + for (String8Node *man_node = manifest_path_list.first; + man_node != 0; + man_node = man_node->next) { + String8 full_path = path_absolute_dst_from_relative_dst_src(arena, man_node->string, work_dir); + full_path = path_convert_slashes(arena, full_path, PathStyle_UnixAbsolute); + str8_list_pushf(arena, &invoke_cmd_line, "-manifest"); + str8_list_push(arena, &invoke_cmd_line, full_path); + } + str8_list_pushf(arena, &invoke_cmd_line, "-out:%S", manifest_name); + str8_list_pushf(arena, &invoke_cmd_line, "-nologo"); + + OS_ProcessLaunchParams launch_opts = {0}; + launch_opts.cmd_line = invoke_cmd_line; + launch_opts.path = str8_chop_last_slash(mt_path); + launch_opts.inherit_env = 1; + launch_opts.consoleless = 1; + + OS_Handle mt_handle = os_process_launch(&launch_opts); + if (!os_handle_match(mt_handle, os_handle_zero())) { + if (os_process_join(mt_handle, max_U64)) { + if (!os_file_path_exists(manifest_name)) { + lnk_error(LNK_Error_Mt, "something went wrong, manifest was not written to \"%S\"", manifest_name); + } + } + os_process_detach(mt_handle); + } else { + lnk_error(LNK_Error_Mt, "unable to start process for %S", mt_path); + } + + scratch_end(scratch); + ProfEnd(); + return manifest_name; +} +internal String8 +lnk_res_from_data(Arena *arena, String8 data) +{ + Temp scratch = scratch_begin(&arena, 1); + + 
COFF_ResourceID type; + type.type = COFF_ResourceIDType_NUMBER; + type.u.number = PE_ResourceKind_MANIFEST; + + COFF_ResourceID name; + name.type = COFF_ResourceIDType_NUMBER; + name.u.number = 1; + + String8List res_list = coff_write_resource(arena, type, name, 1, 0, 1033, 0, 0, data); + String8 res_data = str8_serial_end(arena, &res_list); + + scratch_end(scratch); + return res_data; +} + +//////////////////////////////// + +internal int +lnk_res_string_id_is_before(void *raw_a, void *raw_b) +{ + PE_Resource *a = raw_a; + PE_Resource *b = raw_b; + Assert(a->id.type == COFF_ResourceIDType_STRING); + Assert(b->id.type == COFF_ResourceIDType_STRING); + int is_before = str8_is_before_case_sensetive(&a->id.u.string, &b->id.u.string); + return is_before; +} + +internal int +lnk_res_number_id_is_before(void *raw_a, void *raw_b) +{ + PE_Resource *a = raw_a; + PE_Resource *b = raw_b; + Assert(a->id.type == COFF_ResourceIDType_NUMBER); + Assert(b->id.type == COFF_ResourceIDType_NUMBER); + int is_before = u16_is_before(&a->id.u.number, &b->id.u.number); + return is_before; +} + +internal void +lnk_serialize_pe_resource_tree(LNK_SectionTable *st, LNK_SymbolTable *symtab, PE_ResourceDir *root_dir) +{ + ProfBeginFunction(); + + static const U64 ALIGN = 4; + + struct stack_s { + struct stack_s *next; + U64 arr_idx; + U64 res_idx[2]; + PE_ResourceArray res_arr[2]; + LNK_Chunk *coff_entry_array_chunk; + LNK_Chunk *coff_entry_chunk; + }; + + Temp scratch = scratch_begin(0, 0); + + LNK_Section *dir_sect = lnk_section_table_push(st, str8_lit(".rsrc$01"), LNK_RSRC_SECTION_FLAGS); + LNK_Section *data_sect = lnk_section_table_push(st, str8_lit(".rsrc$02"), LNK_RSRC_SECTION_FLAGS); + + LNK_Chunk *dir_tree_chunk = lnk_section_push_chunk_list(dir_sect, dir_sect->root, str8(0,0)); + LNK_Chunk *dir_data_chunk = lnk_section_push_chunk_list(dir_sect, dir_sect->root, str8(0,0)); + LNK_Chunk *dir_string_chunk = lnk_section_push_chunk_list(dir_sect, dir_sect->root, str8(0,0)); + + 
dir_tree_chunk->sort_idx = str8_lit("a"); + dir_string_chunk->sort_idx = str8_lit("b"); + dir_data_chunk->sort_idx = str8_lit("c"); + + PE_Resource root_wrapper; MemoryZeroStruct(&root_wrapper); + root_wrapper.id.type = COFF_ResourceIDType_NUMBER; + root_wrapper.id.u.number = 0; + root_wrapper.kind = PE_ResDataKind_DIR; + root_wrapper.u.dir = root_dir; + + struct stack_s *stack = push_array(scratch.arena, struct stack_s, 1); + stack->res_arr[0].count = 1; + stack->res_arr[0].v = &root_wrapper; + + U64 res_counter = 0; + + while (stack) { + while (stack->arr_idx < ArrayCount(stack->res_arr)) { + while (stack->res_idx[stack->arr_idx] < stack->res_arr[stack->arr_idx].count) { + PE_Resource *res = &stack->res_arr[stack->arr_idx].v[stack->res_idx[stack->arr_idx]]; + stack->res_idx[stack->arr_idx] += 1; + + String8 flag_name = push_str8f(symtab->arena, "flag_%u", res_counter); + String8 offset_name = push_str8f(symtab->arena, "offset_%u", res_counter); + ++res_counter; + + if (stack->coff_entry_array_chunk) { + COFF_ResourceDirEntry *entry = push_array(dir_sect->arena, COFF_ResourceDirEntry, 1); + stack->coff_entry_chunk = lnk_section_push_chunk_data(dir_sect, stack->coff_entry_array_chunk, str8_struct(entry), str8(0,0)); + + switch (res->id.type) { + case COFF_ResourceIDType_NUMBER: { + entry->name.id = res->id.u.number; + } break; + case COFF_ResourceIDType_STRING: { + // TODO: we can make string table smaller by reusing offsets for same strings + + // not sure why high bit has to be turned on here since number id and string id entries are + // in separate arrays but windows doesn't treat name offset like string without this bit. 
+ entry->name.offset |= (1 << 31); + + // convert name to utf-16 + String16 name16 = str16_from_8(dir_sect->arena, res->id.u.string); + + // build name string + U64 name16_byte_size = name16.size * sizeof(U16); + U64 buffer_size = /* char count: */ sizeof(U16) + name16_byte_size; + U8 *buffer = push_array_no_zero(dir_sect->arena, U8, buffer_size); + *(U16*)buffer = name16.size; + MemoryCopy(buffer + sizeof(U16), name16.str, name16_byte_size); + + // push string table chunk + String8 name_data = str8(buffer, buffer_size); + LNK_Chunk *name_chunk = lnk_section_push_chunk_data(dir_sect, dir_string_chunk, name_data, str8(0,0)); + + // push name chunk symbol + LNK_Symbol *name_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit("COFF_RESOURCE_ID_STRING"), LNK_DefinedSymbolVisibility_Static, 0, name_chunk, 0, 0, 0); + lnk_section_push_reloc(dir_sect, stack->coff_entry_chunk, LNK_Reloc_SECT_REL, OffsetOf(COFF_ResourceDirEntry, name.offset), name_symbol); + } break; + case COFF_ResourceIDType_NULL: break; + default: InvalidPath; + } + } + + switch (res->kind) { + case PE_ResDataKind_DIR: { + // initialize directory header + COFF_ResourceDirTable *dir_header = push_array(dir_sect->arena, COFF_ResourceDirTable, 1); + dir_header->characteristics = res->u.dir->characteristics; + dir_header->time_stamp = res->u.dir->time_stamp; + dir_header->major_version = res->u.dir->major_version; + dir_header->minor_version = res->u.dir->minor_version; + dir_header->name_entry_count = res->u.dir->named_list.count; + dir_header->id_entry_count = res->u.dir->id_list.count; + + // push sub directory chunk layout + LNK_Chunk *dir_node_chunk = lnk_section_push_chunk_list(dir_sect, dir_tree_chunk, str8(0,0)); + dir_node_chunk->align = ALIGN; + LNK_Chunk *dir_header_chunk = lnk_section_push_chunk_data(dir_sect, dir_node_chunk, str8_struct(dir_header), str8(0,0)); + LNK_Chunk *entry_array_chunk = lnk_section_push_chunk_list(dir_sect, dir_node_chunk, str8(0,0)); + 
lnk_chunk_set_debugf(dir_sect->arena, dir_header_chunk, "DIR_HEADER_CHUNK"); + lnk_chunk_set_debugf(dir_sect->arena, entry_array_chunk, "DIR_ENTRY_ARRAY_CHUNK"); + + // push symbols to patch coff entry + LNK_Symbol *flag_symbol = lnk_make_defined_symbol_va(symtab->arena, flag_name, LNK_DefinedSymbolVisibility_Internal, 0, COFF_RESOURCE_SUB_DIR_FLAG); + LNK_Symbol *offset_symbol = lnk_make_defined_symbol_chunk(symtab->arena, offset_name, LNK_DefinedSymbolVisibility_Internal, 0, dir_header_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, flag_symbol); // set high bit to indicate directory + lnk_symbol_table_push(symtab, offset_symbol); // write offset for this directory + + // patch resource dir header + if (stack->coff_entry_chunk) { + lnk_section_push_reloc(dir_sect, stack->coff_entry_chunk, LNK_Reloc_ADDR_32, OffsetOf(COFF_ResourceDirEntry, id.data_entry_offset), flag_symbol); + lnk_section_push_reloc(dir_sect, stack->coff_entry_chunk, LNK_Reloc_SECT_REL, OffsetOf(COFF_ResourceDirEntry, id.data_entry_offset), offset_symbol); + } + + // sort entries by id + PE_ResourceArray named_array = pe_resource_list_to_array(scratch.arena, &res->u.dir->named_list); + PE_ResourceArray id_array = pe_resource_list_to_array(scratch.arena, &res->u.dir->id_list); + radsort(named_array.v, named_array.count, lnk_res_string_id_is_before); + radsort(id_array.v, id_array.count, lnk_res_number_id_is_before); + + // frame for sub directory + struct stack_s *frame = push_array(scratch.arena, struct stack_s, 1); + frame->coff_entry_array_chunk = entry_array_chunk; + frame->res_arr[0] = named_array; + frame->res_arr[1] = id_array; + SLLStackPush(stack, frame); + } goto yeild; // recurse to sub directory + + case PE_ResDataKind_COFF_RESOURCE: { + COFF_ResourceDataEntry *coff_resource_data_entry = push_array(dir_sect->arena, COFF_ResourceDataEntry, 1); + coff_resource_data_entry->data_size = res->u.coff_res.data.size; + coff_resource_data_entry->data_voff = 0; // relocated + 
coff_resource_data_entry->code_page = 0; // TODO: whats this for? (lld-link writes zero) + + // push layout chunks + LNK_Chunk *coff_resource_data_entry_chunk = lnk_section_push_chunk_data(dir_sect, dir_data_chunk, str8_struct(coff_resource_data_entry), str8(0,0)); + LNK_Chunk *resource_data_chunk = lnk_section_push_chunk_data(data_sect, data_sect->root, res->u.coff_res.data, str8(0,0)); + + // windows errors out on unaligned data + coff_resource_data_entry_chunk->align = ALIGN; + resource_data_chunk->align = ALIGN; + + // relocate data + String8 resource_data_symbol_name = push_str8f(symtab->arena, "$R%06X", res_counter); + LNK_Symbol *resource_data_symbol = lnk_make_defined_symbol_chunk(symtab->arena, resource_data_symbol_name, LNK_DefinedSymbolVisibility_Static, 0, resource_data_chunk, 0, 0, 0); + lnk_section_push_reloc(dir_sect, coff_resource_data_entry_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(COFF_ResourceDataEntry, data_voff), resource_data_symbol); + + // push symbol for data offset relocation + LNK_Symbol *coff_data_offset_symbol = lnk_make_defined_symbol_chunk(symtab->arena, offset_name, LNK_DefinedSymbolVisibility_Internal, 0, coff_resource_data_entry_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, coff_data_offset_symbol); + + Assert(stack->coff_entry_chunk); + lnk_section_push_reloc(dir_sect, stack->coff_entry_chunk, LNK_Reloc_SECT_REL, OffsetOf(COFF_ResourceDirEntry, id.data_entry_offset), coff_data_offset_symbol); + } break; + + case PE_ResDataKind_NULL: break; + + // we must not have this resource node here, it is used to represent on-disk version of entry + case PE_ResDataKind_COFF_LEAF: InvalidPath; + } + } + ++stack->arr_idx; + } + SLLStackPop(stack); + yeild:; + } + + scratch_end(scratch); + ProfEnd(); +} + +internal void +lnk_add_resource_debug_s(LNK_SectionTable *st, + LNK_SymbolTable *symtab, + String8 obj_path, + String8 cwd_path, + String8 exe_path, + CV_Arch arch, + String8List res_file_list, + MD5Hash *res_hash_array) +{ + 
ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + // init serial for tables + String8List string_srl, file_srl; MemoryZeroStruct(&string_srl); MemoryZeroStruct(&file_srl); + str8_serial_begin(scratch.arena, &string_srl); + str8_serial_begin(scratch.arena, &file_srl); + + // reserve first byte for null + str8_serial_push_u8(scratch.arena, &string_srl, 0); + + // build file and string table + U64 node_idx = 0; + for (String8Node *n = res_file_list.first; n != NULL; n = n->next, ++node_idx) { + CV_C13Checksum checksum = {0}; + checksum.name_off = string_srl.total_size; + checksum.len = sizeof(MD5Hash); + checksum.kind = CV_C13ChecksumKind_MD5; + str8_serial_push_struct(scratch.arena, &file_srl, &checksum); + str8_serial_push_struct(scratch.arena, &file_srl, &res_hash_array[node_idx]); + str8_serial_push_align(scratch.arena, &file_srl, CV_FileCheckSumsAlign); + str8_serial_push_cstr(scratch.arena, &string_srl, n->string); + } + + // build symbols + String8 obj_data = cv_make_obj_name(scratch.arena, obj_path, 0); + + String8 exe_name_with_ext = str8_skip_last_slash(exe_path); + String8 exe_name_ext = str8_skip_last_dot(exe_name_with_ext); + String8 exe_name = str8_chop(exe_name_with_ext, exe_name_ext.size); + if (exe_name_ext.size > 0) { + exe_name = str8_chop(exe_name, 1); + } + String8 version_string = push_str8f(scratch.arena, BUILD_TITLE); + String8 comp_data = cv_make_comp3(scratch.arena, CV_Compile3Flag_EC, CV_Language_CVTRES, arch, + 0, 0, 0, 0, + 1, 0, 1, 0, + version_string); + + String8List env_list; MemoryZeroStruct(&env_list); + str8_list_push(scratch.arena, &env_list, str8_lit("cwd")); + str8_list_push(scratch.arena, &env_list, cwd_path); + str8_list_push(scratch.arena, &env_list, str8_lit("exe")); + str8_list_push(scratch.arena, &env_list, exe_path); + str8_list_push(scratch.arena, &env_list, str8_lit("")); + str8_list_push(scratch.arena, &env_list, str8_lit("")); + String8 envblock_data = cv_make_envblock(scratch.arena, env_list); + + String8 
obj_symbol = cv_make_symbol(scratch.arena, CV_SymKind_OBJNAME, obj_data); + String8 comp_symbol = cv_make_symbol(scratch.arena, CV_SymKind_COMPILE3, comp_data); + String8 envblock_symbol = cv_make_symbol(scratch.arena, CV_SymKind_ENVBLOCK, envblock_data); + + String8List symbol_srl; MemoryZeroStruct(&symbol_srl); + str8_serial_begin(scratch.arena, &symbol_srl); + str8_serial_push_string(scratch.arena, &symbol_srl, obj_symbol); + str8_serial_push_string(scratch.arena, &symbol_srl, comp_symbol); + str8_serial_push_string(scratch.arena, &symbol_srl, envblock_symbol); + + // build code view sub-sections + String8List sub_sect_srl; MemoryZeroStruct(&sub_sect_srl); + str8_serial_begin(scratch.arena, &sub_sect_srl); + CV_Signature sig = CV_Signature_C13; + str8_serial_push_struct(scratch.arena, &sub_sect_srl, &sig); + + CV_C13SubSectionHeader string_header; + string_header.kind = CV_C13SubSectionKind_StringTable; + string_header.size = string_srl.total_size; + str8_serial_push_struct(scratch.arena, &sub_sect_srl, &string_header); + str8_serial_push_data_list(scratch.arena, &sub_sect_srl, string_srl.first); + str8_serial_push_align(scratch.arena, &sub_sect_srl, CV_C13SubSectionAlign); + + CV_C13SubSectionHeader file_header; + file_header.kind = CV_C13SubSectionKind_FileChksms; + file_header.size = file_srl.total_size; + str8_serial_push_struct(scratch.arena, &sub_sect_srl, &file_header); + str8_serial_push_data_list(scratch.arena, &sub_sect_srl, file_srl.first); + str8_serial_push_align(scratch.arena, &sub_sect_srl, CV_C13SubSectionAlign); + + CV_C13SubSectionHeader symbol_header; + symbol_header.kind = CV_C13SubSectionKind_Symbols; + symbol_header.size = symbol_srl.total_size; + str8_serial_push_struct(scratch.arena, &sub_sect_srl, &symbol_header); + str8_serial_push_data_list(scratch.arena, &sub_sect_srl, symbol_srl.first); + str8_serial_push_align(scratch.arena, &sub_sect_srl, CV_C13SubSectionAlign); + + LNK_Section *debug_s = lnk_section_table_push(st, 
str8_lit(".debug$S"), LNK_DEBUG_SECTION_FLAGS); + String8 sub_sect_data = str8_serial_end(debug_s->arena, &sub_sect_srl); + lnk_section_push_chunk_data(debug_s, debug_s->root, sub_sect_data, str8(0,0)); + + scratch_end(scratch); + ProfEnd(); +} + +internal String8 +lnk_make_res_obj(TP_Context *tp, + Arena *arena, + PE_ResourceDir *root_dir, + COFF_MachineType machine, + COFF_TimeStamp time_stamp, + String8 path, + String8 cwd_path, + String8 exe_path, + String8List res_file_list, + MD5Hash *res_hash_array) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + static const U64 sect_virt_align = 1; + static const U64 sect_file_align = 1; + + LNK_SymbolTable *symtab = lnk_symbol_table_alloc(); + LNK_SectionTable *st = lnk_section_table_alloc(0, sect_virt_align, sect_file_align); + LNK_Section *header_sect = lnk_section_table_push(st, str8_lit(".null"), 0); + + lnk_serialize_pe_resource_tree(st, symtab, root_dir); + + CV_Arch cv_arch = cv_arch_from_coff_machine(machine); + lnk_add_resource_debug_s(st, symtab, path, cwd_path, exe_path, cv_arch, res_file_list, res_hash_array); + + // register section symbols (after this point don't push new sections) + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = sect_node->next) { + LNK_Section *sect = §_node->data; + LNK_Symbol *sect_symbol = lnk_make_defined_symbol_chunk(symtab->arena, sect->name, LNK_DefinedSymbolVisibility_Internal, 0, sect->root, 0, 0, 0); + lnk_symbol_table_push(symtab, sect_symbol); + } + st->null_sect = lnk_section_list_remove(&st->list, str8_lit(".null")); + lnk_section_table_build_data(tp, st, machine); + lnk_section_table_push_null(st); + lnk_section_table_assign_indices(st); + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + + COFF_Symbol16List coff_symbol_list = {0}; + + COFF_Symbol16 coff_feat00 = {0}; + MemoryCopyStr8(&coff_feat00.name, str8_lit("@feat.00")); + coff_feat00.value = 
COFF_FeatFlag_HAS_SAFE_SEH|COFF_FeatFlag_UNKNOWN_4; + coff_feat00.section_number = COFF_SYMBOL_ABS_SECTION_16; + coff_feat00.storage_class = COFF_SymStorageClass_STATIC; + coff_symbol16_list_push(scratch.arena, &coff_symbol_list, coff_feat00); + + // emit coff symbols for section definitions + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = sect_node->next) { + LNK_Section *sect = §_node->data; + if (sect == header_sect) continue; + if (!sect->emit_header) continue; + + U64 reloc_count = 0; + LNK_Symbol *coff_reloc_count_symbol = lnk_symbol_table_searchf(symtab, LNK_SymbolScopeFlag_Internal, "%S.coff_relocs[].count", sect->name); + if (coff_reloc_count_symbol) { + reloc_count = coff_reloc_count_symbol->u.defined.u.va; + } + + U64 sect_size = lnk_virt_size_from_chunk_ref(sect_id_map, sect->root->ref); + + COFF_Symbol16 coff_sect_symbol = {0}; + Assert(sect->name.size <= 8); + MemoryCopyStr8(&coff_sect_symbol.name, sect->name); + coff_sect_symbol.value = 0; + coff_sect_symbol.section_number = sect->isect; + coff_sect_symbol.aux_symbol_count = 1; + coff_sect_symbol.storage_class = COFF_SymStorageClass_STATIC; + + COFF_SymbolSecDef secdef = {0}; + secdef.length = safe_cast_u32(sect_size); + secdef.number = sect->isect; + secdef.number_of_relocations = safe_cast_u32(reloc_count); + + coff_symbol16_list_push(scratch.arena, &coff_symbol_list, coff_sect_symbol); + coff_symbol16_list_push(scratch.arena, &coff_symbol_list, *((COFF_Symbol16*)&secdef)); + } + + // convert relocations and symbols to coff format + { + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = sect_node->next) { + LNK_Section *sect = §_node->data; + + // filter out resource relocations + LNK_RelocList reloc_list = {0}; + LNK_RelocList res_data_reloc_list = {0}; + for (LNK_Reloc *reloc = sect->reloc_list.first; reloc != 0; reloc = reloc->next) { + B32 is_reloc_symbol = str8_match(str8_lit("$R"), reloc->symbol->name, StringMatchFlag_RightSideSloppy); 
+ LNK_Reloc *dst; + if (is_reloc_symbol) { + dst = lnk_reloc_list_push(sect->arena, &res_data_reloc_list); + } else { + dst = lnk_reloc_list_push(sect->arena, &reloc_list); + } + dst->chunk = reloc->chunk; + dst->type = reloc->type; + dst->apply_off = reloc->apply_off; + dst->symbol = reloc->symbol; + } + sect->reloc_list = reloc_list; + + COFF_RelocList coff_reloc_list = {0}; + for (LNK_Reloc *reloc = res_data_reloc_list.first; reloc != 0; reloc = reloc->next) { + LNK_Symbol *symbol = reloc->symbol; + + Assert(LNK_Symbol_IsDefined(symbol->type)); + Assert(symbol->u.defined.value_type == LNK_DefinedSymbolValue_Chunk); + LNK_DefinedSymbol *def = &symbol->u.defined; + + // resolve symbol offset + LNK_Section *symbol_sect = lnk_sect_from_chunk_ref(sect_id_map, def->u.chunk->ref); + U64 chunk_off = lnk_off_from_chunk_ref(sect_id_map, def->u.chunk->ref); + U64 symbol_offset = chunk_off + def->u.chunk_offset; + U64 symbol_idx = coff_symbol_list.count; + + // push coff symbol + COFF_Symbol16 coff_symbol = {0}; + Assert(symbol->name.size <= 8); + String8 symbol_name = push_str8f(scratch.arena, "$R%06X", symbol_offset); + MemoryCopyStr8(&coff_symbol.name, symbol_name); + coff_symbol.value = symbol_offset; + coff_symbol.section_number = symbol_sect->isect; + coff_symbol.storage_class = COFF_SymStorageClass_STATIC; + coff_symbol16_list_push(scratch.arena, &coff_symbol_list, coff_symbol); + + // push coff reloc + U64 reloc_off = lnk_off_from_chunk_ref(sect_id_map, reloc->chunk->ref); + reloc_off += reloc->apply_off; + + COFF_Reloc coff_reloc = {0}; + coff_reloc.apply_off = reloc_off; + coff_reloc.isymbol = safe_cast_u32(symbol_idx); + coff_reloc.type = lnk_ext_reloc_type_to_coff(machine, reloc->type); + coff_reloc_list_push(scratch.arena, &coff_reloc_list, coff_reloc); + } + + if (coff_reloc_list.count == 0) continue; + + // push section for relocation data + String8 sect_name = push_str8f(st->arena, "%S.relocs", sect->name); + LNK_Section *reloc_sect = 
lnk_section_table_push(st, sect_name, 0); + reloc_sect->emit_header = 0; + + // push chunk layout for relocations + LNK_Chunk *reloc_array_chunk = lnk_section_push_chunk_list(reloc_sect, reloc_sect->root, str8(0,0)); + for (COFF_RelocNode *i = coff_reloc_list.first; i != 0; i = i->next) { + String8 reloc_data = push_str8_copy(reloc_sect->arena, str8_struct(&i->data)); + lnk_section_push_chunk_data(reloc_sect, reloc_array_chunk, reloc_data, str8(0,0)); + } + + // emit symbols for coff section header patch + String8 coff_reloc_symbol_name = push_str8f(symtab->arena, "%S.coff_reloc[]", sect->name); + String8 coff_reloc_count_symbol_name = push_str8f(symtab->arena, "%S.coff_reloc[].count", sect->name); + LNK_Symbol *coff_reloc_symbol = lnk_make_defined_symbol_chunk(symtab->arena, coff_reloc_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, reloc_array_chunk, 0, 0, 0); + LNK_Symbol *coff_reloc_count_symbol = lnk_make_defined_symbol_va(symtab->arena, coff_reloc_count_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, coff_reloc_list.count); + lnk_symbol_table_push(symtab, coff_reloc_symbol); + lnk_symbol_table_push(symtab, coff_reloc_count_symbol); + } + } + + LNK_Section *misc_sect = lnk_section_table_push(st, str8_lit(".misc"), COFF_SectionFlag_LNK_INFO|COFF_SectionFlag_LNK_REMOVE); + misc_sect->emit_header = 0; + + // serialize coff symbol list + String8List srl = {0}; + str8_serial_begin(scratch.arena, &srl); + for (COFF_Symbol16Node *i = coff_symbol_list.first; i != 0; i = i->next) { + str8_serial_push_struct(scratch.arena, &srl, &i->data); + } + String8 coff_symbol_table_data = str8_serial_end(scratch.arena, &srl); + LNK_Chunk *coff_symbol_table_chunk = lnk_section_push_chunk_data(misc_sect, misc_sect->root, coff_symbol_table_data, str8(0,0)); + LNK_Symbol *coff_symbol_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit("COFF_SYMBOL_TABLE"), LNK_DefinedSymbolVisibility_Internal, 0, coff_symbol_table_chunk, 0, 0, 0); + 
lnk_symbol_table_push(symtab, coff_symbol_table_symbol); + + LNK_Symbol *coff_symbol_count_symbol = lnk_make_defined_symbol_va(symtab->arena, str8_lit("COFF_SYMBOL_COUNT"), LNK_DefinedSymbolVisibility_Internal, 0, coff_symbol_list.count); + lnk_symbol_table_push(symtab, coff_symbol_count_symbol); + + // build obj header + { + // init header + COFF_Header *coff_header = push_array(header_sect->arena, COFF_Header, 1); + coff_header->machine = machine; + coff_header->section_count = 0; // relocated + coff_header->time_stamp = time_stamp; + coff_header->symbol_table_foff = 0; // relocated + coff_header->symbol_count = 0; // relocated + coff_header->optional_header_size = 0; // no PE header in obj + coff_header->flags = COFF_Flag_32BIT_MACHINE; + + // push coff header chunk + String8 coff_header_data = str8_struct(coff_header); + LNK_Chunk *coff_header_chunk = lnk_section_push_chunk_data(header_sect, header_sect->root, coff_header_data, str8(0,0)); + + // relocate coff header fields + lnk_section_push_reloc_undefined(header_sect, coff_header_chunk, LNK_Reloc_ADDR_32, OffsetOf(COFF_Header, section_count), str8_lit(LNK_COFF_SECT_HEADER_COUNT_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + lnk_section_push_reloc(header_sect, coff_header_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(COFF_Header, symbol_table_foff), coff_symbol_table_symbol); + lnk_section_push_reloc(header_sect, coff_header_chunk, LNK_Reloc_ADDR_32, OffsetOf(COFF_Header, symbol_count), coff_symbol_count_symbol); + + // push coff header symbol + LNK_Symbol *coff_header_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_COFF_HEADER_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, coff_header_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, coff_header_symbol); + } + + // build section headers + { + LNK_Chunk *coff_section_header_array_chunk = lnk_section_push_chunk_list(header_sect, header_sect->root, str8(0,0)); + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = 
sect_node->next) { + if (sect_node == st->null_sect) continue; + if (!sect_node->data.emit_header) continue; + LNK_Section *sect = §_node->data; + + // init section header + COFF_SectionHeader *coff_sect_header = push_array(header_sect->arena, COFF_SectionHeader, 1); + Assert(sect->name.size <= sizeof(coff_sect_header->name)); + MemoryCopyStr8(&coff_sect_header->name[0], sect->name); + coff_sect_header->flags = sect->flags; + coff_sect_header->vsize = 0; // ignored + coff_sect_header->voff = 0; // ignored + coff_sect_header->fsize = 0; // relocated + coff_sect_header->foff = 0; // relocated + coff_sect_header->relocs_foff = 0; // relocated + coff_sect_header->lines_foff = 0; // obsolete + coff_sect_header->line_count = 0; // obsolete + coff_sect_header->reloc_count = 0; // relocated + + // push section header chunk + String8 coff_sect_header_data = str8_struct(coff_sect_header); + String8 sort_index = lnk_make_section_sort_index(header_sect->arena, str8(0,0), 0, sect->isect); + LNK_Chunk *coff_sect_header_chunk = lnk_section_push_chunk_data(header_sect, coff_section_header_array_chunk, coff_sect_header_data, sort_index); + lnk_chunk_set_debugf(header_sect->arena, coff_sect_header_chunk, "%S", sect->name); + + // patch reloc fields + if (sect->reloc_list.count) { + String8 coff_reloc_symbol_name = push_str8f(scratch.arena, "%S.coff_reloc[]", sect->name); + String8 coff_reloc_count_symbol_name = push_str8f(scratch.arena, "%S.coff_reloc[].count", sect->name); + lnk_section_push_reloc_undefined(header_sect, coff_sect_header_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(COFF_SectionHeader, relocs_foff), coff_reloc_symbol_name, LNK_SymbolScopeFlag_Internal); + lnk_section_push_reloc_undefined(header_sect, coff_sect_header_chunk, LNK_Reloc_ADDR_32, OffsetOf(COFF_SectionHeader, reloc_count), coff_reloc_count_symbol_name, LNK_SymbolScopeFlag_Internal); + } + + // patch file fields + if (~sect->flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA) { + LNK_Symbol *sect_symbol = 
lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, sect->name); + lnk_section_push_reloc(header_sect, coff_sect_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_32, OffsetOf(COFF_SectionHeader, fsize), sect_symbol); + lnk_section_push_reloc(header_sect, coff_sect_header_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(COFF_SectionHeader, foff), sect_symbol); + } + } + + // push section header count symbol + U64 symbol_count = coff_section_header_array_chunk->u.list->count; + LNK_Symbol *coff_section_header_count_symbol = lnk_make_defined_symbol_va(symtab->arena, str8_lit(LNK_COFF_SECT_HEADER_COUNT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, symbol_count); + lnk_symbol_table_push(symtab, coff_section_header_count_symbol); + } + + lnk_section_table_assign_indices(st); + lnk_section_table_build_data(tp, st, machine); + lnk_section_table_assign_file_offsets(st); + lnk_patch_relocs(tp, symtab, st, 0); + + String8 res_obj = lnk_section_table_serialize(arena, st); + + lnk_section_table_release(&st); + lnk_symbol_table_release(&symtab); + + scratch_end(scratch); + ProfEnd(); + return res_obj; +} + +internal String8 +lnk_obj_from_res_file_list(TP_Context *tp, + Arena *arena, + LNK_SectionTable *st, + LNK_SymbolTable *symtab, + String8List res_data_list, + String8List res_path_list, + COFF_MachineType machine, + U32 time_stamp, + String8 work_dir, + PathStyle system_path_style, + String8 obj_name) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + Assert(res_data_list.node_count == res_path_list.node_count); + + // load res files + PE_ResourceDir *root_dir = push_array(scratch.arena, PE_ResourceDir, 1); + MD5Hash *res_hash_array = push_array(scratch.arena, MD5Hash, res_data_list.node_count); + U64 node_idx = 0; + for (String8Node *node = res_data_list.first; node != 0; node = node->next, node_idx += 1) { + res_hash_array[node_idx] = md5_hash_from_string(node->string); + pe_resource_dir_push_res_file(scratch.arena, root_dir, node->string); + } + + // 
convert res paths to stable paths + String8List stable_res_file_list = {0}; + for (String8Node *node = res_path_list.first; node != 0; node = node->next) { + String8 stable_res_path = lnk_make_full_path(scratch.arena, work_dir, system_path_style, node->string); + str8_list_push(scratch.arena, &stable_res_file_list, stable_res_path); + } + + // convert res to obj + OS_ProcessInfo *process_info = os_get_process_info(); + String8List exe_path_strs = {0}; + str8_list_push(scratch.arena, &exe_path_strs, process_info->binary_path); + String8 exe_path = str8_list_first(&exe_path_strs); + String8 res_obj = lnk_make_res_obj(tp, + arena, + root_dir, + machine, + time_stamp, + obj_name, + work_dir, + exe_path, + stable_res_file_list, + res_hash_array); + + scratch_end(scratch); + ProfEnd(); + return res_obj; +} + +//////////////////////////////// + +internal String8 +lnk_make_linker_coff_obj(TP_Context *tp, + Arena *arena, + COFF_TimeStamp time_stamp, + COFF_MachineType machine, + String8 cwd_path, + String8 exe_path, + String8 pdb_path, + String8 cmd_line, + String8 obj_name) +{ + Temp scratch = scratch_begin(&arena, 1); + + LNK_SymbolTable *symtab = lnk_symbol_table_alloc(); + LNK_SectionTable *st = lnk_section_table_alloc(0, 1, 1); + + LNK_Section *header_sect = lnk_section_table_push(st, str8_lit(".coffhdr"), 0); + LNK_Section *debug_s_sect = lnk_section_table_push(st, str8_lit(".debug$S"), LNK_DEBUG_SECTION_FLAGS); + + // TODO: remove! hack! 
+ header_sect->emit_header = 0; + + { + COFF_Header *coff_header = push_array(header_sect->arena, COFF_Header, 1); + coff_header->machine = machine; + coff_header->section_count = 0; + coff_header->time_stamp = time_stamp; + + LNK_Chunk *coff_header_chunk = lnk_section_push_chunk_raw(header_sect, header_sect->root, coff_header, sizeof(*coff_header), str8(0,0)); + lnk_section_push_reloc_undefined(header_sect, coff_header_chunk, LNK_Reloc_ADDR_32, OffsetOf(COFF_Header, section_count), str8_lit(LNK_COFF_SECT_HEADER_COUNT_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + } + + { + CV_SymbolList symbol_list = {0}; + symbol_list.signature = CV_Signature_C13; + + // S_OBJ + String8 obj_data = cv_make_obj_name(scratch.arena, obj_name, 0); + cv_symbol_list_push_data(scratch.arena, &symbol_list, CV_SymKind_OBJNAME, obj_data); + + // S_COMPILE3 + CV_Arch cv_arch = cv_arch_from_coff_machine(machine); + U64 ver_fe_major = 0; + U64 ver_fe_minor = 0; + U64 ver_fe_build = 0; + U64 ver_feqfe = 0; + U64 ver_major = 14; + U64 ver_minor = 36; + U64 ver_build = 32537; + U64 ver_qfe = 0; + String8 version_string = push_str8f(scratch.arena, "Epic Games Tools (R) RAD Linker"); + String8 comp3_data = cv_make_comp3(scratch.arena, 0, CV_Language_LINK, cv_arch, ver_fe_major, ver_fe_minor, ver_fe_build, ver_feqfe, ver_major, ver_minor, ver_build, ver_qfe, version_string); + cv_symbol_list_push_data(scratch.arena, &symbol_list, CV_SymKind_COMPILE3, comp3_data); + + // S_ENVBLOCK + String8List env_list = {0}; + str8_list_push(scratch.arena, &env_list, str8_lit("cwd")); + str8_list_push(scratch.arena, &env_list, cwd_path); + str8_list_push(scratch.arena, &env_list, str8_lit("exe")); + str8_list_push(scratch.arena, &env_list, exe_path); + str8_list_push(scratch.arena, &env_list, str8_lit("pdb")); + str8_list_push(scratch.arena, &env_list, pdb_path); + str8_list_push(scratch.arena, &env_list, str8_lit("cmd")); + str8_list_push(scratch.arena, &env_list, cmd_line); + str8_list_push(scratch.arena, 
&env_list, str8_lit("")); + str8_list_push(scratch.arena, &env_list, str8_lit("")); + String8 env_data = cv_make_envblock(scratch.arena, env_list); + cv_symbol_list_push_data(scratch.arena, &symbol_list, CV_SymKind_ENVBLOCK, env_data); + + // TODO: emit S_SECTION and S_COFFGROUP + // TODO: emit S_TRAMPOLINE + + String8List symbol_data_list = cv_data_from_symbol_list(scratch.arena, symbol_list, CV_SymbolAlign); + + CV_DebugS debug_s = {0}; + + String8List *symbols_list_ptr = cv_sub_section_ptr_from_debug_s(&debug_s, CV_C13SubSectionKind_Symbols); + *symbols_list_ptr = symbol_data_list; + + B32 include_sig = 1; + String8List debug_s_data_list = cv_data_c13_from_debug_s(scratch.arena, &debug_s, include_sig); + + // push debug info to section + String8 debug_s_data = str8_list_join(debug_s_sect->arena, &debug_s_data_list, 0); + lnk_section_push_chunk_data(debug_s_sect, debug_s_sect->root, debug_s_data, str8(0,0)); + } + + { + // register section symbols (after this point don't push new sections) + for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; sect_node = sect_node->next) { + LNK_Section *sect = §_node->data; + LNK_Symbol *sect_symbol = lnk_make_defined_symbol_chunk(symtab->arena, sect->name, LNK_DefinedSymbolVisibility_Internal, 0, sect->root, 0, 0, 0); + lnk_symbol_table_push(symtab, sect_symbol); + } + + LNK_Chunk *coff_section_header_array_chunk = lnk_section_push_chunk_list(header_sect, header_sect->root, str8(0,0)); + for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; sect_node = sect_node->next) { + if (sect_node == st->null_sect) continue; + if (!sect_node->data.emit_header) continue; + LNK_Section *sect = §_node->data; + + // init section header + COFF_SectionHeader *coff_sect_header = push_array(header_sect->arena, COFF_SectionHeader, 1); + Assert(sect->name.size <= sizeof(coff_sect_header->name)); + MemoryCopy(&coff_sect_header->name[0], sect->name.str, sect->name.size); + coff_sect_header->flags = sect->flags; + 
coff_sect_header->vsize = 0; // ignored + coff_sect_header->voff = 0; // ignored + coff_sect_header->fsize = 0; // relocated + coff_sect_header->foff = 0; // relocated + coff_sect_header->relocs_foff = 0; // relocated + coff_sect_header->lines_foff = 0; // obsolete + coff_sect_header->line_count = 0; // obsolete + coff_sect_header->reloc_count = 0; // relocated + + // push section header chunk + String8 sort_index = lnk_make_section_sort_index(header_sect->arena, str8(0,0), 0, sect->isect); + LNK_Chunk *coff_sect_header_chunk = lnk_section_push_chunk_raw(header_sect, coff_section_header_array_chunk, coff_sect_header, sizeof(*coff_sect_header), sort_index); + lnk_chunk_set_debugf(header_sect->arena, coff_sect_header_chunk, "%S", sect->name); + + // emit relocs for reloc fields + if (sect->reloc_list.count) { + String8 coff_reloc_symbol_name = push_str8f(scratch.arena, "%S.coff_reloc[]", sect->name); + String8 coff_reloc_count_symbol_name = push_str8f(scratch.arena, "%S.coff_reloc[].count", sect->name); + lnk_section_push_reloc_undefined(header_sect, coff_sect_header_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(COFF_SectionHeader, relocs_foff), coff_reloc_symbol_name, LNK_SymbolScopeFlag_Internal); + lnk_section_push_reloc_undefined(header_sect, coff_sect_header_chunk, LNK_Reloc_ADDR_32, OffsetOf(COFF_SectionHeader, reloc_count), coff_reloc_count_symbol_name, LNK_SymbolScopeFlag_Internal); + } + + // emit relocs for file fields + if (~sect->flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA) { + LNK_Symbol *sect_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, sect->name); + lnk_section_push_reloc(header_sect, coff_sect_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_32, OffsetOf(COFF_SectionHeader, fsize), sect_symbol); + lnk_section_push_reloc(header_sect, coff_sect_header_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(COFF_SectionHeader, foff), sect_symbol); + } + } + + // push section header count symbol + U64 symbol_count = 
coff_section_header_array_chunk->u.list->count; + LNK_Symbol *coff_section_header_count_symbol = lnk_make_defined_symbol_va(symtab->arena, str8_lit(LNK_COFF_SECT_HEADER_COUNT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, symbol_count); + lnk_symbol_table_push(symtab, coff_section_header_count_symbol); + } + + lnk_section_table_assign_indices(st); + lnk_section_table_build_data(tp, st, machine); + lnk_section_table_assign_file_offsets(st); + lnk_patch_relocs(tp, symtab, st, 0); + String8 coff_data = lnk_section_table_serialize(arena, st); + + lnk_section_table_release(&st); + + scratch_end(scratch); + return coff_data; +} + +//////////////////////////////// + +internal +THREAD_POOL_TASK_FUNC(lnk_load_thin_objs_task) +{ + LNK_InputObj *input = ((LNK_InputObj **)raw_task)[task_id]; + if (input->is_thin) { + input->data = os_data_from_file_path(arena, input->path); + input->has_disk_read_failed = (input->data.size == 0); + } +} + +internal String8 +lnk_get_lib_name(String8 path) +{ + static String8 LIB_EXT = str8_lit_comp(".LIB"); + + // strip path + String8 name = str8_skip_last_slash(path); + + // strip extension + String8 name_ext = str8_postfix(name, LIB_EXT.size); + if (str8_match(name_ext, LIB_EXT, StringMatchFlag_CaseInsensitive)) { + name = str8_chop(name, LIB_EXT.size); + } + + return name; +} + +internal B32 +lnk_is_lib_disallowed(HashTable *disallow_lib_ht, String8 path) +{ + String8 lib_name = lnk_get_lib_name(path); + return hash_table_search_path(disallow_lib_ht, lib_name) != 0; +} + +internal B32 +lnk_is_lib_loaded(HashTable *default_lib_ht, HashTable *loaded_lib_ht, LNK_InputSourceType input_source, String8 path) +{ + // when /defaultlib:path is comes from command line or obj directive + // check against lib name + if (input_source == LNK_InputSource_Default || + input_source == LNK_InputSource_Obj) { + String8 lib_name = str8_skip_last_slash(path); + if (hash_table_search_path(default_lib_ht, lib_name)) { + return 1; + } + } + return 
hash_table_search_path(loaded_lib_ht, path) != 0; +} + +internal void +lnk_push_disallow_lib(Arena *arena, HashTable *disallow_lib_ht, String8 path) +{ + String8 lib_name = lnk_get_lib_name(path); + hash_table_push_path_u64(arena, disallow_lib_ht, lib_name, 0); +} + +internal void +lnk_push_loaded_lib(Arena *arena, + HashTable *default_lib_ht, + HashTable *loaded_lib_ht, + String8 path) +{ + String8 lib_name = str8_skip_last_slash(path); + if (!hash_table_search_path(default_lib_ht, lib_name)) { + hash_table_push_path_u64(arena, default_lib_ht, lib_name, 0); + } + + if (!hash_table_search_path(loaded_lib_ht, path)) { + hash_table_push_string_u64(arena, loaded_lib_ht, path, 0); + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_lazy_initer) +{ + LNK_LazyIniter *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + for (U64 lib_idx = range.min; lib_idx < range.max; lib_idx += 1) { + LNK_Lib *lib = &task->lib_arr[lib_idx].data; + String8Node *name_node = lib->symbol_name_list.first; + for (U64 symbol_idx = 0; symbol_idx < lib->symbol_count; symbol_idx += 1, name_node = name_node->next) { + LNK_Symbol *symbol = &task->symbol_arr_arr[lib_idx][symbol_idx]; + lnk_init_lazy_symbol(symbol, name_node->string, lib, lib->member_off_arr[symbol_idx]); + } + } +} + +internal void +lnk_push_input_from_lazy(Arena *arena, PathStyle path_style, LNK_LazySymbol *lazy, LNK_InputImportList *input_import_list, LNK_InputObjList *input_obj_list) +{ + // parse member + COFF_ArchiveMember member_info = coff_read_archive_member(lazy->lib->data, lazy->member_offset); + COFF_DataType member_type = coff_data_type_from_data(member_info.data); + + switch (member_type) { + case COFF_DataType_IMPORT: { + LNK_InputImport *input = lnk_input_import_list_push(arena, input_import_list); + input->import_header = coff_archive_import_from_data(member_info.data); + } break; + case COFF_DataType_BIG_OBJ: + case COFF_DataType_OBJ: { + String8 obj_path = coff_read_archive_long_name(lazy->lib->long_names, 
member_info.header.name); + + // obj path in thin archive has slash appended which screws up + // file lookup on disk; it couble be there to enable paths to symbols + // but we don't use this feature + String8 slash = str8_lit("/"); + if (str8_ends_with(obj_path, slash, 0)) { + obj_path = str8_chop(obj_path, slash.size); + } + + // obj path in thin archive is relative to directory with archive + B32 is_thin = lazy->lib->type == COFF_Archive_Thin; + if (is_thin) { + Temp scratch = scratch_begin(&arena, 1); + String8List obj_path_list; MemoryZeroStruct(&obj_path_list); + str8_list_push(scratch.arena, &obj_path_list, str8_chop_last_slash(lazy->lib->path)); + str8_list_push(scratch.arena, &obj_path_list, obj_path); + obj_path = str8_path_list_join_by_style(arena, &obj_path_list, path_style); + scratch_end(scratch); + } + + LNK_InputObj *input = lnk_input_obj_list_push(arena, input_obj_list); + input->is_thin = is_thin; + input->dedup_id = push_str8f(arena, "%S(%S)", lazy->lib->path, obj_path); + input->path = obj_path; + input->data = member_info.data; + input->lib_path = lazy->lib->path; + } break; + } +} + +internal void +lnk_push_linker_symbols(LNK_SymbolTable *symtab, COFF_MachineType machine) +{ + // Emit __ImageBase symbol. + // + // This symbol is used with REL32 to compute delta from current IP + // to the image base. CRT uses this trick to get to HINSTANCE * without + // passing it around as a function argument. 
+ // + // 100h: lea rax, [rip + ffffff00h] ; -100h + LNK_Symbol *image_base = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit("__ImageBase"), LNK_DefinedSymbolVisibility_Extern, 0, g_null_chunk_ptr, 0, COFF_ComdatSelectType_ANY, 0); + lnk_symbol_table_push(symtab, image_base); + + { // load config symbols + if (machine == COFF_MachineType_X86) { + LNK_Symbol *safe_se_handler_table = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_SAFE_SE_HANDLER_TABLE_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Extern, 0, g_null_chunk_ptr, 0, COFF_ComdatSelectType_NODUPLICATES, 0); + LNK_Symbol *safe_se_handler_count = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_SAFE_SE_HANDLER_COUNT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Extern, 0, g_null_chunk_ptr, 0, COFF_ComdatSelectType_NODUPLICATES, 0); + lnk_symbol_table_push(symtab, safe_se_handler_table); + lnk_symbol_table_push(symtab, safe_se_handler_count); + } + + // TODO: investigate IMAGE_ENCLAVE_CONFIG 32/64 + LNK_Symbol *enclave_config = lnk_make_defined_symbol_va(symtab->arena, str8_lit(LNK_ENCLAVE_CONFIG_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Extern, 0, 0); + + LNK_Symbol *guard_flags = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_FLAGS_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_fids_table = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_FIDS_TABLE_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_fids_count = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_FIDS_COUNT_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_iat_table = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_IAT_TABLE_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_iat_count = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_IAT_COUNT_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_longjmp_table = lnk_make_defined_symbol(symtab->arena, 
str8_lit(LNK_GUARD_LONGJMP_TABLE_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_longjmp_count = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_LONGJMP_COUNT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_ehcont_table = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_EHCONT_TABLE_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Extern, 0); + LNK_Symbol *guard_ehcont_count = lnk_make_defined_symbol(symtab->arena, str8_lit(LNK_GUARD_EHCONT_COUNT_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Extern, 0); + + lnk_symbol_table_push(symtab, enclave_config); + lnk_symbol_table_push(symtab, guard_flags); + lnk_symbol_table_push(symtab, guard_fids_table); + lnk_symbol_table_push(symtab, guard_fids_count); + lnk_symbol_table_push(symtab, guard_iat_table); + lnk_symbol_table_push(symtab, guard_iat_count); + lnk_symbol_table_push(symtab, guard_longjmp_table); + lnk_symbol_table_push(symtab, guard_longjmp_count); + lnk_symbol_table_push(symtab, guard_ehcont_table); + lnk_symbol_table_push(symtab, guard_ehcont_count); + } +} + +//////////////////////////////// + +internal void +lnk_push_coff_symbols_from_data(Arena *arena, LNK_SymbolList *symbol_list, String8 data, LNK_SymbolArray obj_symbols) +{ + if (data.size % sizeof(U32)) { + // TODO: report invalid data size + } + U64 count = data.size / sizeof(U32); + for (U32 *ptr = (U32*)data.str, *opl = ptr + count; ptr < opl; ++ptr) { + U32 coff_symbol_idx = *ptr; + if (coff_symbol_idx >= obj_symbols.count) { + // TODO: report invalid symbol index + continue; + } + Assert(coff_symbol_idx < obj_symbols.count); + LNK_Symbol *symbol = obj_symbols.v + coff_symbol_idx; + lnk_symbol_list_push(arena, symbol_list, symbol); + } +} + +internal String8 +lnk_build_guard_data(Arena *arena, U64Array voff_arr, U64 stride) +{ + Assert(stride >= sizeof(U32)); + + // check for adjacent duplicates +#if DEBUG + for (U64 i = 1; i < voff_arr.count; ++i) { + Assert(voff_arr.v[i-1] != voff_arr.v[i]); + } 
+#endif + + U64 buffer_size = stride * voff_arr.count; + U8 *buffer = push_array(arena, U8, buffer_size); + for (U64 i = 0; i < voff_arr.count; ++i) { + U32 *voff_ptr = (U32*)(buffer + i * stride); + *voff_ptr = voff_arr.v[i]; + } + + String8 guard_data = str8(buffer, buffer_size); + return guard_data; +} + +internal void +lnk_push_pe_debug_data_directory(LNK_Section *sect, + LNK_Chunk *dir_array_chunk, + LNK_Symbol *data_symbol, + PE_DebugDirectoryType type, + COFF_TimeStamp time_stamp) +{ + // init directory + PE_DebugDirectory *dir = push_array(sect->arena, PE_DebugDirectory, 1); + dir->time_stamp = time_stamp; + dir->type = type; + //dir->voff = 0; // relocated through 'symbol' + //dir->foff = 0; // relocated through 'symbol' + //dir->size = 0; // relocated through 'symbol' + + // push chunk + LNK_Chunk *dir_entry_chunk = lnk_section_push_chunk_data(sect, dir_array_chunk, str8_struct(dir), str8(0,0)); + lnk_chunk_set_debugf(sect->arena, dir_entry_chunk, "DebugDirectory[%u]", type); + + // push debug directory relocs + lnk_section_push_reloc(sect, dir_entry_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_DebugDirectory, voff), data_symbol); + lnk_section_push_reloc(sect, dir_entry_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(PE_DebugDirectory, foff), data_symbol); + lnk_section_push_reloc(sect, dir_entry_chunk, LNK_Reloc_CHUNK_SIZE_VIRT_32, OffsetOf(PE_DebugDirectory, size), data_symbol); +} + +internal void +lnk_build_debug_pdb(LNK_SectionTable *st, + LNK_SymbolTable *symtab, + LNK_Section *sect, + LNK_Chunk *dir_array_chunk, + COFF_TimeStamp time_stamp, + OS_Guid guid, + U32 age, + String8 pdb_path) +{ + ProfBeginFunction(); + + // push chunks + String8 debug_pdb_data = pe_make_debug_header_pdb70(sect->arena, guid, age, pdb_path); + LNK_Chunk *debug_pdb_chunk = lnk_section_push_chunk_data(sect, sect->root, debug_pdb_data, str8(0, 0)); + lnk_chunk_set_debugf(sect->arena, debug_pdb_chunk, LNK_CV_HEADER_PDB70_SYMBOL_NAME); + + // push symbols + LNK_Symbol *debug_pdb_symbol = 
lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_CV_HEADER_PDB70_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, debug_pdb_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, debug_pdb_symbol); + + LNK_Symbol *guid_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_CV_HEADER_GUID_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, debug_pdb_chunk, OffsetOf(PE_CvHeaderPDB70, guid), 0, 0); + lnk_symbol_table_push(symtab, guid_symbol); + + // push debug directory + lnk_push_pe_debug_data_directory(sect, dir_array_chunk, debug_pdb_symbol, PE_DebugDirectoryType_CODEVIEW, time_stamp); + + ProfEnd(); +} + +internal void +lnk_build_debug_rdi(LNK_SectionTable *st, + LNK_SymbolTable *symtab, + LNK_Section *debug_sect, + LNK_Chunk *debug_dir_array_chunk, + COFF_TimeStamp time_stamp, + OS_Guid guid, + String8 rdi_path) +{ + ProfBeginFunction(); + + LNK_Section *rdi_sect = lnk_section_table_push(st, str8_lit(".raddbg"), COFF_SectionFlag_CNT_INITIALIZED_DATA|COFF_SectionFlag_MEM_READ); + + // push chunks + String8 debug_rdi = pe_make_debug_header_rdi(rdi_sect->arena, guid, rdi_path); + LNK_Chunk *debug_rdi_chunk = lnk_section_push_chunk_data(rdi_sect, rdi_sect->root, debug_rdi, str8(0,0)); lnk_chunk_set_debugf(rdi_sect->arena, debug_rdi_chunk, LNK_CV_HEADER_RDI_SYMBOL_NAME); + + // push symbols + LNK_Symbol *debug_rdi_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_CV_HEADER_RDI_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, debug_rdi_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, debug_rdi_symbol); + + LNK_Symbol *guid_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_CV_HEADER_GUID_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, debug_rdi_chunk, OffsetOf(PE_CvHeaderRDI, guid), 0, 0); + lnk_symbol_table_push(symtab, guid_symbol); + + // push debug directory + lnk_push_pe_debug_data_directory(debug_sect, debug_dir_array_chunk, debug_rdi_symbol, PE_DebugDirectoryType_CODEVIEW, time_stamp); + + 
ProfEnd(); +} + +internal void +lnk_build_guard_tables(TP_Context *tp, + LNK_SectionTable *st, + LNK_SymbolTable *symtab, + LNK_ExportTable *exptab, + LNK_ObjList obj_list, + COFF_MachineType machine, + String8 entry_point_name, + LNK_GuardFlags guard_flags, + B32 emit_suppress_flag) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0, 0); + + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + + enum { GUARD_FIDS, GUARD_IATS, GUARD_LJMP, GUARD_EHCONT, GUARD_COUNT }; + LNK_SymbolList guard_symbol_list_table[GUARD_COUNT]; MemoryZeroArray(guard_symbol_list_table); + + // collect symbols from objs + for (LNK_ObjNode *obj_node = obj_list.first; obj_node != NULL; obj_node = obj_node->next) { + LNK_Obj *obj = &obj_node->data; + COFF_FeatFlags feat_flags = lnk_obj_get_features(obj); + B32 has_guard_flags = (feat_flags & COFF_FeatFlag_GUARD_CF) || (feat_flags & COFF_FeatFlag_GUARD_EH_CONT); + if (has_guard_flags) { + LNK_SymbolArray symbol_arr = lnk_symbol_array_from_list(scratch.arena, obj->symbol_list); + if (guard_flags & LNK_Guard_Cf) { + LNK_ChunkList gfids_list = lnk_obj_search_chunks(scratch.arena, obj, str8_lit(".gfids"), str8(0,0), 1); + for (LNK_ChunkNode *node = gfids_list.first; node != 0; node = node->next) { + Assert(node->data->type == LNK_Chunk_Leaf); + lnk_push_coff_symbols_from_data(scratch.arena, &guard_symbol_list_table[GUARD_FIDS], node->data->u.leaf, symbol_arr); + } + LNK_ChunkList giats_list = lnk_obj_search_chunks(scratch.arena, obj, str8_lit(".giats"), str8(0,0), 1); + for (LNK_ChunkNode *node = giats_list.first; node != 0; node = node->next) { + Assert(node->data->type == LNK_Chunk_Leaf); + lnk_push_coff_symbols_from_data(scratch.arena, &guard_symbol_list_table[GUARD_IATS], node->data->u.leaf, symbol_arr); + } + } + if (guard_flags & LNK_Guard_LongJmp) { + LNK_ChunkList gljmp_list = lnk_obj_search_chunks(scratch.arena, obj, str8_lit(".gljmp"), str8(0,0), 1); + for (LNK_ChunkNode *node = 
gljmp_list.first; node != 0; node = node->next) { + Assert(node->data->type == LNK_Chunk_Leaf); + lnk_push_coff_symbols_from_data(scratch.arena, &guard_symbol_list_table[GUARD_LJMP], node->data->u.leaf, symbol_arr); + } + } + if (guard_flags & LNK_Guard_EhCont) { + LNK_ChunkList gehcont_list = lnk_obj_search_chunks(scratch.arena, obj, str8_lit(".gehcont"), str8(0,0), 1); + for (LNK_ChunkNode *node = gehcont_list.first; node != 0; node = node->next) { + Assert(node->data->type == LNK_Chunk_Leaf); + lnk_push_coff_symbols_from_data(scratch.arena, &guard_symbol_list_table[GUARD_EHCONT], node->data->u.leaf, symbol_arr); + } + } + } else { + // use relocation data in code sections to get function symbols + for (U64 isect = 0; isect < obj->sect_count; ++isect) { + LNK_Chunk *chunk = &obj->chunk_arr[isect]; + if (!chunk) { + continue; + } + if (lnk_chunk_is_discarded(chunk)) { + continue; + } + if (~chunk->flags & COFF_SectionFlag_CNT_CODE) { + continue; + } + Assert(chunk->type == LNK_Chunk_Leaf); + for (LNK_Reloc *reloc = obj->sect_reloc_list_arr[isect].first; reloc != 0; reloc = reloc->next) { + LNK_Symbol *symbol = lnk_resolve_symbol(symtab, reloc->symbol); + if (!LNK_Symbol_IsDefined(symbol->type)) { + continue; + } + LNK_DefinedSymbol *defined_symbol = &symbol->u.defined; + if (~defined_symbol->flags & LNK_DefinedSymbolFlag_IsFunc) { + continue; + } + LNK_Chunk *symbol_chunk = defined_symbol->u.chunk; + if (!symbol_chunk) { + continue; + } + if (symbol_chunk->type != LNK_Chunk_Leaf) { + continue; + } + if (~symbol_chunk->flags & COFF_SectionFlag_CNT_CODE) { + continue; + } + lnk_symbol_list_push(scratch.arena, &guard_symbol_list_table[GUARD_FIDS], symbol); + } + } + } + } + + // entry point + LNK_Symbol *entry_point_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, entry_point_name); + lnk_symbol_list_push(scratch.arena, &guard_symbol_list_table[GUARD_FIDS], entry_point_symbol); + + // push exports + for (LNK_Export *exp = 
exptab->name_export_list.first; exp != NULL; exp = exp->next) { + lnk_symbol_list_push(scratch.arena, &guard_symbol_list_table[GUARD_FIDS], exp->symbol); + } + + // TODO: push noname exports + + // push thunks + LNK_SymbolScopeIndex scope_array[] = { LNK_SymbolScopeIndex_Defined, LNK_SymbolScopeIndex_Internal }; + for (U64 iscope = 0; iscope < ArrayCount(scope_array); ++iscope) { + LNK_SymbolScopeIndex scope = scope_array[iscope]; + for (U64 ibucket = 0; ibucket < symtab->bucket_count[scope]; ++ibucket) { + for (LNK_SymbolNode *symbol_node = symtab->buckets[scope][ibucket].first; + symbol_node != NULL; + symbol_node = symbol_node->next) { + LNK_Symbol *symbol = symbol_node->data; + if (!LNK_Symbol_IsDefined(symbol->type)) continue; + LNK_DefinedSymbol *defined_symbol = &symbol->u.defined; + if (~defined_symbol->flags & LNK_DefinedSymbolFlag_IsThunk) continue; + lnk_symbol_list_push(scratch.arena, &guard_symbol_list_table[GUARD_FIDS], symbol); + } + } + } + + // build section data + lnk_section_table_build_data(tp, st, machine); + lnk_section_table_assign_virtual_offsets(st); + + // compute symbols virtual offsets + U64Array guard_voff_arr_table[GUARD_COUNT]; + for (U64 i = 0; i < ArrayCount(guard_symbol_list_table); ++i) { + U64List voff_list; MemoryZeroStruct(&voff_list); + LNK_SymbolList symbol_list = guard_symbol_list_table[i]; + for (LNK_SymbolNode *symbol_node = symbol_list.first; symbol_node != NULL; symbol_node = symbol_node->next) { + LNK_Symbol *symbol = lnk_resolve_symbol(symtab, symbol_node->data); + if (!LNK_Symbol_IsDefined(symbol->type)) { + continue; + } + LNK_DefinedSymbol *defined_symbol = &symbol->u.defined; + LNK_Chunk *chunk = defined_symbol->u.chunk; + if (!chunk) { + continue; + } + if (lnk_chunk_is_discarded(chunk)) { + continue; + } + U64 chunk_voff = lnk_virt_off_from_chunk_ref(sect_id_map, chunk->ref); + U64 symbol_voff = chunk_voff + defined_symbol->u.chunk_offset; + Assert(symbol_voff != 0); + u64_list_push(scratch.arena, &voff_list, 
symbol_voff); + } + U64Array voff_arr = u64_array_from_list(scratch.arena, &voff_list); + radsort(voff_arr.v, voff_arr.count, u64_compar_is_before); + guard_voff_arr_table[i] = u64_array_remove_duplicates(scratch.arena, voff_arr); + } + + // push guard sections + static struct { + char *name; + char *symbol; + int flags; + } sect_layout[] = { + { ".gfids", LNK_GFIDS_SYMBOL_NAME, LNK_GFIDS_SECTION_FLAGS }, + { ".giats", LNK_GIATS_SYMBOL_NAME, LNK_GIATS_SECTION_FLAGS }, + { ".gljmp", LNK_GLJMP_SYMBOL_NAME, LNK_GLJMP_SECTION_FLAGS }, + { ".gehcont", LNK_GEHCONT_SYMBOL_NAME, LNK_GEHCONT_SECTION_FLAGS }, + }; + for (U64 i = 0; i < ArrayCount(sect_layout); ++i) { + LNK_Section *sect = lnk_section_table_push(st, str8_cstring(sect_layout[i].name), sect_layout[i].flags); + LNK_Symbol *symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(sect_layout[i].symbol), LNK_DefinedSymbolVisibility_Internal, 0, sect->root, 0, 0, 0); + lnk_symbol_table_push(symtab, symbol); + } + + // TODO: emit table for SEH on X86 + if (machine == COFF_MachineType_X86) { + lnk_not_implemented("__safe_se_handler_table"); + lnk_not_implemented("__safe_se_handler_count"); + } + + LNK_Symbol *gfids_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_GFIDS_SYMBOL_NAME)); + LNK_Symbol *giats_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_GIATS_SYMBOL_NAME)); + LNK_Symbol *gljmp_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_GLJMP_SYMBOL_NAME)); + LNK_Symbol *gehcont_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_GEHCONT_SYMBOL_NAME)); + + LNK_Section *gfids_sect = lnk_section_table_search_id(st, gfids_symbol->u.defined.u.chunk->ref.sect_id); + LNK_Section *giats_sect = lnk_section_table_search_id(st, giats_symbol->u.defined.u.chunk->ref.sect_id); + LNK_Section *gljmp_sect = lnk_section_table_search_id(st, gljmp_symbol->u.defined.u.chunk->ref.sect_id); 
+ LNK_Section *gehcont_sect = lnk_section_table_search_id(st, gehcont_symbol->u.defined.u.chunk->ref.sect_id); + + LNK_Chunk *gfids_array_chunk = gfids_sect->root; + LNK_Chunk *giats_array_chunk = giats_sect->root; + LNK_Chunk *gljmp_array_chunk = gljmp_sect->root; + LNK_Chunk *gehcont_array_chunk = gehcont_sect->root; + + // first 4 bytes are call's destination virtual offset + U64 entry_stride = sizeof(U32); + if (emit_suppress_flag) { + // 4th byte tells kernel what to do when destination VA is not in the bitmap. + // If byte is 1 exception is suppressed and program keeps running. + // If zero then exception is raised with nt!_KiRaiseSecurityCheckFailure(FAST_FAIL_GUARD_ICALL_CHECK_FAILURE) and exception code 0xA. + entry_stride = 5; + } + + // make guard data from virtual offsets + String8 gfids_data = lnk_build_guard_data(gfids_sect->arena, guard_voff_arr_table[GUARD_FIDS], entry_stride); + String8 giats_data = lnk_build_guard_data(giats_sect->arena, guard_voff_arr_table[GUARD_IATS], entry_stride); + String8 gljmp_data = lnk_build_guard_data(gljmp_sect->arena, guard_voff_arr_table[GUARD_LJMP], entry_stride); + String8 gehcont_data = lnk_build_guard_data(gehcont_sect->arena, guard_voff_arr_table[GUARD_EHCONT], entry_stride); + + // push guard data + lnk_section_push_chunk_data(gfids_sect, gfids_array_chunk, gfids_data, str8(0,0)); + lnk_section_push_chunk_data(giats_sect, giats_array_chunk, giats_data, str8(0,0)); + lnk_section_push_chunk_data(gljmp_sect, gljmp_array_chunk, gljmp_data, str8(0,0)); + lnk_section_push_chunk_data(gehcont_sect, gehcont_array_chunk, gehcont_data, str8(0,0)); + + LNK_Symbol *gflags_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_FLAGS_SYMBOL_NAME)); + LNK_Symbol *gfids_table_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_FIDS_TABLE_SYMBOL_NAME)); + LNK_Symbol *gfids_count_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, 
str8_lit(LNK_GUARD_FIDS_COUNT_SYMBOL_NAME)); + LNK_Symbol *giats_table_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_IAT_TABLE_SYMBOL_NAME)); + LNK_Symbol *giats_count_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_IAT_COUNT_SYMBOL_NAME)); + LNK_Symbol *gljmp_table_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_LONGJMP_TABLE_SYMBOL_NAME)); + LNK_Symbol *gljmp_count_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_LONGJMP_COUNT_SYMBOL_NAME)); + LNK_Symbol *gehcont_table_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_EHCONT_TABLE_SYMBOL_NAME)); + LNK_Symbol *gehcont_count_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, str8_lit(LNK_GUARD_EHCONT_COUNT_SYMBOL_NAME)); + + LNK_DefinedSymbol *gflags_def = &gflags_symbol->u.defined; + LNK_DefinedSymbol *gfids_table_def = &gfids_table_symbol->u.defined; + LNK_DefinedSymbol *gfids_count_def = &gfids_count_symbol->u.defined; + LNK_DefinedSymbol *giats_table_def = &giats_table_symbol->u.defined; + LNK_DefinedSymbol *giats_count_def = &giats_count_symbol->u.defined; + LNK_DefinedSymbol *gljmp_table_def = &gljmp_table_symbol->u.defined; + LNK_DefinedSymbol *gljmp_count_def = &gljmp_count_symbol->u.defined; + LNK_DefinedSymbol *gehcont_table_def = &gehcont_table_symbol->u.defined; + LNK_DefinedSymbol *gehcont_count_def = &gehcont_count_symbol->u.defined; + + // guard flags + gflags_def->value_type = LNK_DefinedSymbolValue_VA; + gflags_def->u.va = PE_LoadConfigGuardFlags_CF_INSTRUMENTED; + if ((guard_flags & LNK_Guard_Cf)) { + gflags_def->u.va |= PE_LoadConfigGuardFlags_CF_FUNCTION_TABLE_PRESENT; + } + if ((guard_flags & LNK_Guard_LongJmp) && guard_voff_arr_table[GUARD_LJMP].count) { + gflags_def->u.va |= PE_LoadConfigGuardFlags_CF_LONGJUMP_TABLE_PRESENT; + } + if ((guard_flags & LNK_Guard_EhCont) && 
guard_voff_arr_table[GUARD_EHCONT].count) { + gflags_def->u.va |= PE_LoadConfigGuardFlags_EH_CONTINUATION_TABLE_PRESENT; + } + { + LNK_Section *didat_sect = lnk_section_table_search(st, str8_lit(".didat")); + if (didat_sect) { + gflags_def->u.va |= PE_LoadConfigGuardFlags_DELAYLOAD_IAT_IN_ITS_OWN_SECTION; + } + } + if (entry_stride > sizeof(U32)) { + U64 size_bit = (entry_stride - 5); + if (emit_suppress_flag) { + gflags_def->u.va |= PE_LoadConfigGuardFlags_CF_EXPORT_SUPPRESSION_INFO_PRESENT; + } + gflags_def->u.va |= (1 << size_bit) << PE_LoadConfigGuardFlags_CF_FUNCTION_TABLE_SIZE_SHIFT; + } + + // gfids + if (guard_voff_arr_table[GUARD_FIDS].count) { + gfids_table_def->value_type = LNK_DefinedSymbolValue_Chunk; + gfids_table_def->u.chunk = gfids_array_chunk; + } + gfids_count_def->value_type = LNK_DefinedSymbolValue_VA; + gfids_count_def->u.va = guard_voff_arr_table[GUARD_FIDS].count; + + // giats + if (guard_voff_arr_table[GUARD_IATS].count) { + giats_table_def->value_type = LNK_DefinedSymbolValue_Chunk; + giats_table_def->u.chunk = giats_array_chunk; + } + giats_count_def->value_type = LNK_DefinedSymbolValue_VA; + giats_count_def->u.va = guard_voff_arr_table[GUARD_IATS].count; + + // gljmp + if (guard_voff_arr_table[GUARD_LJMP].count) { + gljmp_table_def->value_type = LNK_DefinedSymbolValue_Chunk; + gljmp_table_def->u.chunk = gljmp_array_chunk; + } + gljmp_count_def->value_type = LNK_DefinedSymbolValue_VA; + gljmp_count_def->u.va = guard_voff_arr_table[GUARD_LJMP].count; + + // gehcont + if (guard_voff_arr_table[GUARD_EHCONT].count) { + gehcont_table_def->value_type = LNK_DefinedSymbolValue_Chunk; + gehcont_table_def->u.chunk = gehcont_array_chunk; + } + gehcont_count_def->value_type = LNK_DefinedSymbolValue_VA; + gehcont_count_def->u.va = guard_voff_arr_table[GUARD_EHCONT].count; + + scratch_end(scratch); + ProfEnd(); +} + +internal void +lnk_emit_base_reloc_info(Arena *arena, + LNK_Section **sect_id_map, + U64 page_size, + HashTable *page_ht, + 
LNK_BaseRelocPageList *page_list, + LNK_Reloc *reloc) +{ + B32 is_addr = (reloc->type == LNK_Reloc_ADDR_64 || reloc->type == LNK_Reloc_ADDR_32); + if (is_addr) { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + U64 page_voff = AlignDownPow2(reloc_voff, page_size); + + LNK_BaseRelocPageNode *page; + { + KeyValuePair *is_page_present = hash_table_search_u64(page_ht, page_voff); + if (is_page_present) { + page = is_page_present->value_raw; + } else { + // fill out page + page = push_array(arena, LNK_BaseRelocPageNode, 1); + page->v.voff = page_voff; + + // push page + SLLQueuePush(page_list->first, page_list->last, page); + page_list->count += 1; + + // register page voff + hash_table_push_u64_raw(arena, page_ht, page_voff, page); + } + } + + u64_list_push(arena, &page->v.entries, reloc_voff); + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_emit_base_relocs_from_reloc_array_task) +{ + LNK_BaseRelocTask *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + LNK_BaseRelocPageList *page_list = &task->list_arr[task_id]; + HashTable *page_ht = task->page_ht_arr[task_id]; + + for (U64 reloc_idx = range.min; reloc_idx < range.max; reloc_idx += 1) { + LNK_Reloc *reloc = task->reloc_arr[reloc_idx]; + lnk_emit_base_reloc_info(arena, task->sect_id_map, task->page_size, page_ht, page_list, reloc); + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_emit_base_relocs_from_objs_task) +{ + ProfBeginFunction(); + LNK_ObjBaseRelocTask *task = raw_task; + LNK_BaseRelocPageList *page_list = &task->list_arr[task_id]; + HashTable *page_ht = task->page_ht_arr[task_id]; + Rng1U64 range = task->ranges[task_id]; + + for (U64 obj_idx = range.min; obj_idx < range.max; ++obj_idx) { + LNK_Obj *obj = task->obj_arr[obj_idx]; + for (U64 sect_idx = 0; sect_idx < obj->sect_count; sect_idx += 1) { + B32 is_live = !lnk_chunk_is_discarded(&obj->chunk_arr[sect_idx]); + if (is_live) { + LNK_RelocList reloc_list = obj->sect_reloc_list_arr[sect_idx]; + for (LNK_Reloc *reloc = 
reloc_list.first; reloc != 0; reloc = reloc->next) { + lnk_emit_base_reloc_info(arena, task->sect_id_map, task->page_size, page_ht, page_list, reloc); + } + } + } + } + ProfEnd(); +} + +internal LNK_BaseRelocPageArray +lnk_base_reloc_page_array_from_list(Arena* arena, LNK_BaseRelocPageList list) +{ + LNK_BaseRelocPageArray result = {0}; + result.count = 0; + result.v = push_array_no_zero(arena, LNK_BaseRelocPage, list.count); + for (LNK_BaseRelocPageNode* n = list.first; n != 0; n = n->next) { + result.v[result.count++] = n->v; + } + Assert(result.count == list.count); + return result; +} + +int +lnk_base_reloc_page_is_before(void *raw_a, void *raw_b) +{ + LNK_BaseRelocPage* a = raw_a; + LNK_BaseRelocPage* b = raw_b; + return a->voff < b->voff; +} + +internal void +lnk_base_reloc_page_array_sort(LNK_BaseRelocPageArray arr) +{ + ProfBeginFunction(); + radsort(arr.v, arr.count, lnk_base_reloc_page_is_before); + ProfEnd(); +} + +internal void +lnk_build_base_relocs(TP_Context *tp, + TP_Arena *tp_arena, + LNK_SectionTable *st, + LNK_SymbolTable *symtab, + COFF_MachineType machine, + U64 page_size, + LNK_ObjList obj_list) +{ + ProfBeginFunction(); + + TP_Temp temp = tp_temp_begin(tp_arena); + + lnk_section_table_build_data(tp, st, machine); + lnk_section_table_assign_virtual_offsets(st); + + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(tp_arena->v[0], st); + + LNK_BaseRelocPageList *page_list_arr = push_array(tp_arena->v[0], LNK_BaseRelocPageList, tp->worker_count); + HashTable **page_ht_arr = push_array_no_zero(tp_arena->v[0], HashTable *, tp->worker_count); + for (U64 i = 0; i < tp->worker_count; ++i) { + page_ht_arr[i] = hash_table_init(tp_arena->v[0], 1024); + } + + // emit pages from relocs defined in section table + ProfBegin("Emit Relocs From Section Table"); + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = sect_node->next) { + LNK_BaseRelocTask task = {0}; + task.page_size = page_size; + task.sect_id_map = 
sect_id_map; + task.list_arr = page_list_arr; + task.page_ht_arr = page_ht_arr; + task.reloc_arr = lnk_reloc_array_from_list(tp_arena->v[0], sect_node->data.reloc_list); + task.range_arr = tp_divide_work(tp_arena->v[0], sect_node->data.reloc_list.count, tp->worker_count); + tp_for_parallel(tp, tp_arena, tp->worker_count, lnk_emit_base_relocs_from_reloc_array_task, &task); + } + ProfEnd(); + + // emit pages from relocs defined in objs + ProfBegin("Emit Relocs From Objs"); + { + LNK_ObjBaseRelocTask task = {0}; + task.ranges = tp_divide_work(tp_arena->v[0], obj_list.count, tp->worker_count); + task.page_size = page_size; + task.sect_id_map = sect_id_map; + task.page_ht_arr = page_ht_arr; + task.list_arr = page_list_arr; + task.obj_arr = lnk_obj_arr_from_list(tp_arena->v[0], obj_list); + tp_for_parallel(tp, tp_arena, tp->worker_count, lnk_emit_base_relocs_from_objs_task, &task); + } + ProfEnd(); + + // merge page lists + + ProfBegin("Merge Worker Page Lists"); + + HashTable *main_ht = page_ht_arr[0]; + LNK_BaseRelocPageList *main_page_list = &page_list_arr[0]; + + for (U64 list_idx = 1; list_idx < tp->worker_count; ++list_idx) { + LNK_BaseRelocPageList src = page_list_arr[list_idx]; + + for (LNK_BaseRelocPageNode *src_page = src.first, *src_next; src_page != 0; src_page = src_next) { + src_next = src_page->next; + + KeyValuePair *is_page_present = hash_table_search_u64(main_ht, src_page->v.voff); + if (is_page_present) { + // page exists concat voffs + LNK_BaseRelocPageNode *page = is_page_present->value_raw; + Assert(page != src_page); + u64_list_concat_in_place(&page->v.entries, &src_page->v.entries); + } else { + // push page to main list + SLLQueuePush(main_page_list->first, main_page_list->last, src_page); + main_page_list->count += 1; + + // store lookup voff + hash_table_push_u64_raw(tp_arena->v[0], main_ht, src_page->v.voff, src_page); + } + } + } + + ProfEnd(); + + // push storage for section + LNK_Section *base_reloc_sect = lnk_section_table_push(st, 
str8_lit(".reloc"), LNK_RELOC_SECTION_FLAGS); + LNK_Symbol *base_reloc_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_BASE_RELOC_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, base_reloc_sect->root, 0, 0, 0); + lnk_symbol_table_push(symtab, base_reloc_symbol); + + ProfBegin("Page List -> Array"); + LNK_BaseRelocPageArray page_arr = lnk_base_reloc_page_array_from_list(base_reloc_sect->arena, *main_page_list); + ProfEnd(); + + ProfBegin("Sort Pages on VOFF"); + lnk_base_reloc_page_array_sort(page_arr); + ProfEnd(); + + HashTable *voff_ht = hash_table_init(tp_arena->v[0], page_size); + + ProfBegin("Serialize Pages"); + for (U64 page_idx = 0; page_idx < page_arr.count; ++page_idx) { + LNK_BaseRelocPage *page = &page_arr.v[page_idx]; + + // push buffer + U64 buf_align = sizeof(U32); + U64 buf_size = AlignPow2(sizeof(U32)*2 + sizeof(U16)*page->entries.count, buf_align); + U8 *buf = push_array_no_zero(base_reloc_sect->arena, U8, buf_size); + + // setup pointers into buffer + U32 *page_voff_ptr = (U32*)buf; + U32 *block_size_ptr = page_voff_ptr + 1; + U16 *reloc_arr_base = (U16*)(block_size_ptr + 1); + U16 *reloc_arr_ptr = reloc_arr_base; + + // write reloc array + for (U64Node *i = page->entries.first; i != 0; i = i->next) { + // was base reloc entry made? 
+ if (hash_table_search_u64(voff_ht, i->data)) { + continue; + } + hash_table_push_u64_u64(tp_arena->v[0], voff_ht, i->data, 0); + + // write entry + U64 rel_off = i->data - page->voff; + Assert(rel_off <= page_size); + *reloc_arr_ptr++ = PE_BaseRelocMake(PE_BaseRelocKind_DIR64, rel_off); + } + + // write pad + U64 pad_reloc_count = AlignPadPow2(page->entries.count, sizeof(reloc_arr_ptr[0])); + MemoryZeroTyped(reloc_arr_ptr, pad_reloc_count); // fill pad with PE_BaseRelocKind_ABSOLUTE + reloc_arr_ptr += pad_reloc_count; + + // compute block size + U64 reloc_arr_size = (U64)((U8*)reloc_arr_ptr - (U8*)reloc_arr_base); + U64 block_size = sizeof(*page_voff_ptr) + sizeof(*block_size_ptr) + reloc_arr_size; + + // write header + *page_voff_ptr = safe_cast_u32(page->voff); + *block_size_ptr = safe_cast_u32(block_size); + Assert(*block_size_ptr <= buf_size); + + // push page chunk + lnk_section_push_chunk_raw(base_reloc_sect, base_reloc_sect->root, buf, block_size, str8(0,0)); + + // purge voffs for next run + hash_table_purge(voff_ht); + } + ProfEnd(); + + tp_temp_end(temp); + ProfEnd(); +} + +internal LNK_Chunk * +lnk_build_dos_header(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent_chunk) +{ + U32 dos_stub_size = sizeof(PE_DosHeader) + pe_dos_program.size; + + PE_DosHeader *dos_header = push_array(header_sect->arena, PE_DosHeader, 1); + dos_header->magic = PE_DOS_MAGIC; + dos_header->last_page_size = dos_stub_size % 512; + dos_header->page_count = CeilIntegerDiv(dos_stub_size, 512); + dos_header->paragraph_header_size = sizeof(PE_DosHeader) / 16; + dos_header->min_paragraph = 0; + dos_header->max_paragraph = 0; + dos_header->init_ss = 0; + dos_header->init_sp = 0; + dos_header->checksum = 0; + dos_header->init_ip = 0xFFFF; + dos_header->init_cs = 0; + dos_header->reloc_table_file_off = sizeof(PE_DosHeader); + dos_header->overlay_number = 0; + MemoryZeroStruct(dos_header->reserved); + dos_header->oem_id = 0; + dos_header->oem_info = 0; + 
MemoryZeroArray(dos_header->reserved2); + dos_header->coff_file_offset = 0; // :coff_file_offset + + LNK_Chunk *dos_chunk = lnk_section_push_chunk_list(header_sect, parent_chunk, str8(0,0)); + LNK_Chunk *dos_header_chunk = lnk_section_push_chunk_raw(header_sect, dos_chunk, dos_header, sizeof(*dos_header), str8(0,0)); + LNK_Chunk *dos_program_chunk = lnk_section_push_chunk_data(header_sect, dos_chunk, pe_dos_program, str8(0,0)); + lnk_chunk_set_debugf(header_sect->arena, dos_chunk, "DOS Header & Stub"); + lnk_chunk_set_debugf(header_sect->arena, dos_header_chunk, LNK_DOS_HEADER_SYMBOL_NAME); + lnk_chunk_set_debugf(header_sect->arena, dos_program_chunk, LNK_DOS_PROGRAM_SYMBOL_NAME); + + LNK_Symbol *dos_header_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_DOS_HEADER_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, dos_header_chunk, 0, 0, 0); + LNK_Symbol *dos_program_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_DOS_PROGRAM_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, dos_program_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, dos_header_symbol); + lnk_symbol_table_push(symtab, dos_program_symbol); + + // :coff_file_offset + lnk_section_push_reloc_undefined(header_sect, dos_header_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(PE_DosHeader, coff_file_offset), str8_lit(LNK_NT_HEADERS_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + + return dos_chunk; +} + +internal LNK_Chunk * +lnk_build_pe_magic(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent) +{ + U32 *pe_magic = push_array_no_zero(header_sect->arena, U32, 1); + *pe_magic = PE_MAGIC; + + LNK_Chunk *pe_magic_chunk = lnk_section_push_chunk_raw(header_sect, parent, pe_magic, sizeof(*pe_magic), str8(0,0)); + lnk_chunk_set_debugf(header_sect->arena, pe_magic_chunk, LNK_PE_MAGIC_SYMBOL_NAME); + + LNK_Symbol *pe_magic_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_PE_MAGIC_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, 
pe_magic_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, pe_magic_symbol); + + return pe_magic_chunk; +} + +internal LNK_Chunk * +lnk_build_coff_file_header(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent, + COFF_MachineType machine, COFF_TimeStamp time_stamp, PE_ImageFileCharacteristics file_characteristics) +{ + COFF_Header *file_header = push_array_no_zero(header_sect->arena, COFF_Header, 1); + file_header->machine = machine; + file_header->time_stamp = time_stamp; + file_header->symbol_table_foff = 0; + file_header->symbol_count = 0; + file_header->section_count = 0; // :section_count + file_header->optional_header_size = 0; // :optional_header_size + file_header->flags = file_characteristics; + + LNK_Chunk *file_header_chunk = lnk_section_push_chunk_raw(header_sect, parent, file_header, sizeof(*file_header), str8(0,0)); + lnk_chunk_set_debugf(header_sect->arena, file_header_chunk, LNK_COFF_HEADER_SYMBOL_NAME); + + LNK_Symbol *file_header_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_COFF_HEADER_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, file_header_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, file_header_symbol); + + // :section_count + lnk_section_push_reloc_undefined(header_sect, file_header_chunk, LNK_Reloc_ADDR_16, OffsetOf(COFF_Header, section_count), str8_lit(LNK_COFF_SECT_HEADER_COUNT_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + + // :optional_header_size + lnk_section_push_reloc_undefined(header_sect, file_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_16, OffsetOf(COFF_Header, optional_header_size), str8_lit(LNK_PE_OPT_HEADER_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + lnk_section_push_reloc_undefined(header_sect, file_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_16, OffsetOf(COFF_Header, optional_header_size), str8_lit(LNK_PE_DIRECTORY_ARRAY_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + + return file_header_chunk; +} + +internal LNK_Chunk * +lnk_build_pe_optional_header_x64(LNK_SymbolTable *symtab, 
+ LNK_Section *header_sect, + LNK_Chunk *parent, + COFF_MachineType machine, + U64 base_addr, + U64 sect_align, + U64 file_align, + Version linker_ver, + Version os_ver, + Version image_ver, + Version subsystem_ver, + PE_WindowsSubsystem subsystem, + PE_DllCharacteristics dll_characteristics, + U64 stack_reserve, + U64 stack_commit, + U64 heap_reserve, + U64 heap_commit, + String8 entry_point_name, + LNK_SectionArray sect_arr) +{ + PE_OptionalHeader32Plus *opt_header = push_array_no_zero(header_sect->arena, PE_OptionalHeader32Plus, 1); + opt_header->magic = PE_PE32PLUS_MAGIC; + opt_header->major_linker_version = linker_ver.major; + opt_header->minor_linker_version = linker_ver.minor; + opt_header->sizeof_code = 0; // :sizeof_code + opt_header->sizeof_inited_data = 0; // :sizeof_inited_data + opt_header->sizeof_uninited_data = 0; // :sizeof_uninited_data + opt_header->entry_point_va = 0; // :entry_point_va + opt_header->code_base = 0; // :code_base + opt_header->image_base = base_addr; + opt_header->section_alignment = sect_align; + opt_header->file_alignment = file_align; + opt_header->major_os_ver = os_ver.major; + opt_header->minor_os_ver = os_ver.minor; + opt_header->major_img_ver = image_ver.major; + opt_header->minor_img_ver = image_ver.minor; + opt_header->major_subsystem_ver = subsystem_ver.major; + opt_header->minor_subsystem_ver = subsystem_ver.minor; + opt_header->win32_version_value = 0; // MSVC writes zero + opt_header->sizeof_image = 0; // :sizeof_image + opt_header->sizeof_headers = 0; // :sizeof_headers + opt_header->check_sum = 0; // :check_sum + opt_header->subsystem = subsystem; + opt_header->dll_characteristics = dll_characteristics; + opt_header->sizeof_stack_reserve = stack_reserve; + opt_header->sizeof_stack_commit = stack_commit; + opt_header->sizeof_heap_reserve = heap_reserve; + opt_header->sizeof_heap_commit = heap_commit; + opt_header->loader_flags = 0; // for dynamic linker, always zero + opt_header->data_dir_count = 0; // 
:data_dir_count + + // push chunk + LNK_Chunk *opt_header_chunk = lnk_section_push_chunk_raw(header_sect, parent, opt_header, sizeof(*opt_header), str8(0,0)); + lnk_chunk_set_debugf(header_sect->arena, opt_header_chunk, LNK_PE_OPT_HEADER_SYMBOL_NAME); + + // define optional header symbol + LNK_Symbol *opt_header_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_PE_OPT_HEADER_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, opt_header_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, opt_header_symbol); + + // :entry_point_va + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_OptionalHeader32Plus, entry_point_va), entry_point_name, LNK_SymbolScopeFlag_Main); + + // :code_base + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_OptionalHeader32Plus, code_base), str8_lit(LNK_TEXT_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + + // every section with layout pushes one size reloc per content flag it carries; last_sect ends up as the final laid-out section (used for :sizeof_image) + LNK_Section *last_sect = 0; + for (LNK_Section *sect = &sect_arr.v[0], *sect_opl = sect + sect_arr.count; sect < sect_opl; sect += 1) { + if (!sect->has_layout) { + continue; + } + // :sizeof_uninited_data + if (sect->flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA) { + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_CHUNK_SIZE_VIRT_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_uninited_data), sect->name, LNK_SymbolScopeFlag_Internal); + } + + // :sizeof_inited_data + if (sect->flags & COFF_SectionFlag_CNT_INITIALIZED_DATA) { + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_inited_data), sect->name, LNK_SymbolScopeFlag_Internal); + } + + // :sizeof_code + if (sect->flags & COFF_SectionFlag_CNT_CODE) { + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_code), sect->name, LNK_SymbolScopeFlag_Internal); + } + + last_sect = sect; + } + + 
// :sizeof_image + if (last_sect) { + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_image), last_sect->name, LNK_SymbolScopeFlag_Internal); + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_CHUNK_SIZE_VIRT_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_image), last_sect->name, LNK_SymbolScopeFlag_Internal); + lnk_section_push_reloc(header_sect, opt_header_chunk, LNK_Reloc_VIRT_ALIGN_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_image), &g_null_symbol); + } + + // :sizeof_headers + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_headers), str8_lit(LNK_WIN32_HEADER_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + lnk_section_push_reloc(header_sect, opt_header_chunk, LNK_Reloc_FILE_ALIGN_32, OffsetOf(PE_OptionalHeader32Plus, sizeof_headers), &g_null_symbol); + + // :check_sum + LNK_Symbol *checksum_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_PE_CHECKSUM_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, opt_header_chunk, OffsetOf(PE_OptionalHeader32Plus, check_sum), COFF_ComdatSelectType_NODUPLICATES, 0); + lnk_symbol_table_push(symtab, checksum_symbol); + + // :data_dir_count + lnk_section_push_reloc_undefined(header_sect, opt_header_chunk, LNK_Reloc_ADDR_32, OffsetOf(PE_OptionalHeader32Plus, data_dir_count), str8_lit(LNK_PE_DIRECTORY_COUNT_SYMBOL_NAME), LNK_SymbolScopeFlag_Internal); + + return opt_header_chunk; +} + +internal LNK_Chunk * +lnk_build_pe_directories(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent) +{ + static struct { + char *name; + PE_DataDirectoryIndex index; + LNK_SymbolScopeFlags scope; + } directory_map[] = { + { LNK_LOAD_CONFIG_SYMBOL_NAME , PE_DataDirectoryIndex_LOAD_CONFIG , LNK_SymbolScopeFlag_Main }, + { LNK_PDATA_SYMBOL_NAME , PE_DataDirectoryIndex_EXCEPTIONS , LNK_SymbolScopeFlag_Internal }, + { 
LNK_EDATA_SYMBOL_NAME , PE_DataDirectoryIndex_EXPORT , LNK_SymbolScopeFlag_Internal }, + { LNK_BASE_RELOC_SYMBOL_NAME , PE_DataDirectoryIndex_BASE_RELOC , LNK_SymbolScopeFlag_Internal }, + { LNK_IMPORT_DLL_TABLE_SYMBOL_NAME , PE_DataDirectoryIndex_IMPORT , LNK_SymbolScopeFlag_Internal }, + { LNK_IMPORT_IAT_SYMBOL_NAME , PE_DataDirectoryIndex_IMPORT_ADDR , LNK_SymbolScopeFlag_Internal }, + { LNK_DELAYED_IMPORT_DLL_TABLE_SYMBOL_NAME, PE_DataDirectoryIndex_DELAY_IMPORT, LNK_SymbolScopeFlag_Internal }, + { LNK_TLS_SYMBOL_NAME , PE_DataDirectoryIndex_TLS , LNK_SymbolScopeFlag_Main }, + { LNK_DEBUG_DIR_SYMBOL_NAME , PE_DataDirectoryIndex_DEBUG , LNK_SymbolScopeFlag_Internal }, + { LNK_RSRC_SYMBOL_NAME , PE_DataDirectoryIndex_RESOURCES , LNK_SymbolScopeFlag_Internal }, + }; + + // init directory virtual coords from symbol names + U64 directory_count = PE_DataDirectoryIndex_COUNT; + PE_DataDirectory *directory_array = push_array(header_sect->arena, PE_DataDirectory, directory_count); + + LNK_Chunk *directory_array_chunk = lnk_section_push_chunk_raw(header_sect, parent, directory_array, sizeof(directory_array[0])*directory_count, str8(0,0)); + lnk_chunk_set_debugf(header_sect->arena, directory_array_chunk, LNK_PE_DIRECTORY_ARRAY_SYMBOL_NAME); + + // define PE directory symbols + LNK_Symbol *directory_array_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_PE_DIRECTORY_ARRAY_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, directory_array_chunk, 0, 0, 0); + LNK_Symbol *directory_count_symbol = lnk_make_defined_symbol_va(symtab->arena, str8_lit(LNK_PE_DIRECTORY_COUNT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, directory_count); + lnk_symbol_table_push(symtab, directory_array_symbol); + lnk_symbol_table_push(symtab, directory_count_symbol); + + for (U64 dir_idx = 0; dir_idx < ArrayCount(directory_map); dir_idx += 1) { + String8 symbol_name = str8_cstring(directory_map[dir_idx].name); + LNK_Symbol *symbol = lnk_symbol_table_search(symtab, 
directory_map[dir_idx].scope, symbol_name); + if (symbol) { + U64 virt_off_field_off = sizeof(PE_DataDirectory) * directory_map[dir_idx].index + OffsetOf(PE_DataDirectory, virt_off); + U64 virt_size_field_off = sizeof(PE_DataDirectory) * directory_map[dir_idx].index + OffsetOf(PE_DataDirectory, virt_size); + lnk_section_push_reloc(header_sect, directory_array_chunk, LNK_Reloc_VIRT_OFF_32, virt_off_field_off, symbol); + lnk_section_push_reloc(header_sect, directory_array_chunk, LNK_Reloc_CHUNK_SIZE_VIRT_32, virt_size_field_off, symbol); + } + } + + return directory_array_chunk; +} + +// Defines an internal symbol for every section in sect_arr (named after the section, bound to its root chunk), +// then pushes one COFF_SectionHeader chunk for each section that has both emit_header and has_layout set, +// and defines the section-header-count symbol. Returns the list chunk that holds the headers. +internal LNK_Chunk * +lnk_build_coff_section_table(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent_chunk, LNK_SectionArray sect_arr) +{ + // register section symbols + for (LNK_Section *sect = &sect_arr.v[0], *sect_opl = sect + sect_arr.count; + sect < sect_opl; + sect += 1) { + // was section symbol defined elsewhere? + LNK_Symbol *test_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, sect->name); + Assert(!test_symbol); (void)test_symbol; + + // define symbol + String8 sect_symbol_name = push_str8_copy(symtab->arena, sect->name); + LNK_Symbol *sect_symbol = lnk_make_defined_symbol_chunk(symtab->arena, sect_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, sect->root, 0, 0, 0); + lnk_symbol_table_push(symtab, sect_symbol); + } + + // push COFF header array chunk + LNK_Chunk *coff_header_array_chunk = lnk_section_push_chunk_list(header_sect, parent_chunk, str8(0,0)); + lnk_chunk_set_debugf(header_sect->arena, coff_header_array_chunk, LNK_COFF_SECT_HEADER_ARRAY_SYMBOL_NAME); + + // define symbol for COFF header array + LNK_Symbol *coff_header_array_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_COFF_SECT_HEADER_ARRAY_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, coff_header_array_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, coff_header_array_symbol); + + // push headers + for (LNK_Section *sect = &sect_arr.v[0], 
*sect_opl = sect + sect_arr.count; sect < sect_opl; sect += 1) { + if (!sect->emit_header) { + continue; + } + if (!sect->has_layout) { + continue; + } + COFF_SectionHeader *coff_header = push_array_no_zero(header_sect->arena, COFF_SectionHeader, 1); + + // TODO: for objs we can store long name in string table and write here /offset + if (sect->name.size > sizeof(coff_header->name)) { + lnk_error(LNK_Warning_LongSectionName, "not enough space in COFF section header to store entire name \"%S\"", sect->name); + } + + MemorySet(&coff_header->name[0], 0, sizeof(coff_header->name)); + MemoryCopy(&coff_header->name[0], sect->name.str, Min(sect->name.size, sizeof(coff_header->name))); + coff_header->vsize = 0; // :vsize + coff_header->voff = 0; // :voff + coff_header->fsize = 0; // :fsize + coff_header->foff = 0; // :foff + coff_header->relocs_foff = 0; // :relocs_foff + coff_header->lines_foff = 0; // obsolete + coff_header->reloc_count = 0; // :reloc_count + coff_header->line_count = 0; // obsolete + coff_header->flags = sect->flags; + + // push chunk + LNK_Chunk *coff_header_chunk = lnk_section_push_chunk_raw(header_sect, coff_header_array_chunk, coff_header, sizeof(*coff_header), str8(0,0)); + + // :vsize + lnk_section_push_reloc_undefined(header_sect, coff_header_chunk, LNK_Reloc_CHUNK_SIZE_VIRT_32, OffsetOf(COFF_SectionHeader, vsize), sect->name, LNK_SymbolScopeFlag_Internal); + // :voff + lnk_section_push_reloc_undefined(header_sect, coff_header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(COFF_SectionHeader, voff), sect->name, LNK_SymbolScopeFlag_Internal); + + if (~sect->flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA) { + // :fsize + lnk_section_push_reloc_undefined(header_sect, coff_header_chunk, LNK_Reloc_CHUNK_SIZE_FILE_32, OffsetOf(COFF_SectionHeader, fsize), sect->name, LNK_SymbolScopeFlag_Internal); + // :foff + lnk_section_push_reloc_undefined(header_sect, coff_header_chunk, LNK_Reloc_FILE_OFF_32, OffsetOf(COFF_SectionHeader, foff), sect->name, 
LNK_SymbolScopeFlag_Internal); + } + + // TODO: :reloc_off + // TODO: :reloc_count + } + + // push symbol for section header count + U64 header_count = coff_header_array_chunk->u.list->count; + LNK_Symbol *header_symbol = lnk_make_defined_symbol_va(symtab->arena, str8_lit(LNK_COFF_SECT_HEADER_COUNT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, header_count); + lnk_symbol_table_push(symtab, header_symbol); + + return coff_header_array_chunk; +} + +internal LNK_Chunk * +lnk_build_win32_image_header(LNK_SymbolTable *symtab, + LNK_Section *header_sect, + LNK_Chunk *parent_chunk, + LNK_Config *config, + LNK_SectionArray sect_arr) +{ + ProfBeginFunction(); + + // header sections must be written first + Assert(header_sect->id == 0); + + LNK_Chunk *win32_header_chunk = lnk_section_push_chunk_list(header_sect, parent_chunk , str8(0,0) ); + LNK_Chunk *dos_chunk = lnk_section_push_chunk_list(header_sect, win32_header_chunk, str8_lit("a")); + LNK_Chunk *nt_chunk = lnk_section_push_chunk_list(header_sect, win32_header_chunk, str8_lit("b")); + LNK_Chunk *pe_magic_chunk = lnk_section_push_chunk_list(header_sect, nt_chunk , str8_lit("a")); + LNK_Chunk *coff_file_header_chunk = lnk_section_push_chunk_list(header_sect, nt_chunk , str8_lit("b")); + LNK_Chunk *pe_optional_chunk = lnk_section_push_chunk_list(header_sect, nt_chunk , str8_lit("c")); + LNK_Chunk *coff_sect_header_chunk = lnk_section_push_chunk_list(header_sect, nt_chunk , str8_lit("d")); + + lnk_chunk_set_debugf(header_sect->arena, win32_header_chunk , "Win32 Headers" ); + lnk_chunk_set_debugf(header_sect->arena, dos_chunk , "DOS Chunk" ); + lnk_chunk_set_debugf(header_sect->arena, nt_chunk , "NT Chunk" ); + lnk_chunk_set_debugf(header_sect->arena, pe_magic_chunk , "PE Magic Container" ); + lnk_chunk_set_debugf(header_sect->arena, coff_file_header_chunk, "COFF File Header Container" ); + lnk_chunk_set_debugf(header_sect->arena, pe_optional_chunk , "PE Optional Header Container" ); + 
lnk_chunk_set_debugf(header_sect->arena, coff_sect_header_chunk, "COFF Section Headers Container"); + + LNK_Symbol *win32_header_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_WIN32_HEADER_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Internal, 0, win32_header_chunk , 0, 0, 0); + LNK_Symbol *dos_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_DOS_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Internal, 0, dos_chunk , 0, 0, 0); + LNK_Symbol *nt_headers_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_NT_HEADERS_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Internal, 0, nt_chunk , 0, 0, 0); + LNK_Symbol *pe_magic_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_PE_MAGIC_CONTAINER_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Internal, 0, pe_magic_chunk , 0, 0, 0); + LNK_Symbol *coff_file_header_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_COFF_FILE_HEADER_CONTAINER_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Internal, 0, coff_file_header_chunk, 0, 0, 0); + LNK_Symbol *pe_optional_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_PE_OPT_HEADER_CONTAINER_SYMBOL_NAME) , LNK_DefinedSymbolVisibility_Internal, 0, pe_optional_chunk , 0, 0, 0); + LNK_Symbol *coff_sect_header_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_COFF_SECTION_HEADER_CONTAINER_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, coff_sect_header_chunk, 0, 0, 0); + + lnk_symbol_table_push(symtab, win32_header_symbol ); + lnk_symbol_table_push(symtab, dos_symbol ); + lnk_symbol_table_push(symtab, nt_headers_symbol ); + lnk_symbol_table_push(symtab, pe_magic_symbol ); + lnk_symbol_table_push(symtab, coff_file_header_symbol); + lnk_symbol_table_push(symtab, pe_optional_symbol ); + lnk_symbol_table_push(symtab, coff_sect_header_symbol); + + lnk_build_dos_header(symtab, header_sect, dos_chunk); + lnk_build_pe_magic(symtab, header_sect, pe_magic_chunk); + 
lnk_build_coff_file_header(symtab, header_sect, coff_file_header_chunk, config->machine, config->time_stamp, config->file_characteristics); + switch (config->machine) { + case COFF_MachineType_X64: { + lnk_build_pe_optional_header_x64(symtab, + header_sect, + pe_optional_chunk, + config->machine, + lnk_get_base_addr(config), + config->sect_align, + config->file_align, + config->link_ver, + config->os_ver, + config->image_ver, + config->subsystem_ver, + config->subsystem, + config->dll_characteristics, + config->stack_reserve, + config->stack_commit, + config->heap_reserve, + config->heap_commit, + config->entry_point_name, + sect_arr); + } break; + default: { + lnk_not_implemented("TODO: PE Optional Header for %S", coff_string_from_machine_type(config->machine)); + } break; + } + lnk_build_pe_directories(symtab, header_sect, pe_optional_chunk); + lnk_build_coff_section_table(symtab, header_sect, coff_sect_header_chunk, sect_arr); + + ProfEnd(); + return win32_header_chunk; +} + +//////////////////////////////// + +internal +THREAD_POOL_TASK_FUNC(lnk_undef_symbol_finder) +{ + LNK_SymbolFinder *task = raw_task; + LNK_SymbolFinderResult *result = &task->result_arr[task_id]; + Rng1U64 range = task->range_arr[task_id]; + + for (U64 symbol_idx = range.min; symbol_idx < range.max; symbol_idx += 1) { + LNK_SymbolNode *symbol_node = task->lookup_node_arr.v[symbol_idx]; + LNK_Symbol *symbol = symbol_node->data; + Assert(symbol->type == LNK_Symbol_Undefined); + LNK_UndefinedSymbol *undef = &symbol->u.undefined; + + LNK_SymbolNode *has_defn = lnk_symbol_table_search_node(task->symtab, undef->scope_flags, symbol->name); + if (has_defn) { + Assert(LNK_Symbol_IsDefined(has_defn->data->type) || has_defn->data->type == LNK_Symbol_Weak); + continue; + } + + LNK_SymbolNode *lazy = lnk_symbol_table_search_node(task->symtab, LNK_SymbolScopeFlag_Lib, symbol->name); + if (lazy) { + lnk_push_input_from_lazy(arena, task->path_style, &lazy->data->u.lazy, &result->input_import_list, 
&result->input_obj_list); + } else { + lnk_symbol_list_push_node(&result->unresolved_symbol_list, symbol_node); + } + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_weak_symbol_finder) +{ + LNK_SymbolFinder *task = raw_task; + LNK_SymbolFinderResult *result = &task->result_arr[task_id]; + Rng1U64 range = task->range_arr[task_id]; + + for (U64 symbol_idx = range.min; symbol_idx < range.max; symbol_idx += 1) { + LNK_SymbolNode *symbol_node = task->lookup_node_arr.v[symbol_idx]; + LNK_Symbol *symbol = symbol_node->data; + Assert(symbol->type == LNK_Symbol_Weak); + LNK_WeakSymbol *weak = &symbol->u.weak; + + Assert((weak->scope_flags & ~(LNK_SymbolScopeFlag_Defined | LNK_SymbolScopeFlag_Internal)) == 0); + LNK_SymbolNode *has_strong_defn = lnk_symbol_table_search_node(task->symtab, weak->scope_flags, symbol->name); + if (has_strong_defn) { + Assert(LNK_Symbol_IsDefined(has_strong_defn->data->type)); + continue; + } + + LNK_SymbolNode *lazy = 0; + switch (weak->lookup_type) { + case COFF_WeakExtType_NOLIBRARY: { + // NOLIBRARY means weak symbol should be resolved in case where strong definition pulls in lib member. + } break; + case COFF_WeakExtType_SEARCH_LIBRARY: { + lazy = lnk_symbol_table_search_node(task->symtab, LNK_SymbolScopeFlag_Lib, symbol->name); + } break; + case COFF_WeakExtType_SEARCH_ALIAS: { + lazy = lnk_symbol_table_search_node(task->symtab, LNK_SymbolScopeFlag_Lib, symbol->name); + if (!lazy) { + if (str8_match(str8_lit(".weak."), symbol->name, StringMatchFlag_RightSideSloppy)) { + // TODO: Clang and MingGW encode extra info in alias + // + // __attribute__((weak,alias("foo"))) void bar(void); + // static void foo() {} + // + // Clang write these COFF symbols in obj for code above: + // + // 30 00000000 0000000001 0 FUNC NULL EXTERNAL foo + // ... 
+ // 33 00000000 UNDEF 1 NULL NULL WEAK_EXTERNAL bar + // Tag Index 35, Characteristics SEARCH_ALIAS + // 35 00000000 0000000001 0 NULL NULL EXTERNAL .weak.bar.default.foo + // + // In this case linker needs to parse .weak.bar.default.foo and search for bar and foo as well. + Assert("TODO: MinGW weak symbol"); + } else { + lazy = lnk_symbol_table_search_node(task->symtab, LNK_SymbolScopeFlag_Lib, weak->fallback_symbol->name); + } + } + } break; + } + + if (lazy) { + lnk_push_input_from_lazy(arena, task->path_style, &lazy->data->u.lazy, &result->input_import_list, &result->input_obj_list); + } else { + lnk_symbol_list_push_node(&result->unresolved_symbol_list, symbol_node); + } + } +} + +internal LNK_SymbolFinderResult +lnk_run_symbol_finder(TP_Context *tp, + TP_Arena *arena, + PathStyle path_style, + LNK_SymbolTable *symtab, + LNK_SymbolList lookup_list, + TP_TaskFunc *task_func) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + ProfBegin("Setup Task"); + LNK_SymbolFinder task = {0}; + task.path_style = path_style; + task.symtab = symtab; + task.lookup_node_arr = lnk_symbol_node_array_from_list(scratch.arena, lookup_list); + task.result_arr = push_array(scratch.arena, LNK_SymbolFinderResult, tp->worker_count); + task.range_arr = tp_divide_work(scratch.arena, task.lookup_node_arr.count, tp->worker_count); + ProfEnd(); + + ProfBegin("Run Task"); + tp_for_parallel(tp, arena, tp->worker_count, task_func, &task); + ProfEnd(); + + ProfBegin("Concat Results"); + LNK_SymbolFinderResult result = {0}; + for (U64 i = 0; i < tp->worker_count; ++i) { + LNK_SymbolFinderResult *src = &task.result_arr[i]; + lnk_symbol_list_concat_in_place(&result.unresolved_symbol_list, &src->unresolved_symbol_list); + lnk_input_obj_list_concat_in_place(&result.input_obj_list, &src->input_obj_list); + lnk_input_import_list_concat_in_place(&result.input_import_list, &src->input_import_list); + } + ProfEnd(); + + // to get deterministic output accross multiple 
linker runs we have to sort inputs + ProfBegin("Sort Objs [Count %llu]", result.input_obj_list.count); + LNK_InputObj **input_obj_ptr_arr = lnk_array_from_input_obj_list(scratch.arena, result.input_obj_list); + qsort(input_obj_ptr_arr, result.input_obj_list.count, sizeof(input_obj_ptr_arr[0]), lnk_input_obj_compar); + //radsort(input_obj_ptr_arr, result.input_obj_list.count, lnk_input_obj_compar_is_before); + result.input_obj_list = lnk_list_from_input_obj_arr(input_obj_ptr_arr, result.input_obj_list.count); + ProfEnd(); + + ProfBegin("Sort Imports [Count %llu]", result.input_import_list.count); + LNK_InputImport **input_imp_ptr_arr = lnk_input_import_arr_from_list(scratch.arena, result.input_import_list); + //radsort(input_imp_ptr_arr, result.input_import_list.count, lnk_input_import_is_before); + // element size is taken from the array being sorted + qsort(input_imp_ptr_arr, result.input_import_list.count, sizeof(input_imp_ptr_arr[0]), lnk_input_import_compar); + result.input_import_list = lnk_list_from_input_import_arr(input_imp_ptr_arr, result.input_import_list.count); + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); + return result; +} + +// Thread-pool task: for every bucket index in this task's range, walks task->bucket_arr[bucket_idx] and moves each node +// into the Defined scope of the symbol table. A node whose name is already present in that bucket is inserted right after +// the existing node; otherwise it is pushed as a new entry. +internal +THREAD_POOL_TASK_FUNC(lnk_defined_symbol_inserter) +{ + LNK_DefinedSymbolInserter *task = raw_task; + LNK_SymbolTable *symtab = task->symtab; + Rng1U64 range = task->range_arr[task_id]; + for (U64 bucket_idx = range.min; bucket_idx < range.max; bucket_idx += 1) { + LNK_SymbolList *bucket = &task->bucket_arr[bucket_idx]; + for (LNK_SymbolNode *curr = bucket->first, *next; curr != 0; curr = next) { + // next is read before curr is handed to the table (presumably because insertion relinks curr) + next = curr->next; + LNK_SymbolNode *extant_node = lnk_symbol_table_search_bucket(symtab, LNK_SymbolScopeIndex_Defined, bucket_idx, curr->data->name, curr->hash); + if (extant_node) { + LNK_SymbolList *symtab_bucket = lnk_symbol_table_bucket_from_hash(symtab, LNK_SymbolScopeIndex_Defined, curr->hash); + lnk_symbol_list_insert_after(symtab_bucket, extant_node, curr); + } else { + lnk_symbol_table_push_(symtab, LNK_SymbolScopeIndex_Defined, curr, curr->hash); + } + } + } +} 
+ +//////////////////////////////// + +internal void +lnk_apply_reloc(U64 base_addr, + U64 virt_align, + U64 file_align, + LNK_Section **sect_id_map, + LNK_SymbolTable *symtab, + String8 chunk_data, + LNK_Reloc *reloc) +{ + LNK_Symbol *symbol = lnk_resolve_symbol(symtab, reloc->symbol); + + // TODO: check if user forced to link with unresolved symbols and accordingly report the error + if (!LNK_Symbol_IsDefined(symbol->type)) { + lnk_error(LNK_Error_UndefinedSymbol, "%S", symbol->name); + return; + } + + U64 symbol_vsize = 0; + U64 symbol_fsize = 0; + U64 symbol_isect = 0; + U64 symbol_off = 0; + U64 symbol_voff = 0; + U64 symbol_foff = 0; + + LNK_DefinedSymbol *defined_symbol = &symbol->u.defined; + switch (defined_symbol->value_type) { + case LNK_DefinedSymbolValue_Null: break; + case LNK_DefinedSymbolValue_Chunk: { + symbol_isect = lnk_isect_from_symbol(sect_id_map, symbol); + symbol_vsize = lnk_virt_size_from_symbol(sect_id_map, symbol); + symbol_fsize = lnk_file_size_from_symbol(sect_id_map, symbol); + symbol_off = lnk_sect_off_from_symbol(sect_id_map, symbol); + symbol_voff = lnk_virt_off_from_symbol(sect_id_map, symbol); + symbol_foff = lnk_file_off_from_symbol(sect_id_map, symbol); + } break; + case LNK_DefinedSymbolValue_VA: { + symbol_voff = defined_symbol->u.va - base_addr; + } break; + } + +#if LNK_DEBUG + if (str8_match(str8_lit("__ImageBase"), symbol->name, 0)) { + Assert(symbol_isect == 0); + Assert(symbol_voff == 0); + Assert(symbol_foff == 0); + Assert(symbol_vsize == 0); + Assert(symbol_fsize == 0); + } +#endif + + U64 reloc_align = 1; + U64 reloc_size = 0; + S64 reloc_value = 0; + + switch (reloc->type) { + case LNK_Reloc_NULL: /* ignore */ break; + case LNK_Reloc_ADDR_16: { + reloc_value = safe_cast_u16(base_addr + symbol_voff); + reloc_size = 2; + } break; + case LNK_Reloc_ADDR_32: { + reloc_value = safe_cast_u32(base_addr + symbol_voff); + reloc_size = 4; + } break; + case LNK_Reloc_ADDR_64: { + reloc_value = base_addr + symbol_voff; + 
reloc_size = 8; + } break; + case LNK_Reloc_CHUNK_SIZE_FILE_16: { + reloc_value = safe_cast_u16(symbol_fsize); + reloc_size = 2; + } break; + case LNK_Reloc_CHUNK_SIZE_FILE_32: { + reloc_value = symbol_fsize; + reloc_size = 4; + } break; + case LNK_Reloc_CHUNK_SIZE_VIRT_32: { + reloc_value = symbol_vsize; + reloc_size = 4; + } break; + case LNK_Reloc_FILE_ALIGN_32: { + reloc_value = 0; + reloc_size = 4; + reloc_align = file_align; + } break; + case LNK_Reloc_FILE_OFF_32: { + reloc_value = safe_cast_u32(symbol_foff); + reloc_size = 4; + } break; + case LNK_Reloc_FILE_OFF_64: { + reloc_value = symbol_foff; + reloc_size = 8; + } break; + case LNK_Reloc_REL32: { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + reloc_value = safe_cast_s32((S64)(symbol_voff - reloc_voff) - (4 + 0)); + reloc_size = 4; + } break; + case LNK_Reloc_REL32_1: { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + reloc_value = safe_cast_s32((S64)(symbol_voff - reloc_voff) - (4 + 1)); + reloc_size = 4; + } break; + case LNK_Reloc_REL32_2: { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + reloc_value = safe_cast_s32((S64)(symbol_voff - reloc_voff) - (4 + 2)); + reloc_size = 4; + } break; + case LNK_Reloc_REL32_3: { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + reloc_value = safe_cast_s32((S64)(symbol_voff - reloc_voff) - (4 + 3)); + reloc_size = 4; + } break; + case LNK_Reloc_REL32_4: { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + reloc_value = safe_cast_s32((S64)(symbol_voff - reloc_voff) - (4 + 4)); + reloc_size = 4; + } break; + case LNK_Reloc_REL32_5: { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + reloc_value = safe_cast_s32((S64)(symbol_voff - reloc_voff) - (4 + 5)); + reloc_size = 4; + } break; + case LNK_Reloc_SECT_REL: { + reloc_value = safe_cast_u32(symbol_off); + reloc_size = 4; + } break; + case LNK_Reloc_SECT_IDX: { + reloc_value = safe_cast_u32(symbol_isect); + reloc_size = 
4; + } break; + case LNK_Reloc_VIRT_ALIGN_32: { + reloc_value = 0; + reloc_size = 4; + reloc_align = virt_align; + } break; + case LNK_Reloc_VIRT_OFF_32: { + reloc_value = safe_cast_u32(symbol_voff); + reloc_size = 4; + } break; + default: NotImplemented; + } + + // read addend + Assert(reloc->apply_off + reloc_size <= chunk_data.size); + U64 raw_addend = 0; + MemoryCopy(&raw_addend, chunk_data.str + reloc->apply_off, reloc_size); + S64 addend = extend_sign64(raw_addend, reloc_size); + + // commit reloc value + reloc_value += addend; + reloc_value = AlignPow2(reloc_value, reloc_align); + MemoryCopy(chunk_data.str + reloc->apply_off, &reloc_value, reloc_size); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_section_reloc_patcher) +{ + LNK_SectionRelocPatcher *task = raw_task; + + LNK_SymbolTable *symtab = task->symtab; + LNK_SectionTable *st = task->st; + LNK_Section **sect_id_map = task->sect_id_map; + U64 base_addr = task->base_addr; + Rng1U64 range = task->range_arr[task_id]; + + for (U64 sect_idx = range.min; sect_idx < range.max; sect_idx += 1) { + LNK_Section *sect = task->sect_arr[sect_idx]; + + if (sect->has_layout) { + for (LNK_Reloc *reloc = sect->reloc_list.first; reloc != 0; reloc = reloc->next) { + LNK_Chunk *chunk = reloc->chunk; + if (lnk_chunk_is_discarded(chunk)) { + continue; + } + String8 chunk_data = lnk_data_from_chunk_ref(sect_id_map, chunk->ref); + lnk_apply_reloc(base_addr, st->sect_align, st->file_align, sect_id_map, symtab, chunk_data, reloc); + int bad_vs = 0; (void)bad_vs; + } + } else { + for (LNK_Reloc *reloc = sect->reloc_list.first; reloc != 0; reloc = reloc->next) { + LNK_Chunk *chunk = reloc->chunk; + if (lnk_chunk_is_discarded(chunk)) { + continue; + } + if (chunk->type != LNK_Chunk_Leaf) { + continue; + } + lnk_apply_reloc(base_addr, st->sect_align, st->file_align, sect_id_map, symtab, chunk->u.leaf, reloc); + int bad_vs = 0; (void)bad_vs; + } + } + } +} + +internal void +lnk_patch_relocs(TP_Context *tp, LNK_SymbolTable *symtab, 
LNK_SectionTable *st, U64 base_addr) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + LNK_SectionPtrArray sect_arr = lnk_section_ptr_array_from_list(scratch.arena, st->list); + + LNK_SectionRelocPatcher task; + task.symtab = symtab; + task.st = st; + task.sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + task.sect_arr = sect_arr.v; + task.base_addr = base_addr; + task.range_arr = tp_divide_work(scratch.arena, sect_arr.count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_section_reloc_patcher, &task); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_obj_reloc_patcher) +{ + LNK_ObjRelocPatcher *task = raw_task; + LNK_Obj *obj = task->obj_arr[task_id]; + + for (U64 sect_idx = 0; sect_idx < obj->sect_count; sect_idx += 1) { + LNK_RelocList reloc_list = obj->sect_reloc_list_arr[sect_idx]; + for (LNK_Reloc *reloc = reloc_list.first; reloc != 0; reloc = reloc->next) { + if (lnk_chunk_is_discarded(reloc->chunk)) { + continue; + } + Assert(reloc->chunk->type == LNK_Chunk_Leaf); + + String8 chunk_data; + { + LNK_Section *sect = lnk_sect_from_chunk_ref(task->sect_id_map, reloc->chunk->ref); + if (sect->has_layout) { + chunk_data = lnk_data_from_chunk_ref(task->sect_id_map, reloc->chunk->ref); + } else { + chunk_data = reloc->chunk->u.leaf; + } + } + + lnk_apply_reloc(task->base_addr, task->st->sect_align, task->st->file_align, task->sect_id_map, task->symtab, chunk_data, reloc); + int bad_vs = 0; (void)bad_vs; + } + } +} + +internal void +lnk_patch_relocs_obj(TP_Context *tp, LNK_ObjList obj_list, LNK_SymbolTable *symtab, LNK_SectionTable *st, U64 base_addr) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + LNK_ObjRelocPatcher task; + task.symtab = symtab; + task.st = st; + task.sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + task.base_addr = base_addr; + task.obj_arr = lnk_obj_arr_from_list(scratch.arena, obj_list); + tp_for_parallel(tp, 0, 
obj_list.count, lnk_obj_reloc_patcher, &task);
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+////////////////////////////////
+
+// Allocates a section table and pushes the predefined sections listed in sect_layout.
+// For each section a defined symbol with internal visibility, bound to the section's
+// root chunk, is pushed into symtab. Returns the new table.
+internal LNK_SectionTable *
+lnk_init_section_table(LNK_SymbolTable *symtab, U64 section_virt_off, U64 sect_align, U64 file_align)
+{
+  ProfBeginFunction();
+
+  // section name, name of the symbol that refers to the section, section flags
+  static struct {
+    char *name;
+    char *symbol;
+    int flags;
+  } sect_layout[] = {
+    { ".null", LNK_NULL_SYMBOL_NAME, 0 },
+    { ".text", LNK_TEXT_SYMBOL_NAME, LNK_TEXT_SECTION_FLAGS },
+    { ".data", LNK_DATA_SYMBOL_NAME, LNK_DATA_SECTION_FLAGS },
+    { ".rdata", LNK_RDATA_SYMBOL_NAME, LNK_RDATA_SECTION_FLAGS },
+    { ".bss", LNK_BSS_SYMBOL_NAME, LNK_BSS_SECTION_FLAGS },
+    { ".xdata", LNK_XDATA_SYMBOL_NAME, LNK_XDATA_SECTION_FLAGS },
+    { ".pdata", LNK_PDATA_SYMBOL_NAME, LNK_PDATA_SECTION_FLAGS },
+    { ".edata", LNK_EDATA_SYMBOL_NAME, LNK_EDATA_SECTION_FLAGS },
+    { ".rsrc", LNK_RSRC_SYMBOL_NAME, LNK_RSRC_SECTION_FLAGS },
+    { ".debug", LNK_DEBUG_SYMBOL_NAME, LNK_DEBUG_SECTION_FLAGS },
+  };
+
+  LNK_SectionTable *st = lnk_section_table_alloc(section_virt_off, sect_align, file_align);
+  for (U64 i = 0; i < ArrayCount(sect_layout); ++i) {
+    LNK_Section *sect = lnk_section_table_push(st, str8_cstring(sect_layout[i].name), sect_layout[i].flags);
+    // the symbol name is copied onto the section's arena rather than pointing at the static table
+    sect->symbol_name = str8_cstring(sect_layout[i].symbol);
+    sect->symbol_name = push_str8_copy(sect->arena, sect->symbol_name);
+
+    LNK_Symbol *symbol = lnk_make_defined_symbol_chunk(symtab->arena, sect->symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, sect->root, 0, 0, 0);
+    lnk_symbol_table_push(symtab, symbol);
+  }
+
+  // ".null" is taken out of the regular section list and kept separately in st->null_sect
+  st->null_sect = lnk_section_list_remove(&st->list, str8_lit(".null"));
+
+  // dont build layout because we discard debug from image and move it to pdb
+  LNK_Section *debug_sect = lnk_section_table_search(st, str8_lit(".debug"));
+  debug_sect->emit_header = 0;
+  debug_sect->has_layout = 0;
+
+  ProfEnd();
+  return st;
+}
+
+internal LNK_MergeDirectiveList
+lnk_init_merge_directive_list(Arena *arena, LNK_ObjList obj_list)
+{
+ 
ProfBeginFunction();
+
+  LNK_MergeDirectiveList result = {0};
+
+  // built-in directive pairing ".xdata" with ".rdata"
+  // NOTE(review): presumably this merges .xdata into .rdata -- confirm the field order of LNK_MergeDirective
+  lnk_merge_directive_list_push(arena, &result, (LNK_MergeDirective){ str8_lit_comp(".xdata") , str8_lit_comp(".rdata") });
+
+  // collect merge directives from objs
+  for (LNK_ObjNode *obj_node = obj_list.first; obj_node != 0; obj_node = obj_node->next) {
+    LNK_Obj *obj = &obj_node->data;
+    for (LNK_Directive *dir = obj->directive_info.v[LNK_Directive_Merge].first; dir != 0; dir = dir->next) {
+      for (String8Node *value_node = dir->value_list.first; value_node != 0; value_node = value_node->next) {
+        LNK_MergeDirective merge_dir;
+        // a value that fails to parse is reported under LNK_Warning_IllData and skipped
+        if (lnk_parse_merge_directive(value_node->string, &merge_dir)) { lnk_merge_directive_list_push(arena, &result, merge_dir);
+        } else {
+          lnk_error_obj(LNK_Warning_IllData, obj, "can't parse merge directive \"%S\"", value_node->string);
+        }
+      }
+    }
+  }
+
+  ProfEnd();
+  return result;
+}
+
+// Marks the chunks of the sections named in meta_data_sect_arr as discarded.
+// Sections that are not present in the table are ignored.
+internal void
+lnk_discard_meta_data_sections(LNK_SectionTable *st)
+{
+  static char * meta_data_sect_arr[] = {
+    ".gfids",
+    ".giats",
+    ".gljmp",
+    ".gehcont",
+  };
+  for (U64 meta_idx = 0; meta_idx < ArrayCount(meta_data_sect_arr); meta_idx += 1) {
+    String8 name = str8_cstring(meta_data_sect_arr[meta_idx]);
+    LNK_Section *sect = lnk_section_table_search(st, name);
+    if (sect) {
+      lnk_visit_chunks(sect->id, sect->root, lnk_chunk_mark_discarded, NULL);
+      // the discard flag is cleared again on the root chunk only
+      // NOTE(review): presumably so the section itself stays valid -- confirm intent
+      sect->root->is_discarded = 0;
+    }
+  }
+}
+
+////////////////////////////////
+
+// Ordering predicate for PE_IntelPdata entries: non-zero when a->voff_first < b->voff_first.
+internal int
+lnk_pdata_is_before_x8664(void *raw_a, void *raw_b)
+{
+  PE_IntelPdata *a = raw_a;
+  PE_IntelPdata *b = raw_b;
+  int is_before = a->voff_first < b->voff_first;
+  return is_before;
+}
+
+////////////////////////////////
+
+internal void
+lnk_log_size_breakdown(LNK_SectionTable *st, LNK_SymbolTable *symtab)
+{
+  Temp scratch = scratch_begin(0, 0);
+
+  LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st);
+
+  U64 code_size = 0;
+  U64 data_size = 0;
+  for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; 
sect_node = sect_node->next) { + LNK_Section *sect = §_node->data; + if (sect->has_layout) { + U64 sect_size = lnk_file_size_from_chunk_ref(sect_id_map, sect->root->ref); + if (sect->flags & COFF_SectionFlag_CNT_CODE) { + code_size += sect_size; + } else if (sect->flags & COFF_SectionFlag_CNT_INITIALIZED_DATA) { + data_size += sect_size; + } + } + } + + LNK_Symbol *dos_header_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_DOS_HEADER_SYMBOL_NAME)); + LNK_Symbol *dos_program_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_DOS_PROGRAM_SYMBOL_NAME)); + LNK_Symbol *coff_header_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_COFF_HEADER_SYMBOL_NAME)); + LNK_Symbol *coff_section_header_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_COFF_SECT_HEADER_ARRAY_SYMBOL_NAME)); + LNK_Symbol *pe_opt_header_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_PE_OPT_HEADER_SYMBOL_NAME)); + LNK_Symbol *pe_directories_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Internal, str8_lit(LNK_PE_DIRECTORY_ARRAY_SYMBOL_NAME)); + + LNK_Chunk *dos_header_chunk = dos_header_symbol->u.defined.u.chunk; + LNK_Chunk *dos_program_chunk = dos_program_symbol->u.defined.u.chunk; + LNK_Chunk *coff_header_chunk = coff_header_symbol->u.defined.u.chunk; + LNK_Chunk *coff_section_header_chunk = coff_section_header_symbol->u.defined.u.chunk; + LNK_Chunk *pe_opt_header_chunk = pe_opt_header_symbol->u.defined.u.chunk; + LNK_Chunk *pe_directories_chunk = pe_directories_symbol->u.defined.u.chunk; + + U64 dos_header_size = lnk_file_size_from_chunk_ref(sect_id_map, dos_header_chunk->ref); + U64 dos_program_size = lnk_file_size_from_chunk_ref(sect_id_map, dos_program_chunk->ref); + U64 coff_header_size = lnk_file_size_from_chunk_ref(sect_id_map, coff_header_chunk->ref); + U64 coff_section_header_size = 
lnk_file_size_from_chunk_ref(sect_id_map, coff_section_header_chunk->ref); + U64 pe_opt_header_size = lnk_file_size_from_chunk_ref(sect_id_map, pe_opt_header_chunk->ref); + U64 pe_directories_size = lnk_file_size_from_chunk_ref(sect_id_map, pe_directories_chunk->ref); + + String8 code_size_str = str8_from_memory_size2(scratch.arena, code_size); + String8 data_size_str = str8_from_memory_size2(scratch.arena, data_size); + + String8List output_list; MemoryZeroStruct(&output_list); + str8_list_pushf(scratch.arena, &output_list, "--- Image Size Breakdown -------------------------------------------------------"); + str8_list_pushf(scratch.arena, &output_list, " DOS Header: %u", dos_header_size); + str8_list_pushf(scratch.arena, &output_list, " DOS Program Stub: %u", dos_program_size); + str8_list_pushf(scratch.arena, &output_list, " COFF Header: %u", coff_header_size); + str8_list_pushf(scratch.arena, &output_list, " COFF Section Headers: %u", coff_section_header_size); + str8_list_pushf(scratch.arena, &output_list, " PE Header: %u", pe_opt_header_size); + str8_list_pushf(scratch.arena, &output_list, " Directories: %u", pe_directories_size); + str8_list_pushf(scratch.arena, &output_list, " Code Size: %S", code_size_str); + str8_list_pushf(scratch.arena, &output_list, " Data Size: %S", data_size_str); + + StringJoin new_line_join = { str8_lit_comp(""), str8_lit_comp("\n"), str8_lit_comp("") }; + String8 output = str8_list_join(scratch.arena, &output_list, &new_line_join); + lnk_log(LNK_Log_SizeBreakdown, "%S\n", output); + + scratch_end(scratch); +} + +internal void +lnk_log_link_stats(LNK_ObjList obj_list, LNK_LibList *lib_index, LNK_SectionTable *st) +{ + Temp scratch = scratch_begin(0, 0); + + U32 lib_count = 0; + for (U32 i = 0; i < LNK_InputSource_Count; i += 1) { + lib_count += lib_index[i].count; + } + U32 reloc_count = 0; + for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; sect_node = sect_node->next) { + reloc_count += 
sect_node->data.reloc_list.count;
+  }
+  // NOTE(review): reloc_count is U32 -- confirm reloc_list.count cannot exceed 32 bits
+
+  // NOTE(review): obj_list.count is printed with %u -- confirm its type is 32-bit
+  String8List output_list = {0};
+  str8_list_pushf(scratch.arena, &output_list, "------ Link Stats --------------------------------------------------------------");
+  str8_list_pushf(scratch.arena, &output_list, " Linked Objs: %u", obj_list.count);
+  str8_list_pushf(scratch.arena, &output_list, " Linked Libs: %u", lib_count);
+  str8_list_pushf(scratch.arena, &output_list, " Relocs Patched: %u", reloc_count);
+
+  // join the rows with newlines and emit them as a single log message
+  StringJoin new_line_join = { str8_lit_comp(""), str8_lit_comp("\n"), str8_lit_comp("") };
+  String8 output = str8_list_join(scratch.arena, &output_list, &new_line_join);
+  lnk_log(LNK_Log_LinkStats, "%S\n", output);
+
+  scratch_end(scratch);
+}
+
+// Logs one row per timer in g_timers whose elapsed time (end - begin) is non-zero,
+// followed by a total row that is the sum of all timers' elapsed times.
+internal void
+lnk_log_timers(void)
+{
+  Temp scratch = scratch_begin(0, 0);
+
+  U64 total_build_time_micro = 0;
+  for (U64 i = 0; i < LNK_Timer_Count; ++i) {
+    total_build_time_micro += g_timers[i].end - g_timers[i].begin;
+  }
+
+  String8List output_list = {0};
+  str8_list_pushf(scratch.arena, &output_list, "------ Link Times --------------------------------------------------------------");
+  for (U64 i = 0; i < LNK_Timer_Count; ++i) {
+    U64 build_time_micro = g_timers[i].end - g_timers[i].begin;
+    // timers with zero elapsed time are left out of the report
+    if (build_time_micro != 0) {
+      String8 timer_name = lnk_string_from_timer_type(i);
+      DateTime time = date_time_from_micro_seconds(build_time_micro);
+      String8 time_str = string_from_elapsed_time(scratch.arena, time);
+      str8_list_pushf(scratch.arena, &output_list, " %-5S Time: %S", timer_name, time_str);
+    }
+  }
+
+  DateTime total_time = date_time_from_micro_seconds(total_build_time_micro);
+  String8 total_time_str = string_from_elapsed_time(scratch.arena, total_time);
+  str8_list_pushf(scratch.arena, &output_list, " Total Time: %S", total_time_str);
+
+  // join the rows with newlines and emit them as a single log message
+  StringJoin new_line_join = { str8_lit_comp(""), str8_lit_comp("\n"), str8_lit_comp("") };
+  String8 output = str8_list_join(scratch.arena, &output_list, &new_line_join);
+  lnk_log(LNK_Log_Timers, "%S\n", output);
+
+ 
scratch_end(scratch);
+}
+
+// Thread entry point: writes ctx->data to the file at ctx->path.
+internal void
+lnk_write_thread(void *raw_ctx)
+{
+  ProfBeginFunction();
+  LNK_WriteThreadContext *ctx = raw_ctx;
+  lnk_write_data_to_file_path(ctx->path, ctx->data);
+  ProfEnd();
+}
+
+// Thread-pool task: hashes the sub-range task->ranges[task_id] of task->data with BLAKE3
+// and stores the digest in task->hashes[task_id].
+// raw_task and task_id are not declared in this body; presumably they are the
+// parameters supplied by the THREAD_POOL_TASK_FUNC macro (confirm against its definition).
+internal
+THREAD_POOL_TASK_FUNC(lnk_blake3_hasher_task)
+{
+  ProfBeginFunction();
+
+  LNK_Blake3Hasher *task = raw_task;
+  Rng1U64 range = task->ranges[task_id];
+  String8 sub_data = str8_substr(task->data, range);
+
+  blake3_hasher hasher; blake3_hasher_init(&hasher);
+  blake3_hasher_update(&hasher, sub_data.str, sub_data.size);
+  // the digest length is the size of the U128's u64 storage
+  blake3_hasher_finalize(&hasher, (U8 *)task->hashes[task_id].u64, sizeof(task->hashes[task_id].u64));
+
+  ProfEnd();
+}
+
+// Hashes data in two stages: the data is divided into chunk_count ranges that are
+// hashed in parallel, then the per-range digests are hashed together, in index order,
+// into the returned U128. The result is therefore a digest of digests, not a plain
+// BLAKE3 hash of data.
+internal U128
+lnk_blake3_hash_parallel(TP_Context *tp, U64 chunk_count, String8 data)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  ProfBegin("Hash Chunks");
+  LNK_Blake3Hasher task = {0};
+  task.data = data;
+  task.ranges = tp_divide_work(scratch.arena, data.size, chunk_count);
+  task.hashes = push_array(scratch.arena, U128, chunk_count);
+  tp_for_parallel(tp, 0, chunk_count, lnk_blake3_hasher_task, &task);
+  ProfEnd();
+
+  ProfBegin("Combine Hashes");
+  blake3_hasher hasher; blake3_hasher_init(&hasher);
+  for (U64 i = 0; i < chunk_count; ++i) {
+    blake3_hasher_update(&hasher, (U8 *)task.hashes[i].u64, sizeof(task.hashes[i].u64));
+  }
+  U128 result;
+  blake3_hasher_finalize(&hasher, (U8 *)result.u64, sizeof(result.u64));
+  ProfEnd();
+
+  scratch_end(scratch);
+  ProfEnd();
+  return result;
+}
+
+internal void
+lnk_run(int argc, char **argv)
+{
+  enum State {
+    State_Null,
+    State_InputSymbols,
+    State_InputImports,
+    State_InputDisallowLibs,
+    State_InputLibs,
+    State_InputObjs,
+    State_LookupUndef,
+    State_LookupWeak,
+    State_BuildAndInputLinkerObj,
+    State_BuildAndInputResObj,
+    State_PushDllHelperUndefSymbol,
+    State_PushLinkerSymbols,
+    State_PushLoadConfigUndefSymbol,
+    State_SearchEntryPoint,
+    State_CheckUnusedDelayLoads,
+    State_ReportUnresolvedSymbols,
+    State_RewireComdats,
+ 
State_DiscardMetaDataSections, + State_BuildDebugDirectory, + State_BuildExportTable, + State_MergeSections, + State_BuildCFGuards, + State_BuildBaseRelocs, + State_BuildWin32Header, + State_PatchRelocs, + State_SortExceptionInfo, + State_BuildImpLib, + State_BuildDebugInfo, + State_WriteImage, + }; + struct StateNode { + struct StateNode *next; + enum State state; + }; + struct StateList { + U64 count; + struct StateNode *first; + struct StateNode *last; + }; + +#define state_list_push(a, l, s) do { \ + struct StateNode *node = push_array(a, struct StateNode, 1); \ + node->state = s; \ + SLLQueuePush(l.first, l.last, node); \ + l.count += 1; \ + } while (0) +#define state_list_pop(l) (l).first->state; SLLQueuePop((l).first, (l).last); (l).count -= 1 + + ProfBeginFunction(); + + Temp scratch = scratch_begin(0, 0); + + LNK_Config *config = lnk_build_config(scratch.arena, argc, argv); + + TP_Context *tp = tp_alloc(scratch.arena, config->worker_count); + TP_Arena *tp_arena = tp_arena_alloc(tp); + + #if PROFILE_TELEMETRY + { + String8 cmdl = str8_list_join(scratch.arena, &config->raw_cmd_line, &(StringJoin){ .sep = str8_lit_comp(" ") }); + tmMessage(0, TMMF_ICON_NOTE, "Command Line: %.*s", str8_varg(cmdl)); + } + #endif + + // inputs + String8List include_symbol_list = config->include_symbol_list; + String8List input_disallow_lib_list = config->disallow_lib_list; + LNK_AltNameList alt_name_list = config->alt_name_list; + LNK_InputLibList input_libs[LNK_InputSource_Count] = {0}; + LNK_InputObjList input_obj_list = {0}; + LNK_InputImportList input_import_list = {0}; + LNK_SymbolList input_defn_list = {0}; + LNK_SymbolList input_weak_list = {0}; + + // :null_obj + lnk_input_obj_list_push(scratch.arena, &input_obj_list); + + // input command line objs + LNK_InputObjList cmd_line_obj_inputs = lnk_input_obj_list_from_string_list(scratch.arena, config->input_list[LNK_Input_Obj]); + lnk_input_obj_list_concat_in_place(&input_obj_list, &cmd_line_obj_inputs); + + // input command 
line libs + input_libs[LNK_InputSource_CmdLine] = config->input_list[LNK_Input_Lib]; + input_libs[LNK_InputSource_Default] = config->input_default_lib_list; + + // state + LNK_SymbolTable *symtab = lnk_symbol_table_alloc_ex(config->symbol_table_cap_defined, config->symbol_table_cap_internal, config->symbol_table_cap_weak, config->symbol_table_cap_lib); + LNK_SectionTable *st = lnk_init_section_table(symtab, config->section_virt_off, config->sect_align, config->file_align); + LNK_ImportTable *imptab_regular = 0; + LNK_ImportTable *imptab_delayed = 0; + LNK_ExportTable *exptab = lnk_export_table_alloc(); + HashTable *disallow_lib_ht = hash_table_init(scratch.arena, 0x100); + HashTable *delay_load_dll_ht = hash_table_init(scratch.arena, 0x100); + HashTable *default_lib_ht = hash_table_init(scratch.arena, 0x100); + HashTable *loaded_lib_ht = hash_table_init(scratch.arena, 0x100); + HashTable *missing_lib_ht = hash_table_init(scratch.arena, 0x100); + HashTable *loaded_obj_ht = hash_table_init(scratch.arena, 0x4000); + LNK_SymbolList lookup_undef_list = {0}; + LNK_SymbolList lookup_weak_list = {0}; + LNK_SymbolList unresolved_undef_list = {0}; + LNK_SymbolList unresolved_weak_list = {0}; + U64 entry_search_attempts = 0; + B32 build_debug_info = lnk_do_debug_info(config); + B32 build_linker_obj = build_debug_info; + B32 build_debug_directory = build_debug_info; + B32 build_res_obj = 1; + B32 discard_meta_data_sections = 1; + B32 merge_sections = !!(config->flags & LNK_ConfigFlag_Merge); + B32 build_cf_guards = 0; // (config->flags != LNK_Guard_NONE); + B32 build_export_table = 1; + B32 build_base_relocs = !(config->flags & LNK_ConfigFlag_Fixed); + B32 report_unresolved_symbols = 1; + B32 check_unused_delay_loads = !!(config->flags & LNK_ConfigFlag_CheckUnusedDelayLoadDll); + B32 do_comdat_rewire = 1; + B32 build_win32_header = 1; + B32 patch_relocs = 1; + B32 sort_exception_info = 1; + B32 build_imp_lib = 1; + LNK_ObjList obj_list = {0}; + LNK_LibList 
lib_index[LNK_InputSource_Count] = {0}; + String8 image_data = str8_zero(); + OS_Handle image_write_thread = {0}; + + // init state machine + struct StateList state_list = {0}; + state_list_push(scratch.arena, state_list, State_InputObjs); + state_list_push(scratch.arena, state_list, State_InputLibs); + state_list_push(scratch.arena, state_list, State_PushLinkerSymbols); + if (config->delay_load_dll_list.node_count) { + for (String8Node *delay_load_dll_node = config->delay_load_dll_list.first; + delay_load_dll_node != 0; + delay_load_dll_node = delay_load_dll_node->next) { + hash_table_push_path_u64(scratch.arena, delay_load_dll_ht, delay_load_dll_node->string, 0); + } + state_list_push(scratch.arena, state_list, State_PushDllHelperUndefSymbol); + } + if (config->guard_flags != LNK_Guard_None) { + state_list_push(scratch.arena, state_list, State_PushLoadConfigUndefSymbol); + } + + ProfBegin("Image"); // :EndImage + ProfBegin("Build"); // :EndBuild + lnk_timer_begin(LNK_Timer_Image); + + // run states + for (;;) { + while (state_list.count) { + enum State state = state_list_pop(state_list); + switch (state) { + case State_Null: break; + case State_SearchEntryPoint: { + ProfBegin("Serach Entry Point"); + LNK_Symbol *entry_point_symbol = 0; + + B32 is_entry_point_unspecified = config->entry_point_name.size == 0; + if (is_entry_point_unspecified) { + if (config->subsystem == PE_WindowsSubsystem_UNKNOWN) { + // we don't have a subsystem and entry point name, + // so we loop over every subsystem and search potential entry + // points in the symbol table + for (U64 subsys_idx = 0; subsys_idx < PE_WindowsSubsystem_COUNT; subsys_idx += 1) { + String8Array name_arr = pe_get_entry_point_names(config->machine, (PE_WindowsSubsystem)subsys_idx, config->file_characteristics); + for (U64 entry_idx = 0; entry_idx < name_arr.count; entry_idx += 1) { + entry_point_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Defined, name_arr.v[entry_idx]); + if (entry_point_symbol) { 
+ config->subsystem = (PE_WindowsSubsystem)subsys_idx; + goto dbl_break; + } + } + } + + // search for potential entry points in libs + if (!entry_point_symbol) { + for (U64 subsys_idx = 0; subsys_idx < PE_WindowsSubsystem_COUNT; subsys_idx += 1) { + String8Array name_arr = pe_get_entry_point_names(config->machine, (PE_WindowsSubsystem)subsys_idx, config->file_characteristics); + for (U64 entry_idx = 0; entry_idx < name_arr.count; entry_idx += 1) { + entry_point_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Lib, name_arr.v[entry_idx]); + if (entry_point_symbol) { + config->subsystem = (PE_WindowsSubsystem)subsys_idx; + goto dbl_break; + } + } + } + } + + dbl_break:; + } else { + // we have subsystem but no entry point name, get potential entry point names + // and see which is in the symbol table + String8Array name_arr = pe_get_entry_point_names(config->machine, config->subsystem, config->file_characteristics); + for (U64 entry_idx = 0; entry_idx < name_arr.count; entry_idx += 1) { + LNK_Symbol *symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Defined, name_arr.v[entry_idx]); + if (symbol) { + if (entry_point_symbol) { + lnk_error(LNK_Error_EntryPoint, + "multiple entry point symbols found: %S(%S) and %S(%S)", + entry_point_symbol->name, entry_point_symbol->debug, + symbol->name, symbol->debug); + } else { + entry_point_symbol = symbol; + } + } + } + + // search for entry point in libs + if (!entry_point_symbol) { + for (U64 entry_idx = 0; entry_idx < name_arr.count; entry_idx += 1) { + entry_point_symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Lib, name_arr.v[entry_idx]); + if (entry_point_symbol) { + break; + } + } + } + } + + // redirect user entry to appropriate CRT entry + if (entry_point_symbol) { + config->entry_point_name = entry_point_symbol->name; + if (str8_match(config->entry_point_name, str8_lit("wmain"), 0)) { + config->entry_point_name = str8_lit("wmainCRTStartup"); + } else if 
(str8_match(config->entry_point_name, str8_lit("main"), 0)) { + config->entry_point_name = str8_lit("mainCRTStartup"); + } else if (str8_match(config->entry_point_name, str8_lit("WinMain"), 0)) { + config->entry_point_name = str8_lit("WinMainCRTStartup"); + } else if (str8_match(config->entry_point_name, str8_lit("wWinMain"), 0)) { + config->entry_point_name = str8_lit("wWinMainCRTStartup"); + } + } + } + + // generate undefined symbol so in case obj is in lib it will be linked + if (config->entry_point_name.size) { + str8_list_push(scratch.arena, &include_symbol_list, config->entry_point_name); + } + // no entry point, error and exit + else { + lnk_error(LNK_Error_EntryPoint, "unable to find entry point symbol"); + } + + // by default terminal server is enabled for windows and console applications + if (~config->flags & LNK_ConfigFlag_NoTsAware && + ~config->file_characteristics & PE_ImageFileCharacteristic_FILE_DLL) { + if (config->subsystem == PE_WindowsSubsystem_WINDOWS_GUI || config->subsystem == PE_WindowsSubsystem_WINDOWS_CUI) { + config->dll_characteristics |= PE_DllCharacteristic_TERMINAL_SERVER_AWARE; + } + } + + if (config->subsystem_ver.major == 0 && config->subsystem_ver.minor == 0) { + // subsystem version not specified, set default values + config->subsystem_ver = lnk_get_default_subsystem_version(config->subsystem, config->machine); + } + + // check subsystem version against allowed min version + Version min_subsystem_ver = lnk_get_min_subsystem_version(config->subsystem, config->machine); + int ver_cmp = version_compar(config->subsystem_ver, min_subsystem_ver); + if (ver_cmp < 0) { + lnk_error(LNK_Error_Cmdl, "subsystem version %I64u.%I64u can't be lower than %I64u.%I64u", + config->subsystem_ver.major, config->subsystem_ver.minor, min_subsystem_ver.major, min_subsystem_ver.minor); + } + + ProfEnd(); + } break; + case State_PushDllHelperUndefSymbol: { + ProfBegin("Puhs Dll Helper Undef Symbol"); + + String8 delay_helper_name = str8_zero(); + switch 
(config->machine) { + case COFF_MachineType_X86: delay_helper_name = str8_cstring(LNK_DELAY_LOAD_HELPER2_X86_SYMBOL_NAME); break; + case COFF_MachineType_X64: delay_helper_name = str8_cstring(LNK_DELAY_LOAD_HELPER2_SYMBOL_NAME); break; + default: NotImplemented; + } + + str8_list_push(scratch.arena, &include_symbol_list, delay_helper_name); + ProfEnd(); + } break; + case State_PushLoadConfigUndefSymbol: { + ProfBegin("Push Load Config Undef Symbol"); + String8 load_config_name = str8_lit(LNK_LOAD_CONFIG_SYMBOL_NAME); + str8_list_push(scratch.arena, &include_symbol_list, load_config_name); + ProfEnd(); + } break; + case State_PushLinkerSymbols: { + ProfBegin("Push Linker Symbols"); + lnk_push_linker_symbols(symtab, config->machine); + ProfEnd(); + } break; + case State_InputSymbols: { + ProfBegin("Input Symbols"); + + ProfBegin("Push /INCLUDE Symbols"); + for (String8Node *include_node = include_symbol_list.first; include_node != 0; include_node = include_node->next) { + String8 name = push_str8_copy(symtab->arena, include_node->string); + LNK_Symbol *symbol = lnk_make_undefined_symbol(symtab->arena, name, LNK_SymbolScopeFlag_Main); + lnk_symbol_list_push(scratch.arena, &lookup_undef_list, symbol); + } + ProfEnd(); + + ProfBegin("Push /ALTERNATIVENAME Symbols"); + Assert(alt_name_list.from_list.node_count == alt_name_list.to_list.node_count); + for (String8Node *from_node = alt_name_list.from_list.first, *to_node = alt_name_list.to_list.first; + from_node != 0; + from_node = from_node->next, to_node = to_node->next) { + String8 to_name = push_str8_copy(symtab->arena, to_node->string); + String8 from_name = push_str8_copy(symtab->arena, from_node->string); + LNK_Symbol *fallback = lnk_make_undefined_symbol(symtab->arena, to_name, LNK_SymbolScopeFlag_Main); + LNK_Symbol *weak = lnk_make_weak_symbol(symtab->arena, from_name, COFF_WeakExtType_SEARCH_ALIAS, fallback); + lnk_symbol_list_push(scratch.arena, &input_weak_list, weak); + } + ProfEnd(); + + ProfBegin("Push 
Defined Symbols"); + { + Temp temp = temp_begin(scratch.arena); + + ProfBegin("List -> Array"); + LNK_SymbolNodeArray symbol_arr = lnk_symbol_node_array_from_list(temp.arena, input_defn_list); + ProfEnd(); + + ProfBegin("Hash Symbol Names"); + lnk_symbol_node_ptr_array_hash(tp, symbol_arr.v, symbol_arr.count); + ProfEnd(); + + ProfBegin("Populate Buckets"); + LNK_SymbolList *bucket_arr = push_array(temp.arena, LNK_SymbolList, symtab->bucket_count[LNK_SymbolScopeIndex_Defined]); + for (U64 symbol_idx = 0; symbol_idx < symbol_arr.count; symbol_idx += 1) { + LNK_SymbolNode *symbol_node = symbol_arr.v[symbol_idx]; + U64 bucket_idx = symbol_node->hash % symtab->bucket_count[LNK_SymbolScopeIndex_Defined]; + lnk_symbol_list_push_node(&bucket_arr[bucket_idx], symbol_node); + } + ProfEnd(); + + ProfBegin("Insert Defined Symbols"); + LNK_DefinedSymbolInserter symbol_inserter = {0}; + symbol_inserter.symtab = symtab; + symbol_inserter.bucket_arr = bucket_arr; + symbol_inserter.range_arr = tp_divide_work(temp.arena, symtab->bucket_count[LNK_SymbolScopeIndex_Defined], tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_defined_symbol_inserter, &symbol_inserter); + ProfEnd(); + + temp_end(temp); + } + ProfEnd(); + + ProfBegin("Push Weak Symbols"); + for (LNK_SymbolNode *curr = input_weak_list.first; curr != 0; curr = curr->next) { + lnk_symbol_table_push(symtab, curr->data); + } + ProfEnd(); + + LNK_SymbolList new_weak_symbols = lnk_symbol_list_copy(scratch.arena, input_weak_list); + + // we defined new symbols, give unresolved symbols another chance to be resolved + lnk_symbol_list_concat_in_place(&lookup_undef_list, &unresolved_undef_list); + lnk_symbol_list_concat_in_place(&lookup_weak_list, &new_weak_symbols); + lnk_symbol_list_concat_in_place(&lookup_weak_list, &unresolved_weak_list); + + // reset inputs + MemoryZeroStruct(&include_symbol_list); + MemoryZeroStruct(&alt_name_list); + MemoryZeroStruct(&input_defn_list); + MemoryZeroStruct(&input_weak_list); + + 
ProfEnd(); + } break; + case State_InputImports: { + ProfBegin("Input Imports"); + for (LNK_InputImport *input = input_import_list.first; input != 0; input = input->next) { + COFF_ImportHeader *import_header = &input->import_header; + KeyValuePair *is_delayed = hash_table_search_path(delay_load_dll_ht, import_header->dll_name); + + if (is_delayed) { + if (!imptab_delayed) { + Assert(config->machine != COFF_MachineType_UNKNOWN); + B32 is_unloadable = !!(config->flags & LNK_ConfigFlag_DelayUnload); + B32 is_bindable = !!(config->flags & LNK_ConfigFlag_DelayBind); + imptab_delayed = lnk_import_table_alloc_delayed(st, symtab, config->machine, is_unloadable, is_bindable); + } + LNK_ImportDLL *dll = lnk_import_table_search_dll(imptab_delayed, import_header->dll_name); + if (!dll) { + dll = lnk_import_table_push_dll_delayed(imptab_delayed, symtab, import_header->dll_name, import_header->machine); + } + LNK_ImportFunc *func = lnk_import_table_search_func(dll, import_header->func_name); + if (!func) { + func = lnk_import_table_push_func_delayed(imptab_delayed, symtab, dll, import_header); + } + } else { + if (!imptab_regular) { + Assert(config->machine != COFF_MachineType_UNKNOWN); + imptab_regular = lnk_import_table_alloc_regular(st, symtab, config->machine); + } + LNK_ImportDLL *dll = lnk_import_table_search_dll(imptab_regular, import_header->dll_name); + if (!dll) { + dll = lnk_import_table_push_dll_regular(imptab_regular, symtab, import_header->dll_name, import_header->machine); + } + LNK_ImportFunc *func = lnk_import_table_search_func(dll, import_header->func_name); + if (!func) { + func = lnk_import_table_push_func_regular(imptab_regular, symtab, dll, import_header); + } + } + } + + // reset input + MemoryZeroStruct(&input_import_list); + + ProfEnd(); + } break; + case State_InputDisallowLibs: { + ProfBegin("Input /disallowlib"); + + for (String8Node *name_n = input_disallow_lib_list.first; name_n != 0; name_n = name_n->next) { + if ( ! 
lnk_is_lib_disallowed(disallow_lib_ht, name_n->string)) { + lnk_push_disallow_lib(scratch.arena, disallow_lib_ht, name_n->string); + } + } + + // reset input + MemoryZeroStruct(&input_disallow_lib_list); + + ProfEnd(); + } break; + case State_InputObjs: { + ProfBegin("Input Objs [Count %llu]", input_obj_list.count); + + ProfBegin("Gather Objs"); + LNK_InputObjList unique_obj_input_list = {0}; + for (LNK_InputObj *input = input_obj_list.first, *next; input != 0; input = next) { + next = input->next; + + B32 was_obj_loaded = hash_table_search_path_u64(loaded_obj_ht, input->dedup_id, 0); + if (was_obj_loaded) { + continue; + } + + String8 full_path = os_make_full_path(scratch.arena, input->dedup_id); + B32 was_full_path_used = hash_table_search_path_u64(loaded_obj_ht, full_path, 0); + if (was_full_path_used) { + continue; + } + + hash_table_push_path_u64(scratch.arena, loaded_obj_ht, input->dedup_id, 0); + if (!str8_match(input->dedup_id, full_path, StringMatchFlag_CaseInsensitive|StringMatchFlag_SlashInsensitive)) { + hash_table_push_path_u64(scratch.arena, loaded_obj_ht, full_path, 0); + } + + lnk_input_obj_list_push_node(&unique_obj_input_list, input); + lnk_log(LNK_Log_InputObj, "Input Obj: %S", full_path); + } + ProfEnd(); + + ProfBegin("Load Objs From Disk"); + LNK_InputObj **input_obj_arr = lnk_array_from_input_obj_list(scratch.arena, unique_obj_input_list); + tp_for_parallel(tp, tp_arena, unique_obj_input_list.count, lnk_load_thin_objs_task, input_obj_arr); + ProfEnd(); + + ProfBegin("Disk Read Check"); + for (U64 input_idx = 0; input_idx < unique_obj_input_list.count; ++input_idx) { + if (input_obj_arr[input_idx]->has_disk_read_failed) { + lnk_error(LNK_Error_InvalidPath, "unable to find obj \"%S\"", input_obj_arr[input_idx]->path); + } + } + ProfEnd(); + + LNK_ObjNodeArray obj_node_arr = lnk_obj_list_push_parallel(tp, tp_arena, &obj_list, st, unique_obj_input_list.count, input_obj_arr); + + ProfBegin("Machine Compat Check"); + for (U64 obj_idx = 0; obj_idx < 
obj_node_arr.count; ++obj_idx) { + LNK_Obj *obj = &obj_node_arr.v[obj_idx].data; + + // derive machine from obj + if (config->machine == COFF_MachineType_UNKNOWN) { + config->machine = obj->machine; + } + + // is obj machine compatible? + if (obj->machine != COFF_MachineType_UNKNOWN && // obj with unknown machine type is compatible with any other machine type + config->machine != obj->machine) { + lnk_error(LNK_Error_IncompatibleObj, + "conflicting machine types expected %S but got %S in obj %S", + coff_string_from_machine_type(config->machine), + coff_string_from_machine_type(obj->machine), + obj->path); + } + } + ProfEnd(); + + ProfBegin("Collect Directives"); + for (U64 i = 0; i < obj_node_arr.count; ++i) { + LNK_Obj *obj = &obj_node_arr.v[i].data; + str8_list_concat_in_place(&include_symbol_list, &obj->include_symbol_list); + lnk_alt_name_list_concat_in_place(&alt_name_list, &obj->alt_name_list); + for (LNK_Directive *dir = obj->directive_info.v[LNK_Directive_DisallowLib].first; dir != 0; dir = dir->next) { + str8_list_concat_in_place(&input_disallow_lib_list, &dir->value_list); + } + } + ProfEnd(); + + // gather libs for input + LNK_InputLibList lib_list = lnk_collect_default_lib_obj_arr(tp, tp_arena, obj_node_arr); // TODO: put these on temp arena + str8_list_concat_in_place(&input_libs[LNK_InputSource_Obj], &lib_list); + + // gather symbols for input + LNK_SymbolList new_defn_list = lnk_run_symbol_collector(tp, tp_arena, obj_node_arr, LNK_Symbol_DefinedExtern); + LNK_SymbolList new_weak_list = lnk_run_symbol_collector(tp, tp_arena, obj_node_arr, LNK_Symbol_Weak); + LNK_SymbolList new_undef_list = lnk_run_symbol_collector(tp, tp_arena, obj_node_arr, LNK_Symbol_Undefined); // TODO: allocate these on temp arena + + // schedule symbol input + lnk_symbol_list_concat_in_place(&input_defn_list, &new_defn_list); + lnk_symbol_list_concat_in_place(&input_weak_list, &new_weak_list); + lnk_symbol_list_concat_in_place(&lookup_undef_list, &new_undef_list); + + // reset 
input objs + MemoryZeroStruct(&input_obj_list); + + if (lnk_get_log_status(LNK_Log_InputObj)) { + U64 input_size = 0; + for (U64 i = 0; i < obj_node_arr.count; ++i) { + input_size += obj_node_arr.v[i].data.data.size; + } + String8 input_size_string = str8_from_memory_size2(scratch.arena, input_size); + lnk_log(LNK_Log_InputObj, "[ Obj Input Size %S ]", input_size_string); + } + + ProfEnd(); + } break; + case State_InputLibs: { + ProfBegin("Input Libs"); + + for (U64 input_source = 0; input_source < ArrayCount(input_libs); ++input_source) { + LNK_InputLibList input_lib_list = input_libs[input_source]; + + ProfBegin("Remove Duplicte Input Paths"); + LNK_InputLibList unique_input_lib_list = {0}; + for (LNK_InputLib *input = input_lib_list.first; input != 0; input = input->next) { + String8 path = input->string; + + if (lnk_is_lib_disallowed(disallow_lib_ht, path)) { + continue; + } + if (lnk_is_lib_loaded(default_lib_ht, loaded_lib_ht, input_source, path)) { + continue; + } + + // search disk for library + String8List match_list = os_file_search(scratch.arena, config->lib_dir_list, path); + String8 absolute_path = match_list.node_count ? 
match_list.first->string : str8(0,0); + + // default to first match + if (lnk_is_lib_loaded(default_lib_ht, loaded_lib_ht, input_source, absolute_path)) { + continue; + } + + // warn about missing lib + if (match_list.node_count == 0) { + KeyValuePair *was_reported = hash_table_search_path(missing_lib_ht, path); + if (!was_reported) { + hash_table_push_path_u64(scratch.arena, missing_lib_ht, path, 0); + lnk_error(LNK_Warning_FileNotFound, "unable to find library `%S`", path); + } + continue; + } + + // warn about multiple matches + if (match_list.node_count > 1) { + lnk_error(LNK_Warning_MultipleLibMatch, "multiple libs match `%S` (picking first match)", path); + lnk_supplement_error_list(match_list); + } + + // save paths for future checks + lnk_push_loaded_lib(scratch.arena, default_lib_ht, loaded_lib_ht, path); + lnk_push_loaded_lib(scratch.arena, default_lib_ht, loaded_lib_ht, absolute_path); + + // push library for loading + str8_list_push(scratch.arena, &unique_input_lib_list, absolute_path); + + lnk_log(LNK_Log_InputLib, "Input Lib: %S", absolute_path); + } + ProfEnd(); + + LNK_LibNodeArray lib_arr; + { + ProfBegin("Disk Read Libs"); + String8Array path_arr = str8_array_from_list(scratch.arena, &unique_input_lib_list); + String8Array data_arr = os_data_from_file_path_parallel(tp, tp_arena->v[0], path_arr); + ProfEnd(); + + ProfBegin("Lib Init"); + lib_arr = lnk_lib_list_push_parallel(tp, tp_arena, &lib_index[input_source], data_arr, path_arr); + ProfEnd(); + + ProfBegin("Count Symbols"); + U64 total_symbol_count = 0; + for (U64 lib_idx = 0; lib_idx < lib_arr.count; lib_idx += 1) { + total_symbol_count += lib_arr.v[lib_idx].data.symbol_count; + } + ProfEnd(); + + ProfBegin("Setup Symbol Array Pointers"); + LNK_Symbol *symbol_arr = push_array_no_zero(symtab->arena, LNK_Symbol, total_symbol_count); + LNK_Symbol **symbol_arr_arr = push_array_no_zero(scratch.arena, LNK_Symbol *, lib_arr.count); + for (U64 lib_idx = 0, cursor = 0; lib_idx < lib_arr.count; lib_idx 
+= 1) { + symbol_arr_arr[lib_idx] = &symbol_arr[cursor]; + cursor += lib_arr.v[lib_idx].data.symbol_count; + } + ProfEnd(); + + ProfBegin("Lazy Symbol Init"); + LNK_LazyIniter lazy_initer_ctx = {0}; + lazy_initer_ctx.range_arr = tp_divide_work(scratch.arena, lib_arr.count, tp->worker_count); + lazy_initer_ctx.lib_arr = lib_arr.v; + lazy_initer_ctx.symbol_arr_arr = symbol_arr_arr; + tp_for_parallel(tp, 0, tp->worker_count, lnk_lazy_initer, &lazy_initer_ctx); + ProfEnd(); + + lnk_symbol_table_push_lazy_arr(tp, symtab, symbol_arr, total_symbol_count); + } + + if (lnk_get_log_status(LNK_Log_InputLib)) { + if (lib_arr.count > 0) { + U64 input_size = 0; + for (U64 i = 0; i < lib_arr.count; ++i) { + input_size += lib_arr.v[i].data.data.size; + } + String8 input_size_string = str8_from_memory_size2(scratch.arena, input_size); + // report on the lib channel, matching the status check that guards this block + lnk_log(LNK_Log_InputLib, "[ Lib Input Size %S ]", input_size_string); + } + } + } + + // reset input libs + MemoryZeroArray(input_libs); + + ProfEnd(); + } break; + case State_BuildAndInputResObj: { + String8List res_data_list = {0}; + String8List res_path_list = {0}; + + ProfBegin("Build * Resources *"); + { + // load .res from disk + for (String8Node *node = config->input_list[LNK_Input_Res].first; node != 0; node = node->next) { + String8 res_data = os_data_from_file_path(tp_arena->v[0], node->string); + if (res_data.size > 0) { + if (pe_is_res(res_data)) { + String8 stable_res_path = lnk_make_full_path(tp_arena->v[0], config->work_dir, config->path_style, node->string); + str8_list_push(scratch.arena, &res_path_list, stable_res_path); + str8_list_push(tp_arena->v[0], &res_data_list, res_data); + } else { + lnk_error(LNK_Error_IllData, "file is not of RES format: %S", node->string); + } + } else { + lnk_error(LNK_Error_FileNotFound, "unable to open res file: %S", node->string); + } + } + } + ProfEnd(); + + // handle manifest + ProfBegin("Manifest"); + { + LNK_Obj **obj_arr = lnk_obj_arr_from_list(scratch.arena, obj_list); + String8List 
obj_dep_list = lnk_collect_manifest_dependency_list(tp, tp_arena, obj_arr, obj_list.count); + String8List cmd_dep_list = str8_list_copy(scratch.arena, &config->manifest_dependency_list); + + String8List dep_list = {0}; + str8_list_concat_in_place(&dep_list, &obj_dep_list); + str8_list_concat_in_place(&dep_list, &cmd_dep_list); + + B32 create_manifest = config->input_list[LNK_Input_Manifest].node_count > 0 || + dep_list.node_count > 0 || + config->manifest_opt == LNK_ManifestOpt_Embed; + if (create_manifest) { + String8List input_manifest_path_list = str8_list_copy(tp_arena->v[0], &config->input_list[LNK_Input_Manifest]); + + // TODO: we write a temp file with manifest attributes collected from obj directives and command line switches + // so we can pass file to mt.exe or llvm-mt.exe, when we have our own tool for merging manifest we can switch + // to writing manifest file in memory to skip roun-trip to disk + String8List linker_manifest_data_list = lnk_make_linker_manifest(tp_arena->v[0], config->manifest_uac, config->manifest_level, config->manifest_ui_access, dep_list); + String8 linker_manifest_path = push_str8f(scratch.arena, "%S.manifest.temp", config->manifest_name); + lnk_write_data_list_to_file_path(linker_manifest_path, linker_manifest_data_list); + str8_list_push(tp_arena->v[0], &input_manifest_path_list, linker_manifest_path); + + String8 manifest_path = lnk_merge_manifest_files(tp_arena->v[0], config->mt_path, config->manifest_name, input_manifest_path_list); + + if (config->manifest_opt == LNK_ManifestOpt_Embed) { + // TODO: currently we convert manifest to res and parse res again, this unnecessary instead push manifest + // resource to the tree directly + + String8 manifest_data = os_data_from_file_path(scratch.arena, manifest_path); + if (manifest_data.size == 0) { + lnk_error(LNK_Error_Mt, "unable to locate manifest to embed on disk, path \"%S\"", manifest_path); + } + String8 manifest_res = lnk_res_from_data(tp_arena->v[0], manifest_data); + 
str8_list_push(tp_arena->v[0], &res_data_list, manifest_res); + str8_list_push(tp_arena->v[0], &res_path_list, manifest_path); + } + + // cleanup disk + os_delete_file_at_path(linker_manifest_path); + if (config->delete_manifest) { + os_delete_file_at_path(manifest_path); + } + } + } + ProfEnd(); // Manifest + + if (res_data_list.node_count > 0) { + String8 obj_name = str8_lit("* Resources *"); + String8 obj_data = lnk_obj_from_res_file_list(tp, + tp_arena->v[0], + st, + symtab, + res_data_list, + res_path_list, + config->machine, + config->time_stamp, + config->work_dir, + config->path_style, + obj_name); + + LNK_InputObj *input = lnk_input_obj_list_push(scratch.arena, &input_obj_list); + input->dedup_id = obj_name; + input->path = obj_name; + input->data = obj_data; + } + } break; + case State_BuildAndInputLinkerObj: { + ProfBegin("Build * Linker * Obj"); + + String8 obj_name = str8_lit("* Linker *"); + + StringJoin join = { str8_lit_comp(""), str8_lit_comp(" "), str8_lit_comp("") }; + String8 raw_cmd_line = str8_list_join(scratch.arena, &config->raw_cmd_line, &join); + + String8 obj_data = lnk_make_linker_coff_obj(tp, scratch.arena, config->time_stamp, config->machine, config->work_dir, config->image_name, config->pdb_name, raw_cmd_line, obj_name); + + LNK_InputObj *input = lnk_input_obj_list_push(scratch.arena, &input_obj_list); + input->dedup_id = obj_name; + input->path = obj_name; + input->data = obj_data; + + ProfEnd(); + } break; + case State_LookupUndef: { + ProfBegin("Lookup Undefined Symbols"); + // search archives + LNK_SymbolFinderResult result = lnk_run_symbol_finder(tp, tp_arena, config->path_style, symtab, lookup_undef_list, lnk_undef_symbol_finder); // TODO: put these on temp arena + + // new inputs found + input_obj_list = result.input_obj_list; + input_import_list = result.input_import_list; + + // undefined symbols that weren't resolved + lnk_symbol_list_concat_in_place(&unresolved_undef_list, &result.unresolved_symbol_list); + + // reset input 
+ MemoryZeroStruct(&lookup_undef_list); + ProfEnd(); + } break; + case State_LookupWeak: { + ProfBegin("Lookup Weak Symbols"); + // search archives + LNK_SymbolFinderResult result = lnk_run_symbol_finder(tp, tp_arena, config->path_style, symtab, lookup_weak_list, lnk_weak_symbol_finder); // TODO: put these on temp arena + + // schedule new inputs + input_obj_list = result.input_obj_list; + input_import_list = result.input_import_list; + + // weak symbols that weren't resolved + lnk_symbol_list_concat_in_place(&unresolved_weak_list, &result.unresolved_symbol_list); + + // reset input + MemoryZeroStruct(&lookup_weak_list); + ProfEnd(); + } break; + case State_CheckUnusedDelayLoads: { + if (imptab_delayed) { + for (String8Node *node = config->delay_load_dll_list.first; node != 0; node = node->next) { + LNK_ImportDLL *dll = lnk_import_table_search_dll(imptab_delayed, node->string); + if (dll == 0) { + lnk_error(LNK_Warning_UnusedDelayLoadDll, "/DELAYLOAD: %S found no imports", node->string); + } + } + } + } break; + case State_ReportUnresolvedSymbols: { + // report unresolved symbols + for (LNK_SymbolNode *node = unresolved_undef_list.first; node != 0; node = node->next) { + lnk_error(LNK_Error_UnresolvedSymbol, "unresolved symbol %S", node->data->name); + } + if (unresolved_undef_list.count) { + goto exit; + } + } break; + case State_RewireComdats: { + ProfBegin("Fold COMDAT symbols"); + lnk_fold_comdat_chunks(tp, symtab); + ProfEnd(); + } break; + + case State_DiscardMetaDataSections: { + ProfBegin("Discard Meta Data Sections"); + lnk_discard_meta_data_sections(st); + ProfEnd(); + } break; + case State_BuildDebugDirectory: { + ProfBegin("Build Debug Directory"); + + // push debug directory layout chunks + LNK_Section *debug_sect = lnk_section_table_search(st, str8_lit(".rdata")); + LNK_Chunk *debug_chunk = lnk_section_push_chunk_list(debug_sect, debug_sect->root, str8(0,0)); + LNK_Chunk *debug_dir_array_chunk = lnk_section_push_chunk_list(debug_sect, debug_chunk, 
str8(0,0)); + + // push symbols for PE directory patch + LNK_Symbol *dir_array_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_DEBUG_DIR_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, debug_dir_array_chunk, 0, 0, 0); + lnk_symbol_table_push(symtab, dir_array_symbol); + + // debug entry for PDB + if (config->debug_mode != LNK_DebugMode_None && config->debug_mode != LNK_DebugMode_Null) { + lnk_build_debug_pdb(st, symtab, debug_sect, debug_dir_array_chunk, config->time_stamp, config->guid, config->age, config->pdb_name); + } + + // debug entry for RDI + if (config->rad_debug == LNK_SwitchState_Yes) { + lnk_build_debug_rdi(st, symtab, debug_sect, debug_dir_array_chunk, config->time_stamp, config->guid, config->rad_debug_name); + } + + ProfEnd(); + } break; + case State_BuildExportTable: { + ProfBegin("Build Export Table"); + + lnk_collect_exports_from_obj_directives(exptab, obj_list, symtab); + lnk_build_edata(exptab, st, symtab, config->image_name, config->machine); + + ProfEnd(); + } break; + case State_MergeSections: { + ProfBegin("Merge Sections"); + LNK_MergeDirectiveList merge_list = lnk_init_merge_directive_list(scratch.arena, obj_list); + lnk_section_table_merge(st, merge_list); + ProfEnd(); + } break; + case State_BuildCFGuards: { + ProfBegin("Build CF Guards"); + B32 emit_suppress_flag = 1; // MSVC emits this flag but every entry has zero set. 
+ lnk_build_guard_tables(tp, st, symtab, exptab, obj_list, config->machine, config->entry_point_name, config->guard_flags, emit_suppress_flag); + ProfEnd(); + } break; + case State_BuildBaseRelocs: { + ProfBegin("Base Relocs"); + lnk_build_base_relocs(tp, tp_arena, st, symtab, config->machine, config->page_size, obj_list); + ProfEnd(); + } break; + case State_BuildWin32Header: { + ProfBegin("Build Win32 Header"); + + // remove empty section headers from output image + lnk_section_table_remove_empties(st, symtab); + + // gather output sections + LNK_SectionArray out_sect_arr = lnk_section_table_get_output_sections(scratch.arena, st); + + // push back null section where we store image header + LNK_Section *header_sect = lnk_section_table_push_null(st); + + // fill out header section with win32 image header data + lnk_build_win32_image_header(symtab, header_sect, header_sect->root, config, out_sect_arr); + + ProfEnd(); + } break; + case State_PatchRelocs: { + ProfBegin("Patch Relocs"); + U64 base_addr = lnk_get_base_addr(config); + lnk_section_table_build_data(tp, st, config->machine); + lnk_section_table_assign_indices(st); + lnk_section_table_assign_virtual_offsets(st); + lnk_section_table_assign_file_offsets(st); + lnk_patch_relocs_obj(tp, obj_list, symtab, st, base_addr); + lnk_patch_relocs(tp, symtab, st, base_addr); + ProfEnd(); + } break; + case State_SortExceptionInfo: { + ProfBegin("Sort Exception Info"); + LNK_Symbol *pdata_symbol = lnk_symbol_table_searchf(symtab, LNK_SymbolScopeFlag_Internal, LNK_PDATA_SYMBOL_NAME); + if (pdata_symbol) { + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + String8 pdata = lnk_data_from_chunk_ref_no_pad(sect_id_map, pdata_symbol->u.defined.u.chunk->ref); + + switch (config->machine) { + case COFF_MachineType_X86: + case COFF_MachineType_X64: { + U64 count = pdata.size / sizeof(PE_IntelPdata); + radsort((PE_IntelPdata *)pdata.str, count, lnk_pdata_is_before_x8664); + } break; + case 
COFF_MachineType_ARM64: + case COFF_MachineType_ARM: { + Assert(!"TOOD: ARM"); + } break; + case COFF_MachineType_MIPSFPU: + case COFF_MachineType_MIPS16: + case COFF_MachineType_MIPSFPU16: { + Assert(!"TODO: MIPS"); + } break; + } + } + ProfEnd(); + } break; + case State_WriteImage: { + ProfEnd(); // :EndBuild + + if (lnk_get_log_status(LNK_Log_InputObj)) { + U64 total_input_size = 0; + for (LNK_ObjNode *obj_n = obj_list.first; obj_n != 0; obj_n = obj_n->next) { + total_input_size += obj_n->data.data.size; + } + String8 size_string = str8_from_memory_size2(scratch.arena, total_input_size); + lnk_log(LNK_Log_InputObj, "[Total Obj Input Size %S]", size_string); + } + if (lnk_get_log_status(LNK_Log_InputLib)) { + U64 total_input_size = 0; + for (U64 i = 0; i < ArrayCount(lib_index); ++i) { + LNK_LibList list = lib_index[i]; + for (LNK_LibNode *lib_n = list.first; lib_n != 0; lib_n = lib_n->next) { + total_input_size += lib_n->data.data.size; + } + } + String8 size_string = str8_from_memory_size2(scratch.arena, total_input_size); + lnk_log(LNK_Log_InputLib, "[Total Lib Input Size %S]", size_string); + } + + ProfBegin("Image Serialize"); + image_data = lnk_section_table_serialize(scratch.arena, st); + ProfEnd(); + + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + + if (config->flags & LNK_ConfigFlag_WriteImageChecksum) { + ProfBegin("Image Checksum"); + + U32 image_checksum = pe_compute_checksum(image_data.str, image_data.size); + + LNK_Symbol *checksum_symbol = lnk_symbol_table_searchf(symtab, LNK_SymbolScopeFlag_Internal, LNK_PE_CHECKSUM_SYMBOL_NAME); + U64 checksum_foff = lnk_file_off_from_symbol(sect_id_map, checksum_symbol); + + U32 *checksum_ptr = (U32 *)(image_data.str + checksum_foff); + *checksum_ptr = image_checksum; + + ProfEnd(); + } + + switch (config->guid_type) { + case LNK_DebugInfoGuid_Null: break; + case Lnk_DebugInfoGuid_ImageBlake3: { + ProfBegin("Hash Image With Blake3"); + LNK_Symbol *guid_symbol = 
lnk_symbol_table_searchf(symtab, LNK_SymbolScopeFlag_Internal, LNK_CV_HEADER_GUID_SYMBOL_NAME); + U64 guid_foff = lnk_file_off_from_symbol(sect_id_map, guid_symbol); + + U128 hash = lnk_blake3_hash_parallel(tp, 128, image_data); + + OS_Guid *guid_ptr = (OS_Guid *)(image_data.str + guid_foff); + MemoryCopy(guid_ptr, hash.u64, sizeof(hash.u64)); + + ProfEnd(); + } break; + } + + LNK_WriteThreadContext *ctx = push_array(scratch.arena, LNK_WriteThreadContext, 1); + ctx->path = config->image_name; + ctx->data = image_data; + image_write_thread = os_thread_launch(lnk_write_thread, ctx, 0); + + lnk_timer_end(LNK_Timer_Image); + ProfEnd(); // :EndImage + } break; + case State_BuildImpLib: { + ProfBegin("Build Imp Lib"); + lnk_timer_begin(LNK_Timer_Lib); + String8List lib_list = lnk_build_import_lib(tp, tp_arena, config->machine, config->time_stamp, config->imp_lib_name, config->image_name, exptab); + lnk_write_data_list_to_file_path(config->imp_lib_name, lib_list); + lnk_timer_end(LNK_Timer_Lib); + ProfEnd(); + } break; + case State_BuildDebugInfo: { + ProfBegin("Debug Info"); + lnk_timer_begin(LNK_Timer_Debug); + + LNK_CodeViewInput input = lnk_make_code_view_input(tp, tp_arena, config->lib_dir_list, obj_list); + CV_DebugT *types = lnk_import_types(tp, tp_arena, &input); + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + + if (config->rad_debug == LNK_SwitchState_Yes) { + lnk_timer_begin(LNK_Timer_Rdi); + RDI_Arch arch = rdi_arch_from_coff_machine(config->machine); + LNK_SectionArray image_sects = lnk_section_table_get_output_sections(scratch.arena, st); + + String8List rdi_data = lnk_build_rad_debug_info(tp, + tp_arena, + config->target_os, + arch, + config->image_name, + image_data, + image_sects, + sect_id_map, + input.count, + input.obj_arr, + input.debug_s_arr, + input.total_symbol_input_count, + input.symbol_inputs, + input.parsed_symbols, + types); + + lnk_write_data_list_to_file_path(config->rad_debug_name, rdi_data); + 
lnk_timer_end(LNK_Timer_Rdi); + } + + // TODO: Parallel debug info builds are currently blocked by the patch + // strings in $$FILE_CHECKSUM step in `lnk_process_c13_data_task`. + if (config->debug_mode == LNK_DebugMode_Full || config->debug_mode == LNK_DebugMode_GHash) { + lnk_timer_begin(LNK_Timer_Pdb); + String8List pdb_data = lnk_build_pdb(tp, + tp_arena, + config->guid, + config->machine, + config->time_stamp, + config->age, + config->pdb_page_size, + config->pdb_name, + config->lib_dir_list, + config->natvis_list, + symtab, + sect_id_map, + input.count, + input.obj_arr, + input.debug_s_arr, + input.total_symbol_input_count, + input.symbol_inputs, + input.parsed_symbols, + types); + + lnk_write_data_list_to_file_path(config->pdb_name, pdb_data); + lnk_timer_end(LNK_Timer_Pdb); + } else if (config->debug_mode == LNK_DebugMode_FastLink) { + lnk_not_implemented("FASTLINK"); + } + + lnk_timer_end(LNK_Timer_Debug); + ProfEnd(); + } break; + } + } + + if (input_disallow_lib_list.node_count) { + state_list_push(scratch.arena, state_list, State_InputDisallowLibs); + continue; + } + if (input_import_list.count) { + state_list_push(scratch.arena, state_list, State_InputImports); + continue; + } + if (input_defn_list.count || + input_weak_list.count || + include_symbol_list.node_count || + alt_name_list.from_list.node_count) { + state_list_push(scratch.arena, state_list, State_InputSymbols); + continue; + } + if (input_obj_list.count) { + state_list_push(scratch.arena, state_list, State_InputObjs); + continue; + } + { + B32 have_pending_lib_inputs = 0; + for (U64 i = 0; i < ArrayCount(input_libs); ++i) { + if (input_libs[i].node_count) { + have_pending_lib_inputs = 1; + break; + } + } + if (have_pending_lib_inputs) { + state_list_push(scratch.arena, state_list, State_InputLibs); + continue; + } + } + if (lookup_undef_list.count) { + state_list_push(scratch.arena, state_list, State_LookupUndef); + continue; + } + if (lookup_weak_list.count) { + 
state_list_push(scratch.arena, state_list, State_LookupWeak); + continue; + } + if (unresolved_weak_list.count) { + // we can't find strong definitions for unresolved weak symbols + // so now we have to use fallback symbols + MemoryZeroStruct(&unresolved_weak_list); + + // make sure fallback symbols are defined, if not try to find definitions + for (LNK_SymbolNode *symbol_n = unresolved_weak_list.first; symbol_n != 0; symbol_n = symbol_n->next) { + if (symbol_n->data->u.weak.fallback_symbol->type == LNK_Symbol_Undefined) { + lnk_symbol_list_push(scratch.arena, &lookup_undef_list, symbol_n->data->u.weak.fallback_symbol); + } + } + + continue; + } + if (entry_search_attempts == 0) { + state_list_push(scratch.arena, state_list, State_SearchEntryPoint); + entry_search_attempts += 1; + continue; + } + if (build_res_obj) { + build_res_obj = 0; + state_list_push(scratch.arena, state_list, State_BuildAndInputResObj); + continue; + } + if (build_linker_obj) { + build_linker_obj = 0; + state_list_push(scratch.arena, state_list, State_BuildAndInputLinkerObj); + continue; + } + if (check_unused_delay_loads) { + check_unused_delay_loads = 0; + state_list_push(scratch.arena, state_list, State_CheckUnusedDelayLoads); + continue; + } + if (unresolved_undef_list.count) { + if (report_unresolved_symbols) { + report_unresolved_symbols = 0; + state_list_push(scratch.arena, state_list, State_ReportUnresolvedSymbols); + continue; + } + } + if (do_comdat_rewire) { + do_comdat_rewire = 0; + state_list_push(scratch.arena, state_list, State_RewireComdats); + continue; + } + + /// --- inputs are ready --- + + if (discard_meta_data_sections) { + discard_meta_data_sections = 0; + state_list_push(scratch.arena, state_list, State_DiscardMetaDataSections); + continue; + } + if (build_debug_directory) { + build_debug_directory = 0; + state_list_push(scratch.arena, state_list, State_BuildDebugDirectory); + continue; + } + if (build_export_table) { + build_export_table = 0; + 
state_list_push(scratch.arena, state_list, State_BuildExportTable); + continue; + } + if (merge_sections) { + merge_sections = 0; + state_list_push(scratch.arena, state_list, State_MergeSections); + continue; + } + if (build_cf_guards) { + build_cf_guards = 0; + state_list_push(scratch.arena, state_list, State_BuildCFGuards); + continue; + } + if (build_base_relocs) { + build_base_relocs = 0; + state_list_push(scratch.arena, state_list, State_BuildBaseRelocs); + continue; + } + if (build_win32_header) { + build_win32_header = 0; + state_list_push(scratch.arena, state_list, State_BuildWin32Header); + continue; + } + if (patch_relocs) { + patch_relocs = 0; + state_list_push(scratch.arena, state_list, State_PatchRelocs); + continue; + } + if (sort_exception_info) { + sort_exception_info = 0; + state_list_push(scratch.arena, state_list, State_SortExceptionInfo); + continue; + } + if (image_data.size == 0) { + state_list_push(scratch.arena, state_list, State_WriteImage); + continue; + } + if (build_imp_lib) { + build_imp_lib = 0; + if (config->file_characteristics & PE_ImageFileCharacteristic_FILE_DLL) { + state_list_push(scratch.arena, state_list, State_BuildImpLib); + continue; + } + } + if (build_debug_info) { + build_debug_info = 0; + state_list_push(scratch.arena, state_list, State_BuildDebugInfo); + continue; + } + + // wait for the thread to finish writing image to disk + os_thread_join(image_write_thread, -1); + + break; + } + + if (lnk_get_log_status(LNK_Log_SizeBreakdown)) { + lnk_log_size_breakdown(st, symtab); + } + if (lnk_get_log_status(LNK_Log_LinkStats)) { + lnk_log_link_stats(obj_list, lib_index, st); + } + if (lnk_get_log_status(LNK_Log_Timers)) { + lnk_log_timers(); + } + +exit:; + + // linker is done punt memory release to OS + //lnk_section_table_release(&st); + //lnk_symbol_table_release(&symtab); + //lnk_export_table_release(&export_table); + //lnk_import_table_release(&imptab_regular); + //lnk_import_table_release(&imptab_delayed); + 
//tp_arena_release(&tp_arena); + + scratch_end(scratch); + ProfEnd(); + +#undef state_list_push +#undef state_list_pop +} + +// Program entry: rebuilds argc/argv as UTF-8 strings from the Win32 command line and hands them to lnk_run. +// The `cmdline` parameter is not read here (see the TODO below). +internal void +entry_point(CmdLine *cmdline) +{ + Temp scratch = scratch_begin(0,0); + +#if PROFILE_TELEMETRY + tmMessage(0, TMMF_ICON_NOTE, BUILD_TITLE); +#endif + + // TODO: temp hack to make custom command line work while syncing with latest code base changes + int argc; + char **argv; + { + LPWSTR w32_cmd_line = GetCommandLineW(); + argc = 0; + LPWSTR *argvw = CommandLineToArgvW(w32_cmd_line, &argc); + if (argvw == 0) { + // parsing failed: fall back to an empty argument list so the loop below never reads through a null array + argc = 0; + } + argv = push_array(scratch.arena, char *, argc); + for(int i = 0; i < argc; ++i) + { + String16 arg16 = str16_cstring((U16 *)argvw[i]); + String8 arg8 = str8_from_16(scratch.arena, arg16); + argv[i] = (char *)arg8.str; + } + // the UTF-8 copies are pushed onto scratch arena, so the list allocated by CommandLineToArgvW can be released (LocalFree ignores null) + LocalFree(argvw); + } + + lnk_init_error_handler(); + lnk_run(argc, argv); + + scratch_end(scratch); +} + +#if 0 +internal void +lnk_dump_symbol_table(FILE *f, LNK_SymbolTable *symtab) +{ + for (U64 bucket_idx = 0; bucket_idx < symtab->bucket_count; bucket_idx += 1) { + LNK_SymbolList *bucket = symtab->buckets[bucket_idx]; + if (bucket) { + U64 node_idx = 0; + for (LNK_SymbolNode *symbol_node = bucket->first; symbol_node != 0; symbol_node = symbol_node->next, node_idx += 1) { + LNK_Symbol *symbol = symbol_node->data; + fprintf(f, "[%04llX,%04llX] %.*s\n", bucket_idx, node_idx, str8_varg(symbol->name)); + } + } + } +} + +int +lnk_chunk_size_compar(void *ud, const void *a, const void *b) +{ + LNK_Section **sect_id_map = (LNK_Section**)ud; + LNK_ChunkPtr ac = *(LNK_ChunkPtr*)a; + LNK_ChunkPtr bc = *(LNK_ChunkPtr*)b; + U64 as = lnk_virt_size_from_chunk_ref(sect_id_map, ac->ref); + U64 bs = lnk_virt_size_from_chunk_ref(sect_id_map, bc->ref); + int cmp = as < bs ? -1 : as > bs ? 
+1 : 0; + return cmp; +} + +internal LNK_ChunkArray +lnk_query_chunks_near_voff_ex(Arena *arena, LNK_SectionTable *st, LNK_Section **sect_id_map, U64 voff) +{ + Temp scratch = scratch_begin(&arena, 1); + LNK_ChunkArray result; MemoryZeroStruct(&result); + for (U64 id = 0; id < st->id_max; ++id) { + LNK_Section *sect = sect_id_map[id]; + U64 root_voff = lnk_virt_off_from_chunk_ref(sect_id_map, sect->root->ref); + U64 root_size = lnk_virt_size_from_chunk_ref(sect_id_map, sect->root->ref); + if (root_voff <= voff && voff < root_voff + root_size) { + U64List list; MemoryZeroStruct(&list); + for (U64 chunk_id = 0; chunk_id < sect->cman->total_chunk_count; ++chunk_id) { + LNK_ChunkRef chunk_ref = { sect->id, chunk_id }; + U64 chunk_voff = lnk_virt_off_from_chunk_ref(sect_id_map, chunk_ref); + U64 chunk_size = lnk_virt_size_from_chunk_ref(sect_id_map, chunk_ref); + if (chunk_voff <= voff && voff < chunk_voff + chunk_size) { + u64_list_push(scratch.arena, &list, chunk_id); + } + } + + if (list.count) { + result.count = 0; + result.v = push_array_no_zero(arena, LNK_ChunkPtr, list.count); + LNK_ChunkPtr *chunk_id_map = lnk_make_chunk_id_map(scratch.arena, sect->cman); + for (U64Node *i = list.first; i != NULL; i = i->next) { + result.v[result.count++] = chunk_id_map[i->data]; + } + qsort_s((void*)result.v, result.count, sizeof(result.v[0]), lnk_chunk_size_compar, sect_id_map); + } + + break; + } + } + scratch_end(scratch); + return result; +} + +internal LNK_ChunkArray +lnk_query_chunks_near_voff(Arena *arena, LNK_SectionTable *st, U64 voff) +{ + Temp scratch = scratch_begin(&arena, 1); + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + LNK_ChunkArray result = lnk_query_chunks_near_voff_ex(arena, st, sect_id_map, voff); + scratch_end(scratch); + return result; +} + +internal void +lnk_dump_crt_inits(LNK_SectionTable *st, LNK_SymbolTable *symtab) +{ + static struct { + char *first; + char *last; + } table[] = { + { "__xi_a", "__xi_z" }, + { 
"__xc_a", "__xc_z" }, + { "__xp_a", "__xp_z" }, + { "__xt_a", "__xt_z" } + }; + + Temp scratch = scratch_begin(0, 0); + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + for (U64 i = 0; i < ArrayCount(table); ++i) { + LNK_Symbol *first = lnk_symbol_table_searchf(symtab, LNK_SymbolScopeFlag_Main, table[i].first); + LNK_Symbol *last = lnk_symbol_table_searchf(symtab, LNK_SymbolScopeFlag_Main, table[i].last); + U64 first_voff = lnk_virt_off_from_chunk_ref(sect_id_map, first->u.defined.u.chunk->ref); + U64 last_voff = lnk_virt_off_from_chunk_ref(sect_id_map, last->u.defined.u.chunk->ref); + U64 ptr_size = sizeof(U64); + U64 count = (last_voff - first_voff) / ptr_size; + printf("(%s-%s)\n", table[i].first, table[i].last); + for (U64 ptr_idx = 0; ptr_idx < count; ++ptr_idx) { + LNK_ChunkArray chunk_ptr_arr = lnk_query_chunks_near_voff_ex(scratch.arena, st, sect_id_map, first_voff + ptr_idx * ptr_size); + LNK_Chunk *chunk = chunk_ptr_arr.v[0]; + printf("\t%.*s\n", str8_varg(chunk->debug)); + } + } + scratch_end(scratch); +} + +internal void +lnk_dump_resource_dir_(COFF_ResourceID dir_id, PE_ResourceDir *dir) +{ + Temp scratch = scratch_begin(0, 0); + + SYMS_String8 dir_id_syms = syms_str8(0,0); + if (dir_id.type == COFF_ResourceIDType_NUMBER) { + dir_id_syms = syms_pe_resource_type_to_string(dir_id.u.number); + } + if (dir_id_syms.size == 0) { + dir_id_syms = syms_coff_resource_id_to_string(scratch.arena, dir_id); + } + + tool_fprintf(stdout, "ID: %.*s, Characteristics: %u, Time stamp: %u, Version: %u.%u\n", + syms_expand_string(dir_id_syms), dir->characteristics, dir->time_stamp, dir->major_version, dir->minor_version); + tool_fprintf(stdout, "{\n"); + tool_indent(stdout); + + PE_ResourceList list_arr[2]; + list_arr[0] = dir->named_list; + list_arr[1] = dir->id_list; + + for (U64 i = 0; i < ArrayCount(list_arr); ++i) { + PE_ResourceList *list = &list_arr[i]; + for (PE_ResourceNode *n = list->first; n != NULL; n = n->next) { + 
PE_Resource *res = &n->data; + switch (res->type) { + default: InvalidPath; + case PE_ResData_NULL: break; + case PE_ResData_DIR: { + lnk_dump_resource_dir_(res->id, res->u.dir); + } break; + case PE_ResData_COFF_LEAF: { + SYMS_String8 id_syms = syms_coff_resource_id_to_string(scratch.arena, res->id); + tool_fprintf(stdout, "ID: %.*s Data voff: 0x%X, Data size: %u, Code page: %u, Reserved: %u\n", + syms_expand_string(id_syms), res->u.leaf.data_voff, res->u.leaf.data_size, res->u.leaf.code_page, res->u.leaf.reserved); + } break; + case PE_ResData_COFF_RESOURCE: { + SYMS_String8 id_syms = syms_coff_resource_id_to_string(scratch.arena, res->id); + SYMS_String8 type_syms = syms_str8(0,0); + if (res->u.coff_res.type.type == COFF_ResourceIDType_NUMBER) { + type_syms = syms_pe_resource_type_to_string(res->u.coff_res.type.u.number); + } + if (type_syms.size == 0) { + type_syms = syms_coff_resource_id_to_string(scratch.arena, res->u.coff_res.type); + } + tool_fprintf(stdout, "ID: %.*s Data version: %u, Version: %u, Memory flags: %u, Data Byte Count: %u\n", + syms_expand_string(id_syms), res->u.coff_res.data_version, res->u.coff_res.version, res->u.coff_res.memory_flags, res->u.coff_res.data.size); + } break; + } + } + } + + tool_unindent(stdout); + tool_fprintf(stdout, "}\n"); + scratch_end(scratch); +} + +internal void +lnk_dump_resource_dir(PE_ResourceDir *dir) +{ + COFF_ResourceID dir_id; + dir_id.type = COFF_ResourceIDType_NUMBER; + dir_id.u.number = 0; + lnk_dump_resource_dir_(dir_id, dir); +} + +#endif diff --git a/src/linker/lnk.h b/src/linker/lnk.h new file mode 100644 index 00000000..77fbef6f --- /dev/null +++ b/src/linker/lnk.h @@ -0,0 +1,298 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +#define LNK_NULL_SYMBOL_NAME "NULL" +#define LNK_TEXT_SYMBOL_NAME "TEXT" +#define LNK_DATA_SYMBOL_NAME "DATA" +#define LNK_RDATA_SYMBOL_NAME "RDATA" +#define LNK_BSS_SYMBOL_NAME "BSS" +#define 
LNK_XDATA_SYMBOL_NAME "XDATA" +#define LNK_PDATA_SYMBOL_NAME "PDATA" +#define LNK_BASE_RELOC_SYMBOL_NAME "BASE_RELOC" +#define LNK_EDATA_SYMBOL_NAME "EDATA" +#define LNK_DEBUG_DIR_SYMBOL_NAME "DEDIR" +#define LNK_DEBUG_DATA_SYMBOL_NAME "DEDAT" +#define LNK_CV_DIR_SYMBOL_NAME "CV_DIR" +#define LNK_CV_HEADER_PDB70_SYMBOL_NAME "CV_HEADER_PDB70" +#define LNK_CV_HEADER_RDI_SYMBOL_NAME "CV_HEADER_RDI" +#define LNK_CV_HEADER_GUID_SYMBOL_NAME "CV_HEADER_GUID" +#define LNK_RSRC_SYMBOL_NAME "RSRC" +#define LNK_DEBUG_SYMBOL_NAME "DEBUG" +#define LNK_GFIDS_SYMBOL_NAME "GFIDS" +#define LNK_GIATS_SYMBOL_NAME "GIATS" +#define LNK_GLJMP_SYMBOL_NAME "GLJMP" +#define LNK_GEHCONT_SYMBOL_NAME "GEHCONT" +#define LNK_IMPORT_NAME_TABLE_SYMBOL_NAME "IMPORT_STR" +#define LNK_IMPORT_DLL_TABLE_SYMBOL_NAME "IDATA" +#define LNK_IMPORT_ILT_SYMBOL_NAME "ILT" +#define LNK_IMPORT_IAT_SYMBOL_NAME "IAT" +#define LNK_IMPORT_JMP_SYMBOL_NAME "IMPORT_THUNKS" +#define LNK_DELAYED_IMPORT_DLL_TABLE_SYMBOL_NAME "DELAYED_IMPORT_DLL_TABLE" +#define LNK_DELAYED_IMPORT_HANDLE_TABLE_SYMBOL_NAME "DELAYED_IMPORT_HANDLE_TABLE" +#define LNK_DELAYED_IMPORT_INT_SYMBOL_NAME "DELAYED_IMPORT_INT" +#define LNK_DELAYED_IMPORT_IAT_SYMBOL_NAME "DELAYED_IMPORT_IAT" +#define LNK_DELAYED_IMPORT_ILT_SYMBOL_NAME "DELAYED_IMPORT_ILT" +#define LNK_DELAYED_IMPORT_BIAT_SYMBOL_NAME "DELAYED_IMPORT_BIAT" +#define LNK_DELAYED_IMPORT_UIAT_SYMBOL_NAME "DELAYED_IMPORT_UIAT" +#define LNK_DELAYED_IMPORT_CODE_SYMBOL_NAME "DELAYED_IMPORT_CODE" + +#define LNK_WIN32_HEADER_SYMBOL_NAME "WIN32_HEADER" +#define LNK_DOS_SYMBOL_NAME "DOS" +#define LNK_NT_HEADERS_SYMBOL_NAME "NT_HEADERS" +#define LNK_PE_MAGIC_CONTAINER_SYMBOL_NAME "PE_MAGIC_CONTAINER" +#define LNK_COFF_FILE_HEADER_CONTAINER_SYMBOL_NAME "COFF_FILE_HEADER_CONTAINER" +#define LNK_PE_OPT_HEADER_CONTAINER_SYMBOL_NAME "PE_OPTIONAL_HEADER_CONTAINER" +#define LNK_COFF_SECTION_HEADER_CONTAINER_SYMBOL_NAME "COFF_SECTION_HEADER_CONTAINER" + +#define LNK_DOS_HEADER_SYMBOL_NAME "DOS_HEADER" 
+#define LNK_DOS_PROGRAM_SYMBOL_NAME "DOS_PROGRAM" +#define LNK_PE_MAGIC_SYMBOL_NAME "PE_MAGIC" +#define LNK_COFF_HEADER_SYMBOL_NAME "COFF_HEADER" +#define LNK_PE_DIRECTORY_ARRAY_SYMBOL_NAME "PE_DIRECTORY_ARRAY" +#define LNK_PE_DIRECTORY_COUNT_SYMBOL_NAME "PE_DIRECTORY_COUNT" +#define LNK_PE_OPT_HEADER_SYMBOL_NAME "PE_OPTIONAL_HEADER" +#define LNK_COFF_SECT_HEADER_ARRAY_SYMBOL_NAME "COFF_SECT_HEADER_ARRAY" +#define LNK_COFF_SECT_HEADER_COUNT_SYMBOL_NAME "COFF_SECT_HEADER_COUNT" +#define LNK_PE_CHECKSUM_SYMBOL_NAME "PE_CHECKSUM" + +// _tls_used is a special section in CRT which has format of +// PE_TLSHeader32 or PE_TLSHeader64, according to machine type. +#define LNK_TLS_SYMBOL_NAME "_tls_used" + +// _load_config_used points to SYMS_PeLoadConfig32/SYMS_PeLoadConfig64 +// and symbols below are used to patch particular fields of the struct. +#define LNK_LOAD_CONFIG_SYMBOL_NAME "_load_config_used" +#define LNK_ENCLAVE_CONFIG_SYMBOL_NAME "__enclave_config" +#define LNK_GUARD_FLAGS_SYMBOL_NAME "__guard_flags" +#define LNK_GUARD_FIDS_TABLE_SYMBOL_NAME "__guard_fids_table" +#define LNK_GUARD_FIDS_COUNT_SYMBOL_NAME "__guard_fids_count" +#define LNK_GUARD_IAT_TABLE_SYMBOL_NAME "__guard_iat_table" +#define LNK_GUARD_IAT_COUNT_SYMBOL_NAME "__guard_iat_count" +#define LNK_GUARD_LONGJMP_TABLE_SYMBOL_NAME "__guard_longjmp_table" +#define LNK_GUARD_LONGJMP_COUNT_SYMBOL_NAME "__guard_longjmp_count" +#define LNK_GUARD_EHCONT_TABLE_SYMBOL_NAME "__guard_eh_cont_table" +#define LNK_GUARD_EHCONT_COUNT_SYMBOL_NAME "__guard_eh_cont_count" +// x86 load config fields +#define LNK_SAFE_SE_HANDLER_TABLE_SYMBOL_NAME "__safe_se_handler_table" +#define LNK_SAFE_SE_HANDLER_COUNT_SYMBOL_NAME "__safe_se_handler_count" + +// load symbols from delayimp.lib +#define LNK_DELAY_LOAD_HELPER2_SYMBOL_NAME "__delayLoadHelper2" +#define LNK_DELAY_LOAD_HELPER2_X86_SYMBOL_NAME "___delayLoadHelper2@8" + +#define LNK_TEXT_SECTION_FLAGS
(COFF_SectionFlag_CNT_CODE|COFF_SectionFlag_MEM_EXECUTE|COFF_SectionFlag_MEM_READ) +#define LNK_DATA_SECTION_FLAGS (COFF_SectionFlag_CNT_INITIALIZED_DATA|COFF_SectionFlag_MEM_READ|COFF_SectionFlag_MEM_WRITE) +#define LNK_RDATA_SECTION_FLAGS (COFF_SectionFlag_CNT_INITIALIZED_DATA|COFF_SectionFlag_MEM_READ) +#define LNK_BSS_SECTION_FLAGS (COFF_SectionFlag_CNT_UNINITIALIZED_DATA|COFF_SectionFlag_MEM_READ|COFF_SectionFlag_MEM_WRITE) +#define LNK_IDATA_SECTION_FLAGS LNK_DATA_SECTION_FLAGS +#define LNK_DEBUG_DIR_SECTION_FLAGS LNK_DATA_SECTION_FLAGS +#define LNK_RSRC_SECTION_FLAGS LNK_DATA_SECTION_FLAGS +#define LNK_XDATA_SECTION_FLAGS LNK_RDATA_SECTION_FLAGS +#define LNK_PDATA_SECTION_FLAGS LNK_RDATA_SECTION_FLAGS +#define LNK_EDATA_SECTION_FLAGS LNK_RDATA_SECTION_FLAGS +#define LNK_GFIDS_SECTION_FLAGS LNK_RDATA_SECTION_FLAGS +#define LNK_GIATS_SECTION_FLAGS LNK_RDATA_SECTION_FLAGS +#define LNK_GLJMP_SECTION_FLAGS LNK_RDATA_SECTION_FLAGS +#define LNK_GEHCONT_SECTION_FLAGS LNK_RDATA_SECTION_FLAGS +#define LNK_RELOC_SECTION_FLAGS (LNK_RDATA_SECTION_FLAGS | COFF_SectionFlag_MEM_DISCARDABLE) +#define LNK_DEBUG_SECTION_FLAGS (LNK_RDATA_SECTION_FLAGS | COFF_SectionFlag_MEM_DISCARDABLE) + +//////////////////////////////// + +typedef enum +{ + LNK_InputSource_CmdLine, // specified on command line + LNK_InputSource_Default, // specified through defaultlib switch + LNK_InputSource_Obj, // referenced from objects + LNK_InputSource_Count +} LNK_InputSourceType; + +typedef String8Node LNK_InputLib; +typedef String8List LNK_InputLibList; + +typedef struct LNK_InputImport +{ + COFF_ImportHeader import_header; + struct LNK_InputImport *next; +} LNK_InputImport; + +typedef struct LNK_InputImportList +{ + U64 count; + LNK_InputImport *first; + LNK_InputImport *last; +} LNK_InputImportList; + +//////////////////////////////// + +typedef struct LNK_BaseRelocPage +{ + U64 voff; + U64List entries; +} LNK_BaseRelocPage; + +typedef struct LNK_BaseRelocPageNode +{ + struct LNK_BaseRelocPageNode
*next; + LNK_BaseRelocPage v; +} LNK_BaseRelocPageNode; + +typedef struct LNK_BaseRelocPageList +{ + U64 count; + LNK_BaseRelocPageNode *first; + LNK_BaseRelocPageNode *last; +} LNK_BaseRelocPageList; + +typedef struct LNK_BaseRelocPageArray +{ + U64 count; + LNK_BaseRelocPage *v; +} LNK_BaseRelocPageArray; + +typedef struct +{ + U64 page_size; + LNK_Section **sect_id_map; + LNK_Reloc **reloc_arr; + Rng1U64 *range_arr; + LNK_BaseRelocPageList *list_arr; + HashTable **page_ht_arr; +} LNK_BaseRelocTask; + +typedef struct +{ + Rng1U64 *ranges; + U64 page_size; + LNK_Section **sect_id_map; + LNK_BaseRelocPageList *list_arr; + LNK_Obj **obj_arr; + HashTable **page_ht_arr; +} LNK_ObjBaseRelocTask; + +typedef struct +{ + Rng1U64 *range_arr; + LNK_LibNode *lib_arr; + LNK_Symbol **symbol_arr_arr; +} LNK_LazyIniter; + +typedef struct +{ + LNK_InputObjList input_obj_list; + LNK_InputImportList input_import_list; + LNK_SymbolList unresolved_symbol_list; +} LNK_SymbolFinderResult; + +typedef struct +{ + PathStyle path_style; + LNK_SymbolTable *symtab; + LNK_SymbolNodeArray lookup_node_arr; + LNK_SymbolFinderResult *result_arr; + Rng1U64 *range_arr; +} LNK_SymbolFinder; + +typedef struct +{ + LNK_SymbolTable *symtab; + LNK_SectionTable *st; + LNK_Section **sect_id_map; + U64 base_addr; + LNK_Section **sect_arr; + Rng1U64 *range_arr; +} LNK_SectionRelocPatcher; + +typedef struct +{ + LNK_SymbolTable *symtab; + LNK_SectionTable *st; + LNK_Section **sect_id_map; + U64 base_addr; + LNK_Obj **obj_arr; +} LNK_ObjRelocPatcher; + + +typedef struct +{ + String8 path; + String8 data; +} LNK_WriteThreadContext; + +typedef struct +{ + String8 data; + Rng1U64 *ranges; + U128 *hashes; +} LNK_Blake3Hasher; + +//////////////////////////////// + +internal LNK_InputImport * lnk_input_import_list_push(Arena *arena, LNK_InputImportList *list); +internal void lnk_input_import_list_concat_in_place(LNK_InputImportList *list, LNK_InputImportList *to_concat); +internal LNK_InputImport ** 
lnk_input_import_arr_from_list(Arena *arena, LNK_InputImportList list); +internal LNK_InputImportList lnk_list_from_input_import_arr(LNK_InputImport **arr, U64 count); + +//////////////////////////////// +// Helpers + +internal void lnk_write_data_list_to_file_path(String8 path, String8List list); +internal void lnk_write_data_to_file_path(String8 path, String8 data); + +internal String8 lnk_make_full_path(Arena *arena, String8 work_dir, PathStyle system_path_style, String8 path); + +internal String8 lnk_get_lib_name(String8 path); +internal B32 lnk_is_lib_disallowed(HashTable *disallow_lib_ht, String8 path); +internal B32 lnk_is_lib_loaded(HashTable *default_lib_ht, HashTable *loaded_lib_ht, LNK_InputSourceType input_source, String8 lib_path); +internal void lnk_push_disallow_lib(Arena *arena, HashTable *disallow_lib_ht, String8 path); +internal void lnk_push_loaded_lib(Arena *arena, HashTable *default_lib_ht, HashTable *loaded_lib_ht, String8 path); + +//////////////////////////////// +// Manifest + +internal String8List lnk_make_linker_manifest(Arena *arena, B32 manifest_uac, String8 manifest_level, String8 manifest_ui_access, String8List manifest_dependency_list); +internal String8 lnk_merge_manifest_files(Arena *arena, String8 mt_path, String8 manifest_name, String8List manifest_path_list); +internal String8 lnk_res_from_data(Arena *arena, String8 data); + +//////////////////////////////// +// Resources + +internal void lnk_serialize_pe_resource_tree(LNK_SectionTable *st, LNK_SymbolTable *symtab, PE_ResourceDir *root_dir); +internal void lnk_add_resource_debug_s(LNK_SectionTable *st, LNK_SymbolTable *symtab, String8 obj_path, String8 cwd_path, String8 exe_path, CV_Arch arch, String8List res_file_list, MD5Hash *res_hash_array); +internal String8 lnk_make_res_obj(TP_Context *tp, Arena *arena, PE_ResourceDir *root_dir, COFF_MachineType machine, COFF_TimeStamp time_stamp, String8 path, String8 cwd_path, String8 exe_path, String8List res_file_list, MD5Hash 
*res_hash_array); +internal String8 lnk_obj_from_res_file_list(TP_Context *tp, Arena *arena, LNK_SectionTable *st, LNK_SymbolTable *symtab, String8List res_file_list, String8List res_path_list, COFF_MachineType machine, U32 time_stamp, String8 work_dir, PathStyle system_path_style, String8 obj_name); + +//////////////////////////////// +// Debug + +internal String8 lnk_make_linker_coff_obj(TP_Context *tp, Arena *arena, COFF_TimeStamp time_stamp, COFF_MachineType machine, String8 cwd_path, String8 exe_path, String8 pdb_path, String8 cmd_line, String8 obj_name); + +//////////////////////////////// +// Win32 Image Helpers + +internal void lnk_build_debug_pdb(LNK_SectionTable *st, LNK_SymbolTable *symtab, LNK_Section *debug_sect, LNK_Chunk *debug_dir_array_chunk, COFF_TimeStamp time_stamp, OS_Guid guid, U32 age, String8 pdb_path); +internal void lnk_build_debug_rdi(LNK_SectionTable *st, LNK_SymbolTable *symtab, LNK_Section *debug_sect, LNK_Chunk *debug_dir_array_chunk, COFF_TimeStamp time_stamp, OS_Guid guid, String8 rdi_path); +internal void lnk_build_guard_tables(TP_Context *tp, LNK_SectionTable *st, LNK_SymbolTable *symtab, LNK_ExportTable *exptab, LNK_ObjList obj_list, COFF_MachineType machine, String8 entry_point_name, LNK_GuardFlags guard_flags, B32 emit_suppress_flag); +internal void lnk_build_base_relocs(TP_Context *tp, TP_Arena *tp_arena, LNK_SectionTable *st, LNK_SymbolTable *symtab, COFF_MachineType machine, U64 page_size, LNK_ObjList obj_list); +internal LNK_Chunk * lnk_build_dos_header(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent_chunk); +internal LNK_Chunk * lnk_build_pe_magic(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent); +internal LNK_Chunk * lnk_build_coff_file_header(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent, COFF_MachineType machine, COFF_TimeStamp time_stamp, PE_ImageFileCharacteristics file_characteristics); +internal LNK_Chunk * 
lnk_build_pe_optional_header_x64(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent, COFF_MachineType machine, U64 base_addr, U64 sect_align, U64 file_align, Version linker_ver, Version os_ver, Version image_ver, Version subsystem_ver, PE_WindowsSubsystem subsystem, PE_DllCharacteristics dll_characteristics, U64 stack_reserve, U64 stack_commit, U64 heap_reserve, U64 heap_commit, String8 entry_point_name, LNK_SectionArray sect_arr); +internal LNK_Chunk * lnk_build_pe_directories(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent); +internal LNK_Chunk * lnk_build_coff_section_table(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent_chunk, LNK_SectionArray sect_arr); +internal LNK_Chunk * lnk_build_win32_image_header(LNK_SymbolTable *symtab, LNK_Section *header_sect, LNK_Chunk *parent_chunk, LNK_Config *config, LNK_SectionArray sect_arr); + +//////////////////////////////// +// Relocs + +internal void lnk_patch_relocs(TP_Context *tp, LNK_SymbolTable *symtab, LNK_SectionTable *st, U64 base_addr); +internal void lnk_apply_reloc(U64 base_addr, U64 virt_align, U64 file_align, LNK_Section **sect_id_map, LNK_SymbolTable *symtab, String8 chunk_data, LNK_Reloc *reloc); + +//////////////////////////////// + +internal void lnk_log_size_breakdown(LNK_SectionTable *st, LNK_SymbolTable *symtab); +internal void lnk_log_link_stats(LNK_ObjList obj_list, LNK_LibList *lib_index, LNK_SectionTable *st); +internal void lnk_log_timers(void); + diff --git a/src/linker/lnk_base_reloc.c b/src/linker/lnk_base_reloc.c new file mode 100644 index 00000000..92b3f1be --- /dev/null +++ b/src/linker/lnk_base_reloc.c @@ -0,0 +1,264 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal LNK_BaseRelocPageArray +lnk_base_reloc_page_array_from_list(Arena* arena, LNK_BaseRelocPageList list) +{ + LNK_BaseRelocPageArray result = {0}; + result.count = 0; + result.v = 
push_array_no_zero(arena, LNK_BaseRelocPage, list.count); + for (LNK_BaseRelocPageNode* n = list.first; n != 0; n = n->next) { + result.v[result.count++] = n->v; + } + Assert(result.count == list.count); + return result; +} + +internal void +lnk_emit_base_reloc_info(Arena *arena, + LNK_Section **sect_id_map, + U64 page_size, + HashTable *page_ht, + LNK_BaseRelocPageList *page_list, + LNK_Reloc *reloc) +{ + B32 is_addr = (reloc->type == LNK_Reloc_ADDR_64 || reloc->type == LNK_Reloc_ADDR_32); + if (is_addr) { + U64 reloc_voff = lnk_virt_off_from_reloc(sect_id_map, reloc); + U64 page_voff = AlignDownPow2(reloc_voff, page_size); + + LNK_BaseRelocPageNode *page; + { + String8 raw_page; + B32 is_page_present = hash_table_search_u64(page_ht, page_voff, &raw_page); + if (is_page_present) { + page = *(LNK_BaseRelocPageNode **) raw_page.str; + } else { + // fill out page + page = push_array(arena, LNK_BaseRelocPageNode, 1); + page->v.voff = page_voff; + + // push page + SLLQueuePush(page_list->first, page_list->last, page); + page_list->count += 1; + + // register page voff + hash_table_push_u64(arena, page_ht, page_voff, str8_struct(&page)); + } + } + + u64_list_push(arena, &page->v.entries, reloc_voff); + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_emit_base_relocs_from_reloc_array_task) +{ + LNK_BaseRelocTask *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + LNK_BaseRelocPageList *page_list = &task->list_arr[task_id]; + HashTable *page_ht = task->page_ht_arr[task_id]; + + for (U64 reloc_idx = range.min; reloc_idx < range.max; reloc_idx += 1) { + LNK_Reloc *reloc = task->reloc_arr[reloc_idx]; + lnk_emit_base_reloc_info(arena, task->sect_id_map, task->page_size, page_ht, page_list, reloc); + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_emit_base_relocs_from_objs_task) +{ + LNK_ObjBaseRelocTask *task = raw_task; + LNK_Obj *obj = task->obj_arr[task_id]; + LNK_BaseRelocPageList *page_list = &task->list_arr[worker_id]; + HashTable *page_ht = 
task->page_ht_arr[worker_id]; + + for (U64 sect_idx = 0; sect_idx < obj->sect_count; sect_idx += 1) { + B32 is_live = !lnk_chunk_is_discarded(&obj->chunk_arr[sect_idx]); + if (is_live) { + LNK_RelocList reloc_list = obj->sect_reloc_list_arr[sect_idx]; + for (LNK_Reloc *reloc = reloc_list.first; reloc != 0; reloc = reloc->next) { + lnk_emit_base_reloc_info(arena, + task->sect_id_map, + task->page_size, + page_ht, + page_list, + reloc); + } + } + } +} + +int +lnk_base_reloc_page_compar(void *raw_a, void *raw_b) +{ + LNK_BaseRelocPage* a = raw_a; + LNK_BaseRelocPage* b = raw_b; + int is_before = a->voff < b->voff; + return is_before; +} + +internal void +lnk_base_reloc_page_array_sort(LNK_BaseRelocPageArray arr) +{ + ProfBeginFunction(); + radsort(arr.v, arr.count, lnk_base_reloc_page_compar); + ProfEnd(); +} + +internal void +lnk_build_base_relocs(TP_Context *tp, + LNK_SectionTable *st, + LNK_SymbolTable *symtab, + COFF_MachineType machine, + U64 page_size, + LNK_ObjList obj_list) +{ + ProfBeginFunction(); + + TP_Arena *arena = g_file_arena; + TP_Temp temp = tp_temp_begin(arena); + + lnk_section_table_build_data(st, machine); + lnk_section_table_assign_virtual_offsets(st); + + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(arena->v[0], st); + + LNK_BaseRelocPageList *page_list_arr = push_array(arena->v[0], LNK_BaseRelocPageList, tp->worker_count); + HashTable **page_ht_arr = push_array_no_zero(arena->v[0], HashTable *, tp->worker_count); + for (U64 i = 0; i < tp->worker_count; ++i) { + page_ht_arr[i] = hash_table_init(arena->v[0], 1024); + } + + // emit pages from relocs defined in section table + ProfBegin("Emit Relocs From Section Table"); + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = sect_node->next) { + LNK_BaseRelocTask task = {0}; + task.page_size = page_size; + task.sect_id_map = sect_id_map; + task.list_arr = page_list_arr; + task.page_ht_arr = page_ht_arr; + task.reloc_arr = 
lnk_reloc_array_from_list(arena->v[0], sect_node->data.reloc_list); + task.range_arr = tp_divide_work(arena->v[0], sect_node->data.reloc_list.count, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, lnk_emit_base_relocs_from_reloc_array_task, &task); + } + ProfEnd(); + + // emit pages from relocs defined in objs + ProfBegin("Emit Relocs From Objs"); + { + LNK_ObjBaseRelocTask task = {0}; + task.page_size = page_size; + task.sect_id_map = sect_id_map; + task.page_ht_arr = page_ht_arr; + task.list_arr = page_list_arr; + task.obj_arr = lnk_obj_arr_from_list(arena->v[0], obj_list); + tp_for_parallel(tp, arena, obj_list.count, lnk_emit_base_relocs_from_objs_task, &task); + } + ProfEnd(); + + // merge page lists + + ProfBegin("Merge Worker Page Lists"); + + HashTable *main_ht = page_ht_arr[0]; + LNK_BaseRelocPageList *main_page_list = &page_list_arr[0]; + + for (U64 list_idx = 1; list_idx < tp->worker_count; ++list_idx) { + LNK_BaseRelocPageList src = page_list_arr[list_idx]; + + for (LNK_BaseRelocPageNode *src_page = src.first, *src_next; src_page != 0; src_page = src_next) { + src_next = src_page->next; + + String8 raw_page; + B32 is_page_present = hash_table_search(main_ht, str8_struct(&src_page->v.voff), &raw_page); + + if (is_page_present) { + // page exists concat voffs + Assert(raw_page.size == sizeof(LNK_BaseRelocPageNode)); + LNK_BaseRelocPageNode *page = (LNK_BaseRelocPageNode *) raw_page.str; + Assert(page != src_page); + u64_list_concat_in_place(&page->v.entries, &src_page->v.entries); + } else { + // push page to main list + SLLQueuePush(main_page_list->first, main_page_list->last, src_page); + main_page_list->count += 1; + + // store lookup voff + hash_table_push_nocopy(arena->v[0], main_ht, str8_struct(&src_page->v.voff), str8_struct(src_page)); + } + } + } + + ProfEnd(); + + // push storage for section + LNK_Section *base_reloc_sect = lnk_section_table_push(st, str8_lit(".reloc"), LNK_RELOC_SECTION_FLAGS); + LNK_Symbol *base_reloc_symbol = 
lnk_make_defined_symbol_chunk(symtab->arena, str8_lit(LNK_BASE_RELOC_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, base_reloc_sect->root, 0, 0, 0); + lnk_symbol_table_push(symtab, base_reloc_symbol); + + ProfBegin("Page List -> Array"); + LNK_BaseRelocPageArray page_arr = lnk_base_reloc_page_array_from_list(base_reloc_sect->arena, *main_page_list); + ProfEnd(); + + ProfBegin("Sort Pages on VOFF"); + lnk_base_reloc_page_array_sort(page_arr); + ProfEnd(); + + HashTable *voff_ht = hash_table_init(arena->v[0], page_size); + + ProfBegin("Serialize Pages"); + for (U64 page_idx = 0; page_idx < page_arr.count; ++page_idx) { + LNK_BaseRelocPage *page = &page_arr.v[page_idx]; + + // push buffer + U64 buf_align = sizeof(U32); + U64 buf_size = AlignPow2(sizeof(U32)*2 + sizeof(U16)*page->entries.count, buf_align); + U8 *buf = push_array_no_zero(base_reloc_sect->arena, U8, buf_size); + + // setup pointers into buffer + U32 *page_voff_ptr = (U32*)buf; + U32 *block_size_ptr = page_voff_ptr + 1; + U16 *reloc_arr_base = (U16*)(block_size_ptr + 1); + U16 *reloc_arr_ptr = reloc_arr_base; + + // write reloc array + for (U64Node *i = page->entries.first; i != 0; i = i->next) { + // was base reloc entry made? 
+ if (hash_table_search_u64(voff_ht, i->data, 0)) { + continue; + } + hash_table_push_u64(arena->v[0], voff_ht, i->data, str8(0,0)); + + // write entry + U64 rel_off = i->data - page->voff; + Assert(rel_off <= page_size); + *reloc_arr_ptr++ = PE_BaseRelocMake(PE_BaseRelocKind_DIR64, rel_off); + } + + // write pad + U64 pad_reloc_count = AlignPadPow2(page->entries.count, sizeof(reloc_arr_ptr[0])); + MemoryZeroTyped(reloc_arr_ptr, pad_reloc_count); // fill pad with PE_BaseRelocKind_ABSOLUTE + reloc_arr_ptr += pad_reloc_count; + + // compute block size + U64 reloc_arr_size = (U64)((U8*)reloc_arr_ptr - (U8*)reloc_arr_base); + U64 block_size = sizeof(*page_voff_ptr) + sizeof(*block_size_ptr) + reloc_arr_size; + + // write header + *page_voff_ptr = safe_cast_u32(page->voff); + *block_size_ptr = safe_cast_u32(block_size); + Assert(*block_size_ptr <= buf_size); + + // push page chunk + lnk_section_push_chunk_raw(base_reloc_sect, base_reloc_sect->root, buf, block_size, str8(0,0)); + + // purge voffs for next run + hash_table_purge(voff_ht); + } + ProfEnd(); + + tp_temp_end(temp); + ProfEnd(); +} + diff --git a/src/linker/lnk_base_reloc.h b/src/linker/lnk_base_reloc.h new file mode 100644 index 00000000..0a308010 --- /dev/null +++ b/src/linker/lnk_base_reloc.h @@ -0,0 +1,56 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef struct LNK_BaseRelocPage +{ + U64 voff; + U64List entries; +} LNK_BaseRelocPage; + +typedef struct LNK_BaseRelocPageNode +{ + struct LNK_BaseRelocPageNode *next; + LNK_BaseRelocPage v; +} LNK_BaseRelocPageNode; + +typedef struct LNK_BaseRelocPageList +{ + U64 count; + LNK_BaseRelocPageNode *first; + LNK_BaseRelocPageNode *last; +} LNK_BaseRelocPageList; + +typedef struct LNK_BaseRelocPageArray +{ + U64 count; + LNK_BaseRelocPage *v; +} LNK_BaseRelocPageArray; + +typedef struct +{ + U64 page_size; + LNK_Section **sect_id_map; + LNK_Reloc **reloc_arr; + Rng1U64 
*range_arr; + LNK_BaseRelocPageList *list_arr; + HashTable **page_ht_arr; +} LNK_BaseRelocTask; + +typedef struct +{ + U64 page_size; + LNK_Section **sect_id_map; + LNK_BaseRelocPageList *list_arr; + LNK_Obj **obj_arr; + HashTable **page_ht_arr; +} LNK_ObjBaseRelocTask; + +//////////////////////////////// + +internal LNK_BaseRelocPageArray lnk_base_reloc_page_array_from_list(Arena* arena, LNK_BaseRelocPageList list); +internal void lnk_emit_base_reloc_info(Arena *arena, LNK_Section **sect_id_map, U64 page_size, HashTable *page_ht, LNK_BaseRelocPageList *page_list, LNK_Reloc *reloc); +internal void lnk_base_reloc_page_array_sort(LNK_BaseRelocPageArray arr); +internal void lnk_build_base_relocs(TP_Context *tp, LNK_SectionTable *st, LNK_SymbolTable *symtab, COFF_MachineType machine, U64 page_size, LNK_ObjList obj_list); + diff --git a/src/linker/lnk_chunk.c b/src/linker/lnk_chunk.c new file mode 100644 index 00000000..90a0b3d5 --- /dev/null +++ b/src/linker/lnk_chunk.c @@ -0,0 +1,755 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +global read_only LNK_Chunk g_null_chunk = { 0, 0, /* is_discarded: */ 1 }; +global read_only LNK_Chunk *g_null_chunk_ptr = &g_null_chunk; + +internal LNK_ChunkRef +lnk_chunk_ref(U64 sect_id, U64 chunk_id) +{ + LNK_ChunkRef ref = {0}; + ref.sect_id = sect_id; + ref.chunk_id = chunk_id; + return ref; +} + +internal B32 +lnk_chunk_ref_is_equal(LNK_ChunkRef a, LNK_ChunkRef b) +{ + B32 is_equal = a.sect_id == b.sect_id && a.chunk_id == b.chunk_id; + return is_equal; +} + +internal LNK_ChunkNode * +lnk_chunk_list_push(Arena *arena, LNK_ChunkList *list, LNK_Chunk *chunk) +{ + LNK_ChunkNode *node = push_array_no_zero(arena, LNK_ChunkNode, 1); + node->next = 0; + node->data = chunk; + + SLLQueuePush(list->first, list->last, node); + ++list->count; + + return node; +} + +internal void +lnk_chunk_list_concat_in_place(LNK_ChunkList *list, LNK_ChunkList *to_concat) +{ + 
SLLConcatInPlace(list, to_concat); +} + +internal void +lnk_chunk_list_concat_in_place_arr(LNK_ChunkList *list, LNK_ChunkList *arr, U64 count) +{ + SLLConcatInPlaceArray(list, arr, count); +} + +internal LNK_ChunkList ** +lnk_make_chunk_list_arr_arr(Arena *arena, U64 slot_count, U64 per_count) +{ + LNK_ChunkList **arr_arr = push_array_no_zero(arena, LNK_ChunkList *, slot_count); + for (U64 i = 0; i < slot_count; i += 1) { + arr_arr[i] = push_array(arena, LNK_ChunkList, per_count); + } + return arr_arr; +} + +internal int +lnk_chunk_sort_index_is_before(void *raw_a, void *raw_b) +{ + // Grouped Sections (PE Format) + // "All contributions with the same object-section name are allocated contiguously in the image, + // and the blocks of contributions are sorted in lexical order by object-section name." + LNK_ChunkPtr *a = raw_a; + LNK_ChunkPtr *b = raw_b; + + // sort on section postfix + int cmp = str8_compar_case_sensetive(&(*a)->sort_idx, &(*b)->sort_idx); + + // sort on obj position on command line + if (cmp == 0) { + cmp = u64_compar(&(*a)->input_idx, &(*b)->input_idx); + } + + int is_before = cmp < 0; + return is_before; +} + +internal void +lnk_chunk_array_sort(LNK_ChunkArray arr) +{ + radsort(arr.v, arr.count, lnk_chunk_sort_index_is_before); +} + +internal LNK_ChunkManager * +lnk_chunk_manager_alloc(Arena *arena, U64 id, U64 align) +{ + ProfBeginFunction(); + + LNK_ChunkList temp_list = {0}; + + LNK_Chunk temp_chunk = {0}; + temp_chunk.ref = lnk_chunk_ref(id, 0); + temp_chunk.align = align; + temp_chunk.type = LNK_Chunk_List; + temp_chunk.u.list = &temp_list; + + LNK_ChunkManager *cman = push_array_no_zero(arena, LNK_ChunkManager, 1); + cman->total_chunk_count = 1; // null chunk + cman->root = 0; + cman->root = lnk_chunk_push_list(arena, cman, &temp_chunk, str8(0,0)); + cman->root->align = align; + + ProfEnd(); + return cman; +} + +internal LNK_Chunk * +lnk_chunk_push_(Arena *arena, LNK_Chunk *parent, U64 chunk_id, String8 sort_index) +{ + ProfBeginFunction(); 
+ + Assert(parent->type == LNK_Chunk_List); + LNK_ChunkList *list = parent->u.list; + + LNK_Chunk *chunk = push_array_no_zero(arena, LNK_Chunk, 1); + chunk->ref = lnk_chunk_ref(parent->ref.sect_id, chunk_id); + chunk->align = 1; + chunk->is_discarded = 0; + chunk->sort_chunk = 1; + chunk->type = LNK_Chunk_Null; + chunk->sort_idx = push_str8_copy(arena, sort_index); + chunk->input_idx = list->count; + chunk->flags = 0; + chunk->associate = 0; + + lnk_chunk_list_push(arena, list, chunk); + + ProfEnd(); + return chunk; +} + +internal LNK_Chunk * +lnk_chunk_push(Arena *arena, LNK_ChunkManager *cman, LNK_Chunk *parent, String8 sort_index) +{ + U64 chunk_id = cman->total_chunk_count; + ++cman->total_chunk_count; + LNK_Chunk *chunk = lnk_chunk_push_(arena, parent, chunk_id, sort_index); + return chunk; +} + +internal LNK_Chunk * +lnk_chunk_push_leaf(Arena *arena, LNK_ChunkManager *cman, LNK_Chunk *parent, String8 sort_index, void *raw_ptr, U64 raw_size) +{ + LNK_Chunk *chunk = lnk_chunk_push(arena, cman, parent, sort_index); + chunk->type = LNK_Chunk_Leaf; + chunk->u.leaf = str8((U8 *)raw_ptr, raw_size); + return chunk; +} + +internal LNK_Chunk * +lnk_chunk_push_list(Arena *arena, LNK_ChunkManager *cman, LNK_Chunk *parent, String8 sort_index) +{ + LNK_Chunk *chunk = lnk_chunk_push(arena, cman, parent, sort_index); + chunk->type = LNK_Chunk_List; + chunk->u.list = push_array(arena, LNK_ChunkList, 1); + return chunk; +} + +internal LNK_ChunkNode * +lnk_chunk_deep_copy(Arena *arena, LNK_Chunk *chunk) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + LNK_ChunkNode *dst_root_node = push_array_no_zero(arena, LNK_ChunkNode, 1); + LNK_ChunkNode *src_root_node = push_array_no_zero(scratch.arena, LNK_ChunkNode, 1); + src_root_node->next = 0; + src_root_node->data = chunk; + + struct Stack { + struct Stack *next; + LNK_ChunkNode *src_node; + LNK_ChunkNode *dst_node; + }; + struct Stack *stack = push_array_no_zero(scratch.arena, struct Stack, 1); + stack->next 
= 0; + stack->src_node = src_root_node; + stack->dst_node = dst_root_node; + + while (stack) { + while (stack->src_node) { + LNK_Chunk *src = stack->src_node->data; + LNK_Chunk *dst = stack->dst_node->data; + + stack->src_node = stack->src_node->next; + stack->dst_node = stack->dst_node->next; + + dst->ref = src->ref; + dst->align = src->align; + dst->sort_idx = push_str8_copy(arena, src->sort_idx); + dst->type = src->type; + dst->flags = src->flags; + lnk_chunk_set_debugf(arena, dst, "%S", src->debug); + + switch (src->type) { + case LNK_Chunk_Null: break; + case LNK_Chunk_Leaf: { + B32 is_bss = src->u.leaf.str == 0; + if (is_bss) { + dst->u.leaf = src->u.leaf; + } else { + dst->u.leaf = push_str8_copy(arena, src->u.leaf); + } + } break; + case LNK_Chunk_List: { + LNK_ChunkNode *chain = 0; + LNK_ChunkNode *curr = 0; + if (src->u.list->count > 0) { + chain = push_array(arena, LNK_ChunkNode, src->u.list->count); + curr = chain; + for (U64 i = 1; i < src->u.list->count; ++i) { + curr->next = &chain[i]; + curr = curr->next; + } + curr->next = 0; + } + + dst->u.list = push_array_no_zero(arena, LNK_ChunkList, 1); + dst->u.list->count = src->u.list->count; + dst->u.list->first = chain; + dst->u.list->last = curr; + + struct Stack *frame = push_array_no_zero(scratch.arena, struct Stack, 1); + frame->next = 0; + frame->src_node = src->u.list->first; + frame->dst_node = dst->u.list->first; + SLLStackPush(stack, frame); + } break; + default: InvalidPath; break; + } + } + + SLLStackPop(stack); + } + + scratch_end(scratch); + ProfEnd(); + return dst_root_node; +} + +internal LNK_ChunkNode * +lnk_merge_chunks(Arena *arena, LNK_ChunkManager *dst_cman, LNK_Chunk *dst, LNK_Chunk *src, U64 *id_map_out, U64 id_map_max) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 0); + + Assert(src->ref.sect_id != dst->ref.sect_id); + Assert(dst->type == LNK_Chunk_List); + Assert(src->type != LNK_Chunk_Null); + + LNK_ChunkNode *src_node = push_array(arena, LNK_ChunkNode, 1); + 
src_node->data = src; + + struct Stack { + struct Stack *next; + LNK_ChunkNode *node; + }; + struct Stack *stack = push_array_no_zero(scratch.arena, struct Stack, 1); + stack->next = 0; + stack->node = src_node; + + while (stack) { + while (stack->node) { + LNK_Chunk *chunk = stack->node->data; + + // advance node + stack->node = stack->node->next; + + // allocate id + U64 new_id = dst_cman->total_chunk_count++; + + // write id map + Assert(chunk->ref.chunk_id < id_map_max); + id_map_out[chunk->ref.chunk_id] = new_id; + + // update id + chunk->ref = lnk_chunk_ref(dst->ref.sect_id, new_id); + + // recurse down on lists + if (chunk->type == LNK_Chunk_List) { + struct Stack *frame = push_array_no_zero(scratch.arena, struct Stack, 1); + frame->next = 0; + frame->node = chunk->u.list->first; + SLLStackPush(stack, frame); + } + } + + // reached end of chunk list, pop frame + SLLStackPop(stack); + } + + // move source root copy to destination section + LNK_ChunkList *list = dst->u.list; + ++list->count; + SLLQueuePush(list->first, list->last, src_node); + + scratch_end(scratch); + ProfEnd(); + return src_node; +} + +internal void +lnk_chunk_associate(Arena *arena, LNK_Chunk *head, LNK_Chunk *chunk) +{ + // for simplicity we don't support multiple associations, + // but it's possible to craft symbol table with multiple associations + Assert(!chunk->associate); + chunk->associate = head; +} + +internal B32 +lnk_chunk_is_discarded(LNK_Chunk *chunk) +{ + B32 is_discarded = chunk->is_discarded; + LNK_Chunk *curr = chunk->associate; + while (!is_discarded && curr) { + is_discarded = curr->is_discarded; + curr = curr->associate; + } + return is_discarded; +} + +internal U64 +lnk_chunk_get_size(LNK_Chunk *chunk) +{ + U64 result = 0; + switch (chunk->type) { + case LNK_Chunk_Null: break; + case LNK_Chunk_Leaf: { + result = chunk->u.leaf.size; + } break; + case LNK_Chunk_LeafArray: + case LNK_Chunk_List: { + Assert(!"TODO: list size"); + } break; + } + return result; +} + +internal 
U64 +lnk_chunk_list_get_node_count(LNK_Chunk *chunk) +{ + Assert(chunk->type == LNK_Chunk_List); + return chunk->u.list->count; +} + +internal void +lnk_chunk_op_list_push_node(LNK_ChunkOpList *list, LNK_ChunkOp *op) +{ + SLLQueuePush(list->first, list->last, op); +} + +internal LNK_ChunkOp * +lnk_push_chunk_op_begin(Arena *arena, U64 chunk_id) +{ + LNK_ChunkOp *begin_op = push_array_no_zero(arena, LNK_ChunkOp, 1); + begin_op->next = 0; + begin_op->type = LNK_ChunkOp_Begin; + begin_op->u.chunk_id = chunk_id; + return begin_op; +} + +internal LNK_ChunkOp * +lnk_push_chunk_op_end_virt(Arena *arena) +{ + LNK_ChunkOp *end_virt_op = push_array_no_zero(arena, LNK_ChunkOp, 1); + end_virt_op->next = 0; + end_virt_op->type = LNK_ChunkOp_EndVirt; + return end_virt_op; +} + +internal LNK_ChunkOp * +lnk_push_chunk_op_end_file(Arena *arena) +{ + LNK_ChunkOp *end_op = push_array_no_zero(arena, LNK_ChunkOp, 1); + end_op->next = 0; + end_op->type = LNK_ChunkOp_End; + return end_op; +} + +internal LNK_ChunkOp * +lnk_push_chunk_op_align(Arena *arena, U64 align, U64 val) +{ + LNK_ChunkOp *align_op = push_array_no_zero(arena, LNK_ChunkOp, 1); + align_op->next = 0; + align_op->type = LNK_ChunkOp_Align; + align_op->u.align.x = align; + align_op->u.align.val = val; + return align_op; +} + +internal LNK_ChunkOp * +lnk_push_chunk_op_write(Arena *arena, String8 string) +{ + LNK_ChunkOp *write_op = push_array_no_zero(arena, LNK_ChunkOp, 1); + write_op->next = 0; + write_op->type = LNK_ChunkOp_WriteString; + write_op->u.string = string; + return write_op; +} + +internal LNK_ChunkOpList +lnk_op_list_from_chunk(Arena *arena, LNK_Chunk *root, U64 total_chunk_count, U8 align_byte) +{ + struct Stack { + struct Stack *next; + LNK_ChunkArray chunk_array; + U64 ichunk; + }; + + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + // setup stack + struct Stack *stack = push_array(scratch.arena, struct Stack, 1); + stack->chunk_array.count = 1; + stack->chunk_array.v = &root; + + // 
setup output list + LNK_ChunkOpList list = {0}; + list.total_chunk_count = total_chunk_count; + list.first = list.last = 0; + + // write null + LNK_ChunkOp *null_begin_op = lnk_push_chunk_op_begin(arena, 0); + LNK_ChunkOp *null_end_virt_op = lnk_push_chunk_op_end_virt(arena); + LNK_ChunkOp *null_end_file_op = lnk_push_chunk_op_end_file(arena);; + lnk_chunk_op_list_push_node(&list, null_begin_op); + lnk_chunk_op_list_push_node(&list, null_end_virt_op); + lnk_chunk_op_list_push_node(&list, null_end_file_op); + + // traverse chunks from root + while (stack) { + while (stack->ichunk < stack->chunk_array.count) { + LNK_Chunk *chunk = stack->chunk_array.v[stack->ichunk++]; + + // skip unused chunks + if (lnk_chunk_is_discarded(chunk)) { + continue; + } + + switch (chunk->type) { + case LNK_Chunk_Leaf: { + // align start in its own begin/end block so align bytes don't contribute to chunk size + LNK_ChunkOp *pad_begin_op = lnk_push_chunk_op_begin(arena, list.total_chunk_count++); + LNK_ChunkOp *pad_align_op = lnk_push_chunk_op_align(arena, chunk->align, align_byte); + LNK_ChunkOp *pad_end_file_op = lnk_push_chunk_op_end_file(arena); + lnk_chunk_op_list_push_node(&list, pad_begin_op); + lnk_chunk_op_list_push_node(&list, pad_align_op); + lnk_chunk_op_list_push_node(&list, pad_end_file_op); + + // write leaf + LNK_ChunkOp *leaf_begin_op = lnk_push_chunk_op_begin(arena, chunk->ref.chunk_id); + LNK_ChunkOp *leaf_write_op = lnk_push_chunk_op_write(arena, chunk->u.leaf); + LNK_ChunkOp *leaf_align_op = lnk_push_chunk_op_align(arena, chunk->align, align_byte); + LNK_ChunkOp *leaf_end_virt_op = lnk_push_chunk_op_end_virt(arena); + LNK_ChunkOp *leaf_end_file_op = lnk_push_chunk_op_end_file(arena); + #if LNK_DUMP_CHUNK_LAYOUT + leaf_write_op->chunk = chunk; + #endif + lnk_chunk_op_list_push_node(&list, leaf_begin_op); + lnk_chunk_op_list_push_node(&list, leaf_write_op); + lnk_chunk_op_list_push_node(&list, leaf_align_op); + lnk_chunk_op_list_push_node(&list, leaf_end_virt_op); + 
lnk_chunk_op_list_push_node(&list, leaf_end_file_op); + } break; + + case LNK_Chunk_LeafArray: { + LNK_ChunkOp *begin_op = lnk_push_chunk_op_begin(arena, chunk->ref.chunk_id); + LNK_ChunkOp *align_op = lnk_push_chunk_op_align(arena, chunk->align, align_byte); + lnk_chunk_op_list_push_node(&list, begin_op); + lnk_chunk_op_list_push_node(&list, align_op); + + if (chunk->sort_chunk) { + lnk_chunk_array_sort(*chunk->u.arr); + } + + struct Stack *frame = push_array_no_zero(scratch.arena, struct Stack, 1); + frame->next = 0; + frame->chunk_array = *chunk->u.arr; + frame->ichunk = 0; + SLLStackPush(stack, frame); + } goto yeild; + + case LNK_Chunk_List: { + // balance ops at :end_chunk_series + LNK_ChunkOp *begin_op = lnk_push_chunk_op_begin(arena, chunk->ref.chunk_id); + LNK_ChunkOp *align_op = lnk_push_chunk_op_align(arena, chunk->align, align_byte); + lnk_chunk_op_list_push_node(&list, begin_op); + lnk_chunk_op_list_push_node(&list, align_op); + + // chunk list -> chunk array + LNK_ChunkArray chunk_array = {0}; + chunk_array.v = push_array_no_zero(scratch.arena, LNK_ChunkPtr, chunk->u.list->count); + for (LNK_ChunkNode *cptr = chunk->u.list->first; cptr != 0; cptr = cptr->next) { + chunk_array.v[chunk_array.count++] = cptr->data; + } + + if (chunk->sort_chunk) { + lnk_chunk_array_sort(chunk_array); + } + + // recurse into list chunk + struct Stack *frame = push_array_no_zero(scratch.arena, struct Stack, 1); + frame->next = 0; + frame->chunk_array = chunk_array; + frame->ichunk = 0; + SLLStackPush(stack, frame); + } goto yeild; + + case LNK_Chunk_Null: { /* ignore */ } break; + } + } + + // terminate series + if (stack->next) { + struct Stack *prev = stack->next; + Assert(prev->ichunk > 0); + + // :end_chunk_series + LNK_ChunkOp *end_virt_op = lnk_push_chunk_op_end_virt(arena); + LNK_ChunkOp *align_op = lnk_push_chunk_op_align(arena, prev->chunk_array.v[prev->ichunk - 1]->align, align_byte); + LNK_ChunkOp *end_op = lnk_push_chunk_op_end_file(arena); + 
lnk_chunk_op_list_push_node(&list, end_virt_op); + lnk_chunk_op_list_push_node(&list, align_op); + lnk_chunk_op_list_push_node(&list, end_op); + } + + // move to next frame + SLLStackPop(stack); + + yeild:; + } + + scratch_end(scratch); + ProfEnd(); + return list; +} + +internal LNK_ChunkLayout +lnk_chunk_layout_from_op_list(Arena *arena, LNK_ChunkOpList op_list, B32 is_data_inited) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + // setup stack + struct Stack { + struct Stack *next; + U64 chunk_id; + U64 cursor; + } *stack = 0; + + // setup state + U64 cursor = 0; + String8List data_list = {0}; + + // setup output + U64 *chunk_off_array = push_array_no_zero(arena, U64, op_list.total_chunk_count); + U64 *chunk_file_size_array = push_array_no_zero(arena, U64, op_list.total_chunk_count); + U64 *chunk_virt_size_array = push_array_no_zero(arena, U64, op_list.total_chunk_count); + + // debug stomp so discarded chunks map to invalid offset +#if LNK_PARANOID + MemorySet(chunk_off_array, 0xFF, sizeof(*chunk_off_array) * op_list.total_chunk_count); + MemorySet(chunk_file_size_array, 0xFF, sizeof(*chunk_file_size_array) * op_list.total_chunk_count); + MemorySet(chunk_virt_size_array, 0xFF, sizeof(*chunk_virt_size_array) * op_list.total_chunk_count); +#endif + + // execute opcodes + for (LNK_ChunkOp *op = op_list.first; op != NULL; op = op->next) { + switch (op->type) { + case LNK_ChunkOp_Null: break; + case LNK_ChunkOp_Begin: { + struct Stack *frame = push_array(scratch.arena, struct Stack, 1); + frame->chunk_id = op->u.chunk_id; + frame->cursor = cursor; + SLLStackPush(stack, frame); + chunk_off_array[stack->chunk_id] = stack->cursor; + } break; + case LNK_ChunkOp_End: { + chunk_file_size_array[stack->chunk_id] = cursor - stack->cursor; + SLLStackPop(stack); + } break; + case LNK_ChunkOp_EndVirt: { + chunk_virt_size_array[stack->chunk_id] = cursor - stack->cursor; + } break; + case LNK_ChunkOp_Align: { + Assert(IsPow2(op->u.align.x)); + U64 size = 
AlignPow2(cursor, op->u.align.x) - cursor; + + String8 string; + string.size = size; + string.str = push_array_no_zero(arena, U8, string.size); + MemorySet(string.str, op->u.align.val, string.size); + + op->type = LNK_ChunkOp_WriteString; + op->u.string = string; + } // fall-through + case LNK_ChunkOp_WriteString: { + if (is_data_inited) { + // we allow chunks to have null for str for regions in the image that are zeroed out. + if (op->u.string.str == 0) { + op->u.string.str = push_array(arena, U8, op->u.string.size); + } + str8_list_push(scratch.arena, &data_list, op->u.string); + } +#if LNK_DUMP_CHUNK_LAYOUT + if (op->chunk) { + fprintf(g_layout_file, "[%.*s] %llX %.*s\n", str8_varg(op->chunk->sort_idx), op->chunk->input_idx, str8_varg(op->chunk->debug)); + } +#endif + // advance + cursor += op->u.string.size; + } break; + } + } + + // are begin/end series opcodes balanced? + Assert(stack == 0); + + // fill out result + LNK_ChunkLayout layout = {0}; + layout.data = str8_list_join(arena, &data_list, 0); + layout.chunk_off_array = chunk_off_array; + layout.chunk_file_size_array = chunk_file_size_array; + layout.chunk_virt_size_array = chunk_virt_size_array; + + scratch_end(scratch); + ProfEnd(); + return layout; +} + +internal LNK_ChunkLayout +lnk_build_chunk_layout(Arena *arena, LNK_ChunkManager *cman, COFF_SectionFlags flags, U8 align_byte) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + // should we write data for chunks? 
+ B32 is_data_inited = !!(~flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA); + + // build layout + LNK_ChunkOpList op_list = lnk_op_list_from_chunk(scratch.arena, cman->root, cman->total_chunk_count, align_byte); + LNK_ChunkLayout layout = lnk_chunk_layout_from_op_list(arena, op_list, is_data_inited); + + scratch_end(scratch); + ProfEnd(); + return layout; +} + +internal B32 +lnk_visit_chunks_(U64 sect_id, LNK_Chunk *chunk, LNK_ChunkVisitorSig *cb, void *ud) +{ + // visit chunk + B32 is_done = cb(sect_id, chunk, ud); + if (is_done) { + return is_done; + } + + switch (chunk->type) { + case LNK_Chunk_Null: + case LNK_Chunk_Leaf: { + // reached leaf + } break; + case LNK_Chunk_LeafArray: { + for (U64 idx = 0; idx < chunk->u.arr->count; idx += 1) { + is_done = lnk_visit_chunks_(sect_id, chunk->u.arr->v[idx], cb, ud); + if (is_done) { + break; + } + } + } break; + case LNK_Chunk_List: { + for (LNK_ChunkNode *i = chunk->u.list->first; i != 0; i = i->next) { + is_done = lnk_visit_chunks_(sect_id, i->data, cb, ud); + if (is_done) { + break; + } + } + } break; + } + + return is_done; +} + +internal void +lnk_visit_chunks(U64 sect_id, LNK_Chunk *chunk, LNK_ChunkVisitorSig *cb, void *ud) +{ + lnk_visit_chunks_(sect_id, chunk, cb, ud); +} + +LNK_CHUNK_VISITOR_SIG(lnk_save_chunk_ptr) +{ + LNK_Chunk **id_map = (LNK_Chunk **)ud; + if (!chunk->is_discarded) { + id_map[chunk->ref.chunk_id] = chunk; + } + return 0; +} + +internal LNK_ChunkPtr * +lnk_make_chunk_id_map(Arena *arena, LNK_ChunkManager *cman) +{ + LNK_ChunkPtr *chunk_id_map = push_array_no_zero(arena, LNK_ChunkPtr, cman->total_chunk_count + 1); + lnk_visit_chunks(0, cman->root, lnk_save_chunk_ptr, chunk_id_map); + + LNK_Chunk *null_chunk = push_array(arena, LNK_Chunk, 1); + null_chunk->is_discarded = 1; + + chunk_id_map[0] = null_chunk; + + return chunk_id_map; +} + +internal LNK_ChunkNode * +lnk_chunk_ptr_list_reserve(Arena *arena, LNK_ChunkList *list, U64 count) +{ + LNK_ChunkNode *arr = 0; + if (count) { + arr = 
push_array(arena, LNK_ChunkNode, count); + LNK_Chunk *chunk_arr = push_array(arena, LNK_Chunk, count); + for (U64 i = 0; i < count; i += 1) { + arr[i].data = &chunk_arr[i]; + SLLQueuePush(list->first, list->last, &arr[i]); + } + list->count += count; + } + return arr; +} + +internal String8Array +lnk_data_arr_from_chunk_ptr_list(Arena *arena, LNK_ChunkList list) +{ + String8Array arr = {0}; + arr.v = push_array(arena, String8, list.count); + for (LNK_ChunkNode *n = list.first; n != 0; n = n->next) { + LNK_ChunkPtr c = n->data; + Assert(c->type == LNK_Chunk_Leaf); + arr.v[arr.count] = c->u.leaf; + arr.count += 1; + } + return arr; +} + +internal String8Array * +lnk_data_arr_from_chunk_ptr_list_arr(Arena *arena, LNK_ChunkList *list_arr, U64 count) +{ + String8Array *result = push_array(arena, String8Array, count); + for (U64 i = 0; i < count; i += 1) { + result[i] = lnk_data_arr_from_chunk_ptr_list(arena, list_arr[i]); + } + return result; +} + diff --git a/src/linker/lnk_chunk.h b/src/linker/lnk_chunk.h new file mode 100644 index 00000000..072fa85c --- /dev/null +++ b/src/linker/lnk_chunk.h @@ -0,0 +1,164 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +//////////////////////////////// + +#define LNK_DEBUG_CHUNKS 0 +#define LNK_DUMP_CHUNK_LAYOUT 0 + +//////////////////////////////// + +typedef struct LNK_ChunkRef +{ + U64 sect_id; + U64 chunk_id; +} LNK_ChunkRef; + +typedef enum +{ + LNK_Chunk_Null, + LNK_Chunk_Leaf, + LNK_Chunk_LeafArray, + LNK_Chunk_List, +} LNK_ChunkType; + +typedef struct LNK_Chunk +{ + LNK_ChunkRef ref; + LNK_ChunkType type; + U64 align; + B32 is_discarded; + B32 sort_chunk; + String8 sort_idx; + U64 input_idx; + COFF_SectionFlags flags; + struct LNK_Chunk *associate; + union { + String8 leaf; + struct LNK_ChunkList *list; + struct LNK_ChunkArray *arr; + } u; +#if LNK_DEBUG_CHUNKS + String8 debug; +#endif +} LNK_Chunk, * LNK_ChunkPtr; + +typedef struct 
LNK_ChunkNode +{ + struct LNK_ChunkNode *next; + LNK_ChunkPtr data; +} LNK_ChunkNode; + +typedef struct LNK_ChunkArray +{ + U64 count; + LNK_ChunkPtr *v; +} LNK_ChunkArray; + +typedef struct LNK_ChunkList +{ + U64 count; + LNK_ChunkNode *first; + LNK_ChunkNode *last; +} LNK_ChunkList; + +typedef enum LNK_ChunkOpType +{ + LNK_ChunkOp_Null, + LNK_ChunkOp_WriteString, + LNK_ChunkOp_Align, + LNK_ChunkOp_Begin, + LNK_ChunkOp_End, + LNK_ChunkOp_EndVirt, +} LNK_ChunkOpType; + +typedef struct LNK_ChunkOp +{ + struct LNK_ChunkOp *next; + LNK_ChunkOpType type; + union { + String8 string; + U64 chunk_id; + struct { + U64 val; + U64 x; + } align; + LNK_Chunk *leaf; + } u; +#if LNK_DUMP_CHUNK_LAYOUT + LNK_Chunk *chunk; +#endif +} LNK_ChunkOp; + +typedef struct LNK_ChunkOpList +{ + U64 total_chunk_count; + LNK_ChunkOp *first; + LNK_ChunkOp *last; +} LNK_ChunkOpList; + +typedef struct LNK_ChunkLayout +{ + String8 data; + U64 *chunk_off_array; + U64 *chunk_file_size_array; + U64 *chunk_virt_size_array; +} LNK_ChunkLayout; + +typedef struct LNK_ChunkManager +{ + LNK_Chunk *root; + U64 total_chunk_count; +} LNK_ChunkManager; + +extern LNK_Chunk g_null_chunk; +extern LNK_Chunk *g_null_chunk_ptr; + +internal LNK_ChunkRef lnk_chunk_ref(U64 sect_id, U64 chunk_id); +internal B32 lnk_chunk_ref_is_equal(LNK_ChunkRef a, LNK_ChunkRef b); + +internal LNK_ChunkNode * lnk_chunk_list_push(Arena *arena, LNK_ChunkList *list, LNK_Chunk *chunk); +internal void lnk_chunk_list_concat_in_place(LNK_ChunkList *list, LNK_ChunkList *to_concat); +internal void lnk_chunk_list_concat_in_place_arr(LNK_ChunkList *list, LNK_ChunkList *arr, U64 count); +internal LNK_ChunkList ** lnk_make_chunk_list_arr_arr(Arena *arena, U64 slot_count, U64 per_count); +internal void lnk_chunk_array_sort(LNK_ChunkArray arr); + +internal LNK_ChunkManager * lnk_chunk_manager_alloc(Arena *arena, U64 id, U64 align); +internal LNK_Chunk * lnk_chunk_push(Arena *arena, LNK_ChunkManager *cman, LNK_Chunk *parent, String8 sort_index); 
+internal LNK_Chunk * lnk_chunk_push_leaf(Arena *arena, LNK_ChunkManager *cman, LNK_Chunk *parent, String8 sort_index, void *raw_ptr, U64 raw_size); +internal LNK_Chunk * lnk_chunk_push_list(Arena *arena, LNK_ChunkManager *cman, LNK_Chunk *parent, String8 sort_index); +internal LNK_ChunkNode * lnk_chunk_deep_copy(Arena *arena, LNK_Chunk *chunk); +internal LNK_ChunkNode * lnk_merge_chunks(Arena *arena, LNK_ChunkManager *dst_cman, LNK_Chunk *dst, LNK_Chunk *src, U64 *id_map_out, U64 id_map_max); +internal void lnk_chunk_associate(Arena *arena, LNK_Chunk *head, LNK_Chunk *associate); +internal B32 lnk_chunk_is_discarded(LNK_Chunk *chunk); +internal U64 lnk_chunk_get_size(LNK_Chunk *chunk); +internal U64 lnk_chunk_list_get_node_count(LNK_Chunk *chunk); + +internal void lnk_chunk_op_list_push_node(LNK_ChunkOpList *list, LNK_ChunkOp *op); + +internal LNK_ChunkOp * lnk_push_chunk_op_begin(Arena *arena, U64 chunk_id); +internal LNK_ChunkOp * lnk_push_chunk_op_end_virt(Arena *arena); +internal LNK_ChunkOp * lnk_push_chunk_op_end_file(Arena *arena); +internal LNK_ChunkOp * lnk_push_chunk_op_align(Arena *arena, U64 align, U64 val); +internal LNK_ChunkOp * lnk_push_chunk_op_write(Arena *arena, String8 string); + +internal LNK_ChunkOpList lnk_op_list_from_chunk(Arena *arena, LNK_Chunk *root, U64 total_chunk_count, U8 align_byte); +internal LNK_ChunkLayout lnk_chunk_layout_from_op_list(Arena *arena, LNK_ChunkOpList op_list, B32 is_data_inited); +internal LNK_ChunkLayout lnk_build_chunk_layout(Arena *arena, LNK_ChunkManager *cman, COFF_SectionFlags flags, U8 align_byte); + +#define LNK_CHUNK_VISITOR_SIG(name) B32 name(U64 sect_id, LNK_Chunk *chunk, void *ud) +typedef LNK_CHUNK_VISITOR_SIG(LNK_ChunkVisitorSig); +internal void lnk_visit_chunks(U64 sect_id, LNK_Chunk *root, LNK_ChunkVisitorSig *cb, void *ud); + +internal LNK_ChunkNode * lnk_chunk_ptr_list_reserve(Arena *arena, LNK_ChunkList *list, U64 count); +internal String8Array lnk_data_arr_from_chunk_ptr_list(Arena *arena, 
LNK_ChunkList list); +internal String8Array * lnk_data_arr_from_chunk_ptr_list_arr(Arena *arena, LNK_ChunkList *list_arr, U64 count); + +#if LNK_DEBUG_CHUNKS +#define lnk_chunk_set_debugf(a, c, f, ...) do { (c)->debug = push_str8f((a), f, __VA_ARGS__); } while(0) +#else +#define lnk_chunk_set_debugf(a, c, f, ...) (void)(c) +#endif + diff --git a/src/linker/lnk_cmd_line.c b/src/linker/lnk_cmd_line.c new file mode 100644 index 00000000..69942dd9 --- /dev/null +++ b/src/linker/lnk_cmd_line.c @@ -0,0 +1,274 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal String8List +lnk_arg_list_parse_windows_rules(Arena *arena, String8 string) +{ + Temp scratch = scratch_begin(&arena, 1); + + String8List list = {0}; + + U8 *ptr = string.str; + U8 *opl = string.str + string.size; + while (ptr < opl) { + // skip white space and new lines + for (;;) { + U64 size = (U64)(opl - ptr); + UnicodeDecode uni = utf8_decode(ptr, size); + if (uni.codepoint != ' ' && uni.codepoint != '\n' && uni.codepoint != '\r') { + break; + } + ptr += uni.inc; + } + + if (*ptr == '\0') { + break; + } + + String8List token_builder = {0}; + U8 *anchor = ptr; + while (ptr < opl) { + UnicodeDecode uni; + + uni = utf8_decode(ptr, (U64)(opl-ptr)); + if (uni.codepoint == '\0' || uni.codepoint == '\n' || uni.codepoint == '\r' || uni.codepoint == ' ') { + break; + } + + // handle string and strip quotes + if (uni.codepoint == '"') { + String8 text_before_quote = str8(anchor, (U64)(ptr - anchor)); + str8_list_push(scratch.arena, &token_builder, text_before_quote); + + // advance past starting quote + ptr += uni.inc; + anchor = ptr; + + U8 *quote_end = ptr; + while (ptr < opl) { + uni = utf8_decode(ptr, (U64)(opl - ptr)); + ptr += uni.inc; + // skip escape char + if (uni.codepoint == '\\') { + uni = utf8_decode(ptr, (U64)(opl - ptr)); + ptr += uni.inc; + } else if (uni.codepoint == '"' || uni.codepoint == '\0') { + break; // found matching 
quote char + } + quote_end = ptr; + } + + String8 text_inside_quotes = str8(anchor, (U64)(quote_end - anchor)); + str8_list_push(scratch.arena, &token_builder, text_inside_quotes); + anchor = ptr; + } else { + ptr += uni.inc; + } + } + + // push remaining text + String8 text = str8(anchor, (U64)(ptr - anchor)); + str8_list_push(scratch.arena, &token_builder, text); + + // push token + String8 token = str8_list_join(arena, &token_builder, NULL); + if (token.size) { + str8_list_push(arena, &list, token); + } + } + + scratch_end(scratch); + return list; +} + +internal void +lnk_cmd_line_push_option_node(LNK_CmdLine *cmd_line, LNK_CmdOption *opt) +{ + SLLQueuePush(cmd_line->first_option, cmd_line->last_option, opt); + cmd_line->option_count += 1; +} + +internal LNK_CmdOption * +lnk_cmd_line_push_option_list(Arena *arena, LNK_CmdLine *cmd_line, String8 string, String8List value_strings) +{ + // fill out node + LNK_CmdOption *opt = push_array_no_zero(arena, LNK_CmdOption, 1); + opt->next = 0; + opt->string = string; + opt->value_strings = value_strings; + + // push node + lnk_cmd_line_push_option_node(cmd_line, opt); + + return opt; +} + +internal LNK_CmdOption * +lnk_cmd_line_push_option_string(Arena *arena, LNK_CmdLine *cmd_line, String8 string, String8 value) +{ + String8List value_list = str8_split_by_string_chars(arena, value, str8_lit(","), StringSplitFlag_KeepEmpties); + LNK_CmdOption *opt = lnk_cmd_line_push_option_list(arena, cmd_line, string, value_list); + return opt; +} + +internal LNK_CmdOption * +lnk_cmd_line_push_option(Arena *arena, LNK_CmdLine *cmd_line, char *string, char *value) +{ + return lnk_cmd_line_push_option_string(arena, cmd_line, str8_cstring(string), str8_cstring(value)); +} + +internal LNK_CmdOption * +lnk_cmd_line_push_option_if_not_present(Arena *arena, LNK_CmdLine *cmd_line, char *string, char *value) +{ + if (!lnk_cmd_line_has_option(*cmd_line, string)) { + return lnk_cmd_line_push_option(arena, cmd_line, string, value); + } + return 0; 
+} + +internal LNK_CmdLine +lnk_cmd_line_parse_windows_rules(Arena *arena, String8List arg_list) +{ + Temp scratch = scratch_begin(&arena, 1); + + LNK_CmdLine cmd_line = {0}; + + for (String8Node *arg_node = arg_list.first; arg_node != 0; arg_node = arg_node->next) { + String8 arg = arg_node->string; + B32 is_option = str8_match(str8_lit("/"), arg, StringMatchFlag_RightSideSloppy) || + str8_match(str8_lit("-"), arg, StringMatchFlag_RightSideSloppy); + if (is_option) { + U64 param_start_pos = str8_find_needle(arg, 0, str8_lit(":"), 0); + String8 option_name = str8_chop(arg, arg.size - param_start_pos); + + // remove '/' or '-' from option name + option_name = str8_skip(option_name, 1); + + // skip ':' + String8 value_string = str8_skip(arg, param_start_pos + 1); + + // make value list + String8List value_list = str8_split_by_string_chars(arena, value_string, str8_lit(","), 0); + + // push command + lnk_cmd_line_push_option_list(arena, &cmd_line, option_name, value_list); + } else { + str8_list_push(arena, &cmd_line.input_list, arg); + } + } + + scratch_end(scratch); + return cmd_line; +} + +internal LNK_CmdOption * +lnk_cmd_line_option_from_string(LNK_CmdLine cmd_line, String8 string) +{ + LNK_CmdOption *opt; + for (opt = cmd_line.first_option; opt != NULL; opt = opt->next) { + if (str8_match(string, opt->string, StringMatchFlag_CaseInsensitive)) { + break; + } + } + return opt; +} + +internal B32 +lnk_cmd_line_has_option_string(LNK_CmdLine cmd_line, String8 string) +{ + LNK_CmdOption *opt = lnk_cmd_line_option_from_string(cmd_line, string); + B32 has_option = (opt != 0); + return has_option; +} + +internal B32 +lnk_cmd_line_has_option(LNK_CmdLine cmd_line, char *string) +{ + return lnk_cmd_line_has_option_string(cmd_line, str8_cstring(string)); +} + +internal String8List +lnk_unwrap_rsp(Arena *arena, String8List arg_list) +{ + Temp scratch = scratch_begin(&arena, 1); + + String8List result = {0}; + + for (String8Node *curr = arg_list.first; curr != 0; curr = 
curr->next) { + B32 is_rsp = str8_match(str8_lit("@"), curr->string, StringMatchFlag_RightSideSloppy); + if (is_rsp) { + // remove "@" + String8 name = str8_skip(curr->string, 1); + + if (os_file_path_exists(name)) { + // read rsp from disk + String8 file = os_data_from_file_path(scratch.arena, name); + + // parse rsp + String8List rsp_args = lnk_arg_list_parse_windows_rules(scratch.arena, file); + + // handle case where rsp references another rsp + String8List list = lnk_unwrap_rsp(arena, rsp_args); + + // push arguments from rsp + list = str8_list_copy(arena, &list); + str8_list_concat_in_place(&result, &list); + } else { + lnk_error(LNK_Error_Cmdl, "unable to find rsp: %S", name); + } + } else { + // push regular argument + String8 str = push_str8_copy(arena, curr->string); + str8_list_push(arena, &result, str); + } + } + + scratch_end(scratch); + return result; +} + +internal String8List +lnk_data_from_cmd_line(Arena *arena, LNK_CmdLine cmd_line) +{ + String8List result = {0}; + + for (LNK_CmdOption *opt = cmd_line.first_option; opt != 0; opt = opt->next) { + // separate directives + if (opt != cmd_line.first_option) { + str8_list_pushf(arena, &result, " "); + } + + // push new directive + str8_list_pushf(arena, &result, "/%.*s", str8_varg(opt->string)); + + // do we have arguments? 
+ if (opt->value_strings.node_count > 0) { + str8_list_pushf(arena, &result, ":"); + + for (String8Node *value_node = opt->value_strings.first; value_node != 0; value_node = value_node->next) { + // separate arguments + if (value_node != opt->value_strings.first) { + str8_list_pushf(arena, &result, ","); + } + + // push argument + B32 has_spaces = str8_find_needle(value_node->string, 0, str8_lit(" "), StringMatchFlag_CaseInsensitive) < value_node->string.size; + if (has_spaces) { + str8_list_pushf(arena, &result, "\"%.*s\"", str8_varg(value_node->string)); + } else { + str8_list_pushf(arena, &result, "%.*s", str8_varg(value_node->string)); + } + } + } + } + + // append inputs + for (String8Node *input_node = cmd_line.input_list.first; input_node != 0; input_node = input_node->next) { + if (input_node != cmd_line.input_list.first) { + str8_list_pushf(arena, &result, " "); + } + str8_list_pushf(arena, &result, "\"%.*s\"", str8_varg(input_node->string)); + } + + return result; +} diff --git a/src/linker/lnk_cmd_line.h b/src/linker/lnk_cmd_line.h new file mode 100644 index 00000000..2d35832f --- /dev/null +++ b/src/linker/lnk_cmd_line.h @@ -0,0 +1,33 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef struct LNK_CmdOption +{ + struct LNK_CmdOption *next; + String8 string; + String8List value_strings; +} LNK_CmdOption; + +typedef struct LNK_CmdLine +{ + U64 option_count; + LNK_CmdOption *first_option; + LNK_CmdOption *last_option; + String8List input_list; +} LNK_CmdLine; + +internal String8List lnk_arg_list_parse_windows_rules(Arena *arena, String8 string); +internal LNK_CmdLine lnk_cmd_line_parse_windows_rules(Arena *arena, String8List arg_list); +internal LNK_CmdOption * lnk_cmd_line_option_from_string(LNK_CmdLine cmd_line, String8 string); +internal B32 lnk_cmd_line_has_option_string(LNK_CmdLine cmd_line, String8 string); +internal B32 lnk_cmd_line_has_option(LNK_CmdLine 
cmd_line, char *string); + +internal LNK_CmdOption * lnk_cmd_line_push_option(Arena *arena, LNK_CmdLine *cmd_line, char *string, char *value); +internal LNK_CmdOption * lnk_cmd_line_push_option_if_not_present(Arena *arena, LNK_CmdLine *cmd_line, char *string, char *value); + +internal String8List lnk_unwrap_rsp(Arena *arena, String8List arg_list); + +internal String8List lnk_data_from_cmd_line(Arena *arena, LNK_CmdLine cmd_line); + diff --git a/src/linker/lnk_config.c b/src/linker/lnk_config.c new file mode 100644 index 00000000..dd50c19b --- /dev/null +++ b/src/linker/lnk_config.c @@ -0,0 +1,1790 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +//////////////////////////////// +// Enum <-> String + +read_only struct +{ + LNK_CmdSwitchType type; + char *name; + char *args; + char *desc; +} g_cmd_switch_map[] = { + { LNK_CmdSwitch_Null, "", "", "" }, + { LNK_CmdSwitch_NotImplemented, "NOT_IMPLEMENTED", "", "" }, + { LNK_CmdSwitch_Align, "ALIGN", ":#", "" }, + { LNK_CmdSwitch_AllowBind, "ALLOWBIND", "[:NO]", "" }, + { LNK_CmdSwitch_AllowIsolation, "ALLOWISOLATION", "[:NO]", "" }, + { LNK_CmdSwitch_AlternateName, "ALTERNATENAME", "Creates an a symbol alias \"FROM=TO\"." 
},
  // Rows are { switch type, switch name, argument syntax, description }.
  // All four fields are spelled out on every row so that neither `args`
  // nor `desc` is ever left as a null pointer.
  { LNK_CmdSwitch_AppContainer, "APPCONTAINER", "[:NO]", "" },
  { LNK_CmdSwitch_NotImplemented, "ASSEMBLYDEBUG", "", "" }, // .NET
  { LNK_CmdSwitch_NotImplemented, "ASSEMBLYLINKRESOURCE", "", "" }, // .NET
  { LNK_CmdSwitch_NotImplemented, "ASSEMBLYMODULE", "", "" }, // .NET
  { LNK_CmdSwitch_NotImplemented, "ASSEMBLYRESOURCE", "", "" }, // .NET
  { LNK_CmdSwitch_Base, "BASE", "{ADDRESS[,SIZE]|@FILENAME,KEY}", "" },
  { LNK_CmdSwitch_NotImplemented, "CLRIMAGETYPE", "", "" }, // .NET
  { LNK_CmdSwitch_NotImplemented, "CLRLOADEROPTIMIZATION","", "" }, // .NET
  { LNK_CmdSwitch_NotImplemented, "CLRSUPPORTLASTERROR", "", "" }, // .NET
  { LNK_CmdSwitch_NotImplemented, "CLRTHREADATTRIBUTE", "", "" }, // .NET
  { LNK_CmdSwitch_NotImplemented, "CLRUNMANAGEDCODECHECK","", "" }, // .NET
  { LNK_CmdSwitch_Debug, "DEBUG", "[:{FULL|NONE}]", "" },
  { LNK_CmdSwitch_NotImplemented, "DEF", ":FILENAME", "" },
  { LNK_CmdSwitch_DefaultLib, "DEFAULTLIB", ":LIBNAME", "" },
  { LNK_CmdSwitch_Delay, "DELAY", ":{NOBIND|UNLOAD}", "" },
  { LNK_CmdSwitch_DelayLoad, "DELAYLOAD", ":DLL", "" },
  { LNK_CmdSwitch_NotImplemented, "DELAYSIGN", "", "" },
  { LNK_CmdSwitch_NotImplemented, "DEPENDENTLOADFLAG", "", "" },
  { LNK_CmdSwitch_Dll, "DLL", "", "" },
  { LNK_CmdSwitch_NotImplemented, "DRIVER", "", "" },
  { LNK_CmdSwitch_DisallowLib, "DISALLOWLIB", ":LIBRARY", "", },
  { LNK_CmdSwitch_DynamicBase, "DYNAMICBASE", "[:NO]", "" },
  { LNK_CmdSwitch_NotImplemented, "EMITVOLATILEMETADATA", "", "" },
  { LNK_CmdSwitch_Entry, "ENTRY", ":FUNCTION", "" },
  { LNK_CmdSwitch_Null, "ERRORREPORT", "", "Deprecated starting Windows Vista." },
  { LNK_CmdSwitch_NotImplemented, "EXPORT", ":SYMBOL", "" },
  { LNK_CmdSwitch_NotImplemented, "EXPORTADMIN", "", "" },
  { LNK_CmdSwitch_FastFail, "FASTFAIL", "", "Not used." },
  { LNK_CmdSwitch_NotImplemented, "FASTGENPROFILE", "", "" },
  { LNK_CmdSwitch_FileAlign, "FILEALIGN", ":#", "" },
  { LNK_CmdSwitch_Fixed, "FIXED", "[:NO]", "" },
  { LNK_CmdSwitch_NotImplemented, "FORCE", "", "" },
  { LNK_CmdSwitch_FunctionPadMin, "FUNCTIONPADMIN", ":#", "Not Implemented" },
  { LNK_CmdSwitch_NotImplemented, "GUARD", "", "" },
  { LNK_CmdSwitch_NotImplemented, "GENPROFILE", "", "" },
  { LNK_CmdSwitch_Heap, "HEAP", ":RESERVE[,COMMIT]", "" }, // same syntax as STACK
  { LNK_CmdSwitch_HighEntropyVa, "HIGHENTROPYVA", "[:NO]", "" },
  { LNK_CmdSwitch_NotImplemented, "IDLOUT", "", "" },
  { LNK_CmdSwitch_Ignore, "IGNORE", ":#", "" },
  { LNK_CmdSwitch_NotImplemented, "IGNOREIDL", "", "" },
  { LNK_CmdSwitch_NotImplemented, "ILK", "", "" },
  { LNK_CmdSwitch_ImpLib, "IMPLIB", ":FILENAME", "" },
  { LNK_CmdSwitch_Include, "INCLUDE", "", "" },
  { LNK_CmdSwitch_Incremental, "INCREMENTAL", "[:NO]", "Incremental linking is not supported." },
  { LNK_CmdSwitch_NotImplemented, "INTEGRITYCHECK", "", "" },
  { LNK_CmdSwitch_NotImplemented, "KERNEL", "", "" },
  { LNK_CmdSwitch_NotImplemented, "KEYCONTAINER", "", "" },
  { LNK_CmdSwitch_NotImplemented, "KEYFILE", "", "" },
  { LNK_CmdSwitch_LargeAddressAware, "LARGEADDRESSAWARE", "[:NO]", "" },
  { LNK_CmdSwitch_LibPath, "LIBPATH", ":DIR", "" },
  { LNK_CmdSwitch_NotImplemented, "LINKERREPO", "", "" },
  { LNK_CmdSwitch_NotImplemented, "LINKERREPOTARGET", "", "" },
  { LNK_CmdSwitch_NotImplemented, "LTCG", "", "" },
  { LNK_CmdSwitch_NotImplemented, "LTCGOUT", "", "" },
  { LNK_CmdSwitch_Machine, "MACHINE", ":{X64|X86}", "" },
  { LNK_CmdSwitch_Manifest, "MANIFEST", "[:{EMBED[,ID=#]|NO}]", "" },
  { LNK_CmdSwitch_ManifestDependency, "MANIFESTDEPENDENCY", ":\"manifest dependency XML string\"", "" },
  { LNK_CmdSwitch_ManifestFile, "MANIFESTFILE", ":FILENAME", "" },
  { LNK_CmdSwitch_ManifestInput, "MANIFESTINPUT", ":FILENAME", "" },
  { LNK_CmdSwitch_ManifestUac, "MANIFESTUAC", ":{NO|{'level'={'asInvoker'|'highestAvailable'|'requireAdministrator'} ['uiAccess'={'true'|'false'}]}}", "" },
  { LNK_CmdSwitch_NotImplemented, "MAP", "", "" },
  { LNK_CmdSwitch_NotImplemented, "MAPINFO", "", "" },
  { LNK_CmdSwitch_NotImplemented, "MERGE", "", "" },
  { LNK_CmdSwitch_NotImplemented, "MIDL", "", "" },
  { LNK_CmdSwitch_Natvis, "NATVIS", ":FILENAME", "" },
  { LNK_CmdSwitch_NotImplemented, "NOASSEMBLY", "", "" },
  { LNK_CmdSwitch_NoDefaultLib, "NODEFAULTLIB", ":LIBNAME", "" },
  { LNK_CmdSwitch_NotImplemented, "NOENTRY", "", "" },
  { LNK_CmdSwitch_NotImplemented, "NOEXP", "", "" },
  { LNK_CmdSwitch_NotImplemented, "NOIMPLIB", "", "" },
  { LNK_CmdSwitch_NoLogo, "NOLOGO", "", "" },
  { LNK_CmdSwitch_NxCompat, "NXCOMPAT", "[:NO]", "" },
  { LNK_CmdSwitch_Opt, "OPT", "", "" },
  { LNK_CmdSwitch_NotImplemented, "ORDER", "", "" },
  { LNK_CmdSwitch_Out, "OUT", ":FILENAME", "" },
  { LNK_CmdSwitch_Pdb, "PDB", ":FILENAME", "" },
  { LNK_CmdSwitch_NotImplemented, "PDBSTRIPPED", "", "" },
  { LNK_CmdSwitch_PdbPageSize, "PDBPAGESIZE", ":#", "Page size must be power of two" },
  { LNK_CmdSwitch_NotImplemented, "PROFILE", "", "" },
  { LNK_CmdSwitch_Release, "RELEASE", "", "" },
  { LNK_CmdSwitch_NotImplemented, "SAFESEH", "", "" },
  { LNK_CmdSwitch_NotImplemented, "SECTION", "", "" },
  { LNK_CmdSwitch_NotImplemented, "SOURCELINK", "", "" },
  { LNK_CmdSwitch_Stack, "STACK", ":RESERVE[,COMMIT]", "" },
  { LNK_CmdSwitch_NotImplemented, "STUB", "", "" },
  { LNK_CmdSwitch_SubSystem, "SUBSYSTEM", ":{CONSOLE|NATIVE|WINDOWS}[,#[.##]]", "" },
  { LNK_CmdSwitch_NotImplemented, "SWAPRUN", "", "" },
  { LNK_CmdSwitch_NotImplemented, "TLBID", "", "" },
  { LNK_CmdSwitch_NotImplemented, "TLBOUT", "", "" },
  { LNK_CmdSwitch_NotImplemented, "TIME", "", "" },
  { LNK_CmdSwitch_TsAware, "TSAWARE", "[:NO]", "" },
  { LNK_CmdSwitch_NotImplemented, "USERPROFILE", "", "" },
  { LNK_CmdSwitch_NotImplemented, "VERBOSE", "", "" },
  { LNK_CmdSwitch_Version, "VERSION", "", "" },
  { LNK_CmdSwitch_NotImplemented, "WINMD", "", "" },
  { LNK_CmdSwitch_NotImplemented, "WINMDDELAYSIGN", "", "" },
  { LNK_CmdSwitch_NotImplemented, "WINMDKEYCONTAINER", "", "" },
  { LNK_CmdSwitch_NotImplemented, "WINMDKEYFILE", "", "" },
  { LNK_CmdSwitch_NotImplemented, "WHOLEARCHIVE", "", "" },
  { LNK_CmdSwitch_NotImplemented, "WX", "", "" },

  //- internal switches
  { LNK_CmdSwitch_Rad_Age, "RAD_AGE", ":#", "Age embeded in EXE and PDB, used to validate incremental build. Default is 1." },
  { LNK_CmdSwitch_Rad_BuildInfo, "RAD_BUILD_INFO", "", "Print build info and exit." },
  { LNK_CmdSwitch_Rad_CheckUnusedDelayLoadDll, "RAD_CHECK_UNUSED_DELAY_LOAD_DLL", "[:NO]", "" },
  { LNK_CmdSwitch_Rad_Debug, "RAD_DEBUG", "[:NO]", "Emit RAD debug info file." },
  { LNK_CmdSwitch_Rad_DebugName, "RAD_DEBUG_NAME", ":FILENAME", "Sets file name for RAD debug info file." },
  { LNK_CmdSwitch_Rad_DelayBind, "RAD_DELAY_BIND", "[:NO]", "" },
  { LNK_CmdSwitch_Rad_DeleteManifest, "RAD_DELETE_MANIFEST", "[:NO]", "" },
  { LNK_CmdSwitch_Rad_DoMerge, "RAD_DO_MERGE", "[:NO]", "" },
  { LNK_CmdSwitch_Rad_EnvLib, "RAD_ENV_LIB", "[:NO]", "" },
  { LNK_CmdSwitch_Rad_Exe, "RAD_EXE", "[:NO]", "" },
  { LNK_CmdSwitch_Rad_Guid, "RAD_GUID", ":{IMAGEBLAKE3|XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXXXXXX}", "" },
  { LNK_CmdSwitch_Rad_LargePages, "RAD_LARGE_PAGES", "[:NO]", "Disabled by default on Windows." },
  { LNK_CmdSwitch_Rad_LinkVer, "RAD_LINK_VER", ":##,##", "" },
  { LNK_CmdSwitch_Rad_Log, "RAD_LOG", ":{ALL,INPUT_OBJ,INPUT_LIB,IO,LINK_STATS,TIMERS}", "" },
  { LNK_CmdSwitch_Rad_MtPath, "RAD_MT_PATH", ":EXEPATH", "Path to manifest tool." },
  { LNK_CmdSwitch_Rad_OsVer, "RAD_OS_VER", ":##,##", "" },
  { LNK_CmdSwitch_Rad_PageSize, "RAD_PAGE_SIZE", ":#", "Must be power of two." },
  { LNK_CmdSwitch_Rad_PathStyle, "RAD_PATH_STYLE", ":{WindowsAbsolute|UnixAbsolute}", "" },
  { LNK_CmdSwitch_Rad_SectVirtOff, "RAD_SECT_VIRT_OFF", ":#", "Set RVA where section data is placed in memory. For internal use only." },
  { LNK_CmdSwitch_Rad_SuppressError, "RAD_SUPPRESS_ERROR", ":#", "" },
  { LNK_CmdSwitch_Rad_SymbolTableCapDefined, "RAD_SYMBOL_TABLE_CAP_DEFINED", ":#", "Number of buckets allocated in the symbol table for defined symbols." },
  { LNK_CmdSwitch_Rad_SymbolTableCapInternal, "RAD_SYMBOL_TABLE_CAP_INTERNAL", ":#", "Number of buckets allocated in the symbol table for internal symbols." },
  { LNK_CmdSwitch_Rad_SymbolTableCapWeak, "RAD_SYMBOL_TABLE_CAP_WEAK", ":#", "Number of buckets allocated in the symbol table for weak symbols." },
  { LNK_CmdSwitch_Rad_SymbolTableCapLib, "RAD_SYMBOL_TABLE_CAP_LIB", ":#", "Number of buckets allocated in the symbol table for library symbols." },
  { LNK_CmdSwitch_Rad_TargetOs, "RAD_TARGET_OS", ":{WINDOWS,LINUX,MAC}", "" },
  { LNK_CmdSwitch_Rad_TimeStamp, "RAD_TIME_STAMP", ":#", "Time stamp embeded in EXE and PDB." },
  { LNK_CmdSwitch_Rad_Version, "RAD_VERSION", "", "Print version and exit." },
  { LNK_CmdSwitch_Rad_Workers, "RAD_WORKERS", ":#", "Sets number of workers created in the pool. Number is capped at 1024."
}, + + { LNK_CmdSwitch_Help, "HELP", "", "" }, + { LNK_CmdSwitch_Help, "?", "", "" }, +}; + +internal String8 +lnk_string_from_cmd_switch_type(LNK_CmdSwitchType type) +{ + for (U64 cmd_idx = 0; cmd_idx < ArrayCount(g_cmd_switch_map); cmd_idx += 1) { + if (g_cmd_switch_map[cmd_idx].type == type) { + return str8_cstring(g_cmd_switch_map[cmd_idx].name); + } + } + return str8(0,0); +} + +internal LNK_CmdSwitchType +lnk_cmd_switch_type_from_string(String8 string) +{ + LNK_CmdSwitchType type = LNK_CmdSwitch_Null; + for (U64 icmd = 0; icmd < ArrayCount(g_cmd_switch_map); icmd += 1) { + String8 cmd_name = str8_cstring(g_cmd_switch_map[icmd].name); + if (str8_match(cmd_name, string, StringMatchFlag_CaseInsensitive)) { + type = g_cmd_switch_map[icmd].type; + break; + } + } + + return type; +} + +read_only struct { + char *name; + LNK_InputType type; +} g_input_type_map[] = { + { "o", LNK_Input_Obj }, + { "obj", LNK_Input_Obj }, + { "lib", LNK_Input_Lib }, + { "rlib", LNK_Input_Lib }, // rust libs + { "res", LNK_Input_Res }, +}; + +internal LNK_InputType +lnk_input_type_from_string(String8 string) +{ + for (U64 i = 0; i < ArrayCount(g_input_type_map); i += 1) { + if (str8_match(str8_cstring(g_input_type_map[i].name), string, StringMatchFlag_CaseInsensitive)) { + return g_input_type_map[i].type; + } + } + return LNK_Input_Null; +} + +read_only struct +{ + char *name; + LNK_DebugMode mode; +} g_debug_mode_map[] = { + { "null", LNK_DebugMode_Null }, + { "none", LNK_DebugMode_None }, + { "fastlink", LNK_DebugMode_FastLink }, + { "ghash", LNK_DebugMode_GHash }, + { "full", LNK_DebugMode_Full }, +}; + +internal LNK_DebugMode +lnk_debug_mode_from_string(String8 string) +{ + for (U64 i = 0; i < ArrayCount(g_debug_mode_map); i += 1) { + if (str8_match(str8_cstring(g_debug_mode_map[i].name), string, StringMatchFlag_CaseInsensitive)) { + return g_debug_mode_map[i].mode; + } + } + return LNK_DebugMode_Null; +} + +//////////////////////////////// + +internal LNK_CmdOption * 
+lnk_cmd_line_push_option_if_not_presentf(Arena *arena, LNK_CmdLine *cmd_line, LNK_CmdSwitchType cmd_switch_type, char *param_fmt, ...) +{ + LNK_CmdOption *opt = 0; + String8 cmd_switch_name = lnk_string_from_cmd_switch_type(cmd_switch_type); + if (!lnk_cmd_line_has_option_string(*cmd_line, cmd_switch_name)) { + va_list param_args; + va_start(param_args, param_fmt); + String8 param_str = push_str8fv(arena, param_fmt, param_args); + va_end(param_args); + + opt = lnk_cmd_line_push_option_string(arena, cmd_line, cmd_switch_name, param_str); + } + return opt; +} + +internal LNK_CmdOption * +lnk_cmd_line_push_optionf(Arena *arena, LNK_CmdLine *cmd_line, LNK_CmdSwitchType cmd_switch, char *param_fmt, ...) +{ + va_list param_args; + va_start(param_args, param_fmt); + String8 param_str = push_str8fv(arena, param_fmt, param_args); + va_end(param_args); + String8 cmd_switch_name = lnk_string_from_cmd_switch_type(cmd_switch); + LNK_CmdOption *opt = lnk_cmd_line_push_option_string(arena, cmd_line, cmd_switch_name, param_str); + return opt; +} + +internal B32 +lnk_cmd_line_has_switch(LNK_CmdLine cmd_line, LNK_CmdSwitchType cmd_switch) +{ + String8 cmd_switch_name = lnk_string_from_cmd_switch_type(cmd_switch); + return lnk_cmd_line_has_option_string(cmd_line, cmd_switch_name); +} + +//////////////////////////////// + +internal void +lnk_error_cmd_switch(LNK_ErrorCode code, LNK_CmdSwitchType cmd_switch, char *fmt, ...) 
+{ + Temp scratch = scratch_begin(0,0); + + va_list args; + va_start(args, fmt); + + String8 switch_name = lnk_string_from_cmd_switch_type(cmd_switch); + String8 message = push_str8fv(scratch.arena, fmt, args); + String8 output = push_str8f(scratch.arena, "/%S: %S", switch_name, message); + lnk_error(code, "%S", output); + + va_end(args); + + scratch_end(scratch); +} + +internal void +lnk_error_cmd_switch_invalid_param_count(LNK_ErrorCode code, LNK_CmdSwitchType cmd_switch) +{ + lnk_error_cmd_switch(code, cmd_switch, "invalid number of parameters"); +} + +internal void +lnk_error_cmd_switch_invalid_param(LNK_ErrorCode code, LNK_CmdSwitchType cmd_switch, String8 param) +{ + lnk_error_cmd_switch(code, cmd_switch, "invalid parameter \"%S\"", param); +} + +internal String8 +lnk_error_check_and_strip_quotes(LNK_ErrorCode error_code, LNK_CmdSwitchType cmd_switch, String8 string) +{ + String8 result = string; + B32 starts_with_quote = str8_match(str8_lit("\""), string, StringMatchFlag_RightSideSloppy); + if (starts_with_quote) { + if (str8_ends_with(string, str8_lit("\""), 0)) { + result = str8_skip(result, 1); + result = str8_chop(result, 1); + } else { + lnk_error_cmd_switch(error_code, cmd_switch, "detected unmatched \" in \"%S\"", string); + } + } + return result; +} + +internal void +lnk_error_invalid_uac_level_param(LNK_ErrorCode error_code, LNK_CmdSwitchType cmd_switch, String8 input) +{ + lnk_error_cmd_switch(error_code, cmd_switch, "invalid param format, expected \"level={'asInvoker'|'highestAvailable'|'requireAdministrator'}\" but got \"%S\"", input); +} + +internal void +lnk_error_invalid_uac_ui_access_param(LNK_ErrorCode error_code, LNK_CmdSwitchType cmd_switch, String8 input) +{ + lnk_error_cmd_switch(error_code, cmd_switch, "invalid param format, expected \"uiAccess={'true'|'false'}\" but got \"%S\"", input); +} + +//////////////////////////////// + +internal U64 +lnk_get_base_addr(LNK_Config *config) +{ + U64 base_addr = config->user_base_addr; + if 
(base_addr == 0) { + if (config->file_characteristics & PE_ImageFileCharacteristic_FILE_DLL) { + base_addr = coff_default_dll_base_from_machine(config->machine); + } else if (config->file_characteristics & PE_ImageFileCharacteristic_EXE) { + base_addr = coff_default_exe_base_from_machine(config->machine); + } else { + lnk_error(LNK_Error_Cmdl, "image type is not specified."); + } + } + return base_addr; +} + +internal Version +lnk_get_default_subsystem_version(PE_WindowsSubsystem subsystem, COFF_MachineType machine) +{ + Version ver = make_version(0,0); + switch (subsystem) { + case PE_WindowsSubsystem_WINDOWS_BOOT_APPLICATION: { + ver = make_version(1,0); + } break; + + case PE_WindowsSubsystem_WINDOWS_CUI: { + switch (machine) { + case COFF_MachineType_X64: + case COFF_MachineType_X86: { + ver = make_version(6,0); + } break; + case COFF_MachineType_ARMNT: + case COFF_MachineType_ARM64: + case COFF_MachineType_ARM: { + ver = make_version(6,2); + } break; + default: { InvalidPath; } break; + } + } break; + + case PE_WindowsSubsystem_WINDOWS_GUI: { + switch (machine) { + case COFF_MachineType_X64: + case COFF_MachineType_X86: { + ver = make_version(6,0); + } break; + case COFF_MachineType_ARMNT: + case COFF_MachineType_ARM64: + case COFF_MachineType_ARM: { + ver = make_version(6,2); + } break; + default: { InvalidPath; } break; + } + } break; + + case PE_WindowsSubsystem_POSIX_CUI: { + ver = make_version(19,90); + } break; + + case PE_WindowsSubsystem_EFI_APPLICATION: + case PE_WindowsSubsystem_EFI_BOOT_SERVICE_DRIVER: + case PE_WindowsSubsystem_EFI_ROM: + case PE_WindowsSubsystem_EFI_RUNTIME_DRIVER: { + ver = make_version(1,0); + } break; + + case PE_WindowsSubsystem_NATIVE_WINDOWS: + case PE_WindowsSubsystem_NATIVE: { + Assert(!"TODO: detect -drive=WDM switch"); + } break; + } + return ver; +} + +internal Version +lnk_get_min_subsystem_version(PE_WindowsSubsystem subsystem, COFF_MachineType machine) +{ + Version ver = make_version(0,0); + switch (subsystem) { + 
case PE_WindowsSubsystem_WINDOWS_BOOT_APPLICATION: { + ver = make_version(1,0); + } break; + + case PE_WindowsSubsystem_WINDOWS_CUI: { + switch (machine) { + case COFF_MachineType_X86: { + ver = make_version(5,1); + } break; + + case COFF_MachineType_X64: { + ver = make_version(5,2); + } break; + + case COFF_MachineType_ARMNT: + case COFF_MachineType_ARM64: + case COFF_MachineType_ARM: { + ver = make_version(6,2); + } break; + + default: InvalidPath; break; + } + } break; + + case PE_WindowsSubsystem_WINDOWS_GUI: { + switch (machine) { + case COFF_MachineType_X86: { + ver = make_version(5,1); + } break; + + case COFF_MachineType_X64: { + ver = make_version(5,2); + } break; + + case COFF_MachineType_ARMNT: + case COFF_MachineType_ARM64: + case COFF_MachineType_ARM: { + ver = make_version(6,2); + } break; + + default: InvalidPath; break; + } + } break; + + case PE_WindowsSubsystem_POSIX_CUI: { + ver = make_version(1,0); + } break; + + case PE_WindowsSubsystem_EFI_APPLICATION: + case PE_WindowsSubsystem_EFI_BOOT_SERVICE_DRIVER: + case PE_WindowsSubsystem_EFI_ROM: + case PE_WindowsSubsystem_EFI_RUNTIME_DRIVER: { + ver = make_version(1,0); + } break; + + case PE_WindowsSubsystem_NATIVE_WINDOWS: + case PE_WindowsSubsystem_NATIVE: { + Assert(!"TODO: detect -drive=WDM switch"); + } break; + } + return ver; +} + +internal String8 +lnk_get_mt_path(Arena *arena) +{ +#if OS_WINDOWS +#undef OS_WINDOWS +#pragma comment(lib, "shlwapi.lib") +#include + + String8 mt_path = str8(0,0); + local_persist wchar_t raw_mt_path[MAX_PATH + 1] = L"mt.exe"; + B32 is_mt_found = PathFindOnPathW(&raw_mt_path[0], 0); + if (is_mt_found) { + String16 mt_path_16 = str16_cstring(&raw_mt_path[0]); + mt_path = str8_from_16(arena, mt_path_16); + mt_path = path_convert_slashes(arena, mt_path, PathStyle_WindowsAbsolute); + } else { + lnk_error(LNK_Error_Cmdl, "mt.exe not found, please specify path with /RAD_MT_PATH or run vcvarsall.bat"); + } + +#undef OS_WINDOWS +#define OS_WINDOWS 1 +#else +#error "TODO: 
find llvm-mt" +#endif + return mt_path; +} + +internal B32 +lnk_do_debug_info(LNK_Config *config) +{ + B32 do_debug_info = config->rad_debug == LNK_SwitchState_Yes || + (config->debug_mode != LNK_DebugMode_None && config->debug_mode != LNK_DebugMode_Null); + return do_debug_info; +} + +//////////////////////////////// + +internal B32 +lnk_cmd_switch_parse_version(String8List value_strings, LNK_CmdSwitchType cmd_switch, Version *ver_out) +{ + Temp scratch = scratch_begin(0,0); + B32 is_parsed = 0; + + if (value_strings.node_count == 1) { + String8List split_list = str8_split_by_string_chars(scratch.arena, value_strings.first->string, str8_lit("."), StringSplitFlag_KeepEmpties); + + String8 maj_str = str8_lit("0"); + String8 min_str = str8_lit("0"); + if (split_list.node_count == 1) { + maj_str = split_list.first->string; + } else if (split_list.node_count == 2) { + maj_str = split_list.first->string; + min_str = split_list.last->string; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid version format, too many dots, expected format: {N[.N]}"); + goto exit; + } + + U64 maj, min; + if (try_u64_from_str8_c_rules(maj_str, &maj)) { + if (try_u64_from_str8_c_rules(min_str, &min)) { + *ver_out = make_version(maj, min); + is_parsed = 1; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse minor version"); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse major version"); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + +exit:; + scratch_end(scratch); + return is_parsed; +} + +internal B32 +lnk_cmd_switch_parse_tuple(String8List value_strings, LNK_CmdSwitchType cmd_switch, Rng1U64 *tuple_out) +{ + if (value_strings.node_count == 1) { + U64 value; + if (try_u64_from_str8_c_rules(value_strings.first->string, &value)) { + tuple_out->v[0] = value; + return 1; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse the 
parameter \"%S\"", value_strings.first->string); + } + } else if (value_strings.node_count == 2) { + U64 a,b; + if (try_u64_from_str8_c_rules(value_strings.first->string, &a)) { + if (try_u64_from_str8_c_rules(value_strings.last->string, &b)) { + tuple_out->v[0] = a; + tuple_out->v[1] = b; + return 1; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable ot parse second parameter \"%S\"", value_strings.last->string); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse first parameter \"%S\"", value_strings.first->string); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + return 0; +} + +internal B32 +lnk_try_parse_u64(String8 string, LNK_ParseU64Flags flags, U64 *value_out) +{ + if (try_u64_from_str8_c_rules(string, value_out)) { + if (flags & LNK_ParseU64Flag_CheckUnder32bit) { + if (*value_out > max_U32) { + return 0; + } + } + + if (flags & LNK_ParseU64Flag_CheckPow2) { + if (!IsPow2(*value_out)) { + return 0; + } + } + } + + return 1; +} + +internal B32 +lnk_cmd_switch_parse_u64(String8List value_strings, LNK_CmdSwitchType cmd_switch, U64 *value_out, LNK_ParseU64Flags flags) +{ + if (value_strings.node_count != 1) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters, exepcted integer number as input"); + return 0; + } + if (!lnk_try_parse_u64(value_strings.first->string, flags, value_out)) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse string \"%S\"", value_strings.first->string); + return 0; + } + return 1; +} + +internal B32 +lnk_cmd_switch_parse_u32(String8List value_strings, LNK_CmdSwitchType cmd_switch, U32 *value_out, LNK_ParseU64Flags flags) +{ + U64 value; + if (lnk_cmd_switch_parse_u64(value_strings, cmd_switch, &value, flags | LNK_ParseU64Flag_CheckUnder32bit)) { + *value_out = (U32)value; + return 1; + } + return 0; +} + +internal B32 +lnk_cmd_switch_parse_u64_list(Arena *arena, String8List 
value_strings, LNK_CmdSwitchType cmd_switch, U64List *list_out, LNK_ParseU64Flags flags) +{ + for (String8Node *string_n = value_strings.first; string_n != 0; string_n = string_n->next) { + U64 value; + if (!lnk_try_parse_u64(string_n->string, flags, &value)) { + return 0; + } + u64_list_push(arena, list_out, value); + } + return 1; +} + +internal B32 +lnk_cmd_switch_parse_flag(String8List value_strings, LNK_CmdSwitchType cmd_switch, LNK_SwitchState *value_out) +{ + B32 is_parsed = 0; + if (value_strings.node_count > 1) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "too many parameters"); + } else if (value_strings.node_count == 1) { + if (str8_match(value_strings.first->string, str8_lit("no"), StringMatchFlag_CaseInsensitive)) { + *value_out = LNK_SwitchState_No; + is_parsed = 1; + } else if (str8_match(value_strings.first->string, str8_lit("yes"), StringMatchFlag_CaseInsensitive)) { + *value_out = LNK_SwitchState_Yes; + is_parsed = 1; + } else if (value_strings.first->string.size == 0) { + *value_out = 1; + is_parsed = 1; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid parameter \"%S\"", value_strings.first->string); + } + } else { + *value_out = LNK_SwitchState_Yes; + is_parsed = 1; + } + return is_parsed; +} + +internal void +lnk_cmd_switch_set_flag_inv_16(String8List value_strings, LNK_CmdSwitchType cmd_switch, U16 *flags, U16 bits) +{ + LNK_SwitchState state; + if (lnk_cmd_switch_parse_flag(value_strings, cmd_switch, &state)) { + switch (state) { + case LNK_SwitchState_Null: break; + case LNK_SwitchState_Yes : *flags |= bits; break; + case LNK_SwitchState_No : *flags &= ~bits; break; + } + } +} + +internal void +lnk_cmd_switch_set_flag_inv_64(String8List value_strings, LNK_CmdSwitchType cmd_switch, U64 *flags, U64 bits) +{ + LNK_SwitchState state; + if (lnk_cmd_switch_parse_flag(value_strings, cmd_switch, &state)) { + switch (state) { + case LNK_SwitchState_Null: break; + case LNK_SwitchState_Yes : *flags |= bits; break; + case 
LNK_SwitchState_No : *flags &= ~bits; break; + } + } +} + +internal void +lnk_cmd_switch_set_flag_16(String8List value_strings, LNK_CmdSwitchType cmd_switch, U16 *flags, U16 bits) +{ + LNK_SwitchState state; + if (lnk_cmd_switch_parse_flag(value_strings, cmd_switch, &state)) { + switch (state) { + case LNK_SwitchState_Null: break; + case LNK_SwitchState_Yes : *flags |= bits; break; + case LNK_SwitchState_No : *flags &= ~bits; break; + } + } +} + +internal void +lnk_cmd_switch_set_flag_32(String8List value_strings, LNK_CmdSwitchType cmd_switch, U32 *flags, U32 bits) +{ + LNK_SwitchState state; + if (lnk_cmd_switch_parse_flag(value_strings, cmd_switch, &state)) { + switch (state) { + case LNK_SwitchState_Null: break; + case LNK_SwitchState_Yes : *flags |= bits; break; + case LNK_SwitchState_No : *flags &= ~bits; break; + } + } +} + +internal void +lnk_cmd_switch_set_flag_64(String8List value_strings, LNK_CmdSwitchType cmd_switch, U64 *flags, U64 bits) +{ + LNK_SwitchState state; + if (lnk_cmd_switch_parse_flag(value_strings, cmd_switch, &state)) { + switch (state) { + case LNK_SwitchState_Null: break; + case LNK_SwitchState_Yes : *flags |= bits; break; + case LNK_SwitchState_No : *flags &= ~bits; break; + } + } +} + +internal B32 +lnk_cmd_switch_parse_string(String8List value_strings, LNK_CmdSwitchType cmd_switch, String8 *string_out) +{ + if (value_strings.node_count == 1) { + if (value_strings.first->string.size > 0) { + *string_out = value_strings.first->string; + return 1; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "empty string is not permitted"); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + return 0; +} + +internal void +lnk_cmd_switch_parse_string_copy(Arena *arena, String8List value_strings, LNK_CmdSwitchType cmd_switch, String8 *string_out) +{ + if (lnk_cmd_switch_parse_string(value_strings, cmd_switch, string_out)) { + *string_out = push_str8_copy(arena, *string_out); + } +} + 
+//////////////////////////////// + +internal B32 +lnk_parse_alt_name_directive(Arena *arena, String8 input, LNK_AltNameList *list_out) +{ + Temp scratch = scratch_begin(&arena, 1); + B32 is_parse_ok = 0; + String8List pair = str8_split_by_string_chars(scratch.arena, input, str8_lit("="), 0); + if (pair.node_count == 2) { + str8_list_push(arena, &list_out->from_list, pair.first->string); + str8_list_push(arena, &list_out->to_list, pair.last->string); + is_parse_ok = 1; + } + scratch_end(scratch); + return is_parse_ok; +} + +internal String8 * +lnk_parse_alt_name_directive_list(Arena *arena, String8List list, LNK_AltNameList *list_out) +{ + for (String8Node *str_n = list.first; str_n != 0; str_n = str_n->next) { + B32 is_parse_ok = lnk_parse_alt_name_directive(arena, str_n->string, list_out); + if ( ! is_parse_ok) { + return &str_n->string; + } + } + return 0; +} + +//////////////////////////////// + +internal void +lnk_print_build_info() +{ + fprintf(stdout, " Compiler: %s\n", COMPILER_STRING); + fprintf(stdout, " Mode : %s\n", BUILD_MODE_STRING); + fprintf(stdout, " Date : %s %s\n", __TIME__, __DATE__); + fprintf(stdout, " Version : %s\n", BUILD_VERSION_STRING); +} + +internal void +lnk_print_help(void) +{ + Temp scratch = scratch_begin(0,0); + + fprintf(stdout, "--- Help -------------------------------------------------------\n"); + fprintf(stdout, " %s\n", BUILD_VERSION_STRING); + fprintf(stdout, "\n"); + fprintf(stdout, " Usage: rad-link.exe [Options] [Files] [@rsp]\n"); + fprintf(stdout, "\n"); + + fprintf(stdout, " Options:\n"); + for (U64 i = 0; i < ArrayCount(g_cmd_switch_map); ++i) { + Temp temp = temp_begin(scratch.arena); + + char *name = g_cmd_switch_map[i].name; + char *args = g_cmd_switch_map[i].args; + char *desc = g_cmd_switch_map[i].desc; + LNK_CmdSwitchType type = g_cmd_switch_map[i].type; + + if (strcmp(name, "") == 0 || + strcmp(name, "NOT_IMPLEMENTED") == 0 || + type == LNK_CmdSwitch_Help) { + continue; + } + + String8 name_args = 
push_str8f(temp.arena, "%s%s", name, args); + + fprintf(stdout, " /%-32.*s %s%s\n", + str8_varg(name_args), + desc, + type == LNK_CmdSwitch_NotImplemented ? "Not Implemented" : ""); + + temp_end(temp); + } + + fprintf(stdout, "\n"); + + scratch_end(scratch); +} + +//////////////////////////////// + +internal LNK_Config * +lnk_config_from_cmd_line(Arena *arena, String8List raw_cmd_line) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + // parse command line + String8List unwrapped_cmd_line = lnk_unwrap_rsp(scratch.arena, raw_cmd_line); + LNK_CmdLine cmd_line = lnk_cmd_line_parse_windows_rules(scratch.arena, unwrapped_cmd_line); + + // setup default flags + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Align, "%u", KB(4)); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Debug, "none"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_FileAlign, "%u", 512); + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Fixed)) { + lnk_cmd_line_push_optionf(scratch.arena, &cmd_line, LNK_CmdSwitch_DynamicBase, ""); + } + if (lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Dll)) { + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_SubSystem, "%S", pe_string_from_subsystem(PE_WindowsSubsystem_WINDOWS_GUI)); + } + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Heap, "%u,%u", MB(1), KB(4)); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_HighEntropyVa, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_ManifestUac, "\"level='asInvoker' uiAccess='false'\""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_NxCompat, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_LargeAddressAware, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, 
&cmd_line, LNK_CmdSwitch_PdbPageSize, "%u", KB(4)); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Stack, "%u,%u", MB(1), KB(1)); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_TimeStamp, "%u", os_get_process_start_time_unix()); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_Age, "%u", 1); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_CheckUnusedDelayLoadDll, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_DelayBind, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_DoMerge, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_EnvLib, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_Exe, ""); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_Guid, "imageblake3"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_LargePages, "no"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_LinkVer, "14.0"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_OsVer, "6.0"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_PageSize, "%u", KB(4)); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_PathStyle, "system"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_SectVirtOff, "0x1000"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_Workers, "%u", os_get_system_info()->logical_processor_count); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Manifest, "embed"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, 
LNK_CmdSwitch_Rad_TargetOs, "windows"); + //lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_Log, "debug"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_SymbolTableCapDefined, "0x3ffff"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_SymbolTableCapInternal, "0x1000"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_SymbolTableCapWeak, "0x3ffff"); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_SymbolTableCapLib, "0x3ffff"); + +#if !BUILD_DEBUG + //lnk_cmd_line_push_optionf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_SuppressError, "37"); +#endif + + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Rad_MtPath)) { + String8 mt_path = lnk_get_mt_path(scratch.arena); + lnk_cmd_line_push_option_if_not_presentf(scratch.arena, &cmd_line, LNK_CmdSwitch_Rad_MtPath, "%S", mt_path); + } + + LNK_Config *config = push_array(arena, LNK_Config, 1); + config->raw_cmd_line = raw_cmd_line; + config->work_dir = os_get_current_path(scratch.arena); + + // process command line switches + for (LNK_CmdOption *cmd = cmd_line.first_option; cmd != 0; cmd = cmd->next) { + LNK_CmdSwitchType cmd_switch = lnk_cmd_switch_type_from_string(cmd->string); + switch (cmd_switch) { + case LNK_CmdSwitch_Null: { + String8 value = str8_list_join(scratch.arena, &cmd->value_strings, &(StringJoin){.sep=str8_lit_comp(",")}); + lnk_error(LNK_Warning_UnknownSwitch, "unknown switch: \"/%S%s%S\"", cmd->string, value.size ? 
":" : "", value); + } break; + + default: { InvalidPath; } break; + + case LNK_CmdSwitch_NotImplemented: { + String8 value = str8_list_join(scratch.arena, &cmd->value_strings, &(StringJoin){.sep=str8_lit_comp(",")}); + lnk_not_implemented("switch \"%S\" is not implemented \"%S\"", cmd->string, value); + } break; + + case LNK_CmdSwitch_Align: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &config->sect_align, LNK_ParseU64Flag_CheckPow2); + } break; + + case LNK_CmdSwitch_AllowBind: { + lnk_cmd_switch_set_flag_inv_16(cmd->value_strings, cmd_switch, &config->dll_characteristics, PE_DllCharacteristic_NO_BIND); + } break; + + case LNK_CmdSwitch_AllowIsolation: { + lnk_cmd_switch_set_flag_inv_16(cmd->value_strings, cmd_switch, &config->dll_characteristics, PE_DllCharacteristic_NO_ISOLATION); + } break; + + case LNK_CmdSwitch_AlternateName: { + String8 *error_string = lnk_parse_alt_name_directive_list(arena, cmd->value_strings, &config->alt_name_list); + if (error_string != 0) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid syntax \"%S\", expected format \"FROM=TO\"", *error_string); + } + } break; + + case LNK_CmdSwitch_AppContainer: { + lnk_cmd_switch_set_flag_16(cmd->value_strings, cmd_switch, &config->dll_characteristics, PE_DllCharacteristic_APPCONTAINER); + } break; + + case LNK_CmdSwitch_Base: { + if (cmd->value_strings.node_count == 2) { + String8Node *first_node = cmd->value_strings.first; + //String8Node *second_node = first_node->next; + B32 is_response_file = str8_match(str8_lit("@"), first_node->string, StringMatchFlag_RightSideSloppy); + if (is_response_file) { + //String8 file_path = first_node->string; + //String8 tag = second_node->string; + lnk_not_implemented("Response files are not implemented for /BASE"); + } else { + Rng1U64 addr_size = {0}; + if (lnk_cmd_switch_parse_tuple(cmd->value_strings, cmd_switch, &addr_size)) { + config->user_base_addr = addr_size.v[0]; + config->max_image_size = addr_size.v[1]; + } + } + } else 
if (cmd->value_strings.node_count == 1) { + U64 addr; + if (lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &addr, 0)) { + config->user_base_addr = addr; + } + } else if (cmd->value_strings.node_count == 0) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "expected at least 1 parameter"); + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "too many parameters"); + } + } break; + + case LNK_CmdSwitch_Debug: { + if (cmd->value_strings.node_count == 0) { + config->debug_mode = LNK_DebugMode_Full; + } else if (cmd->value_strings.node_count == 1) { + LNK_DebugMode debug_mode = lnk_debug_mode_from_string(cmd->value_strings.first->string); + if (debug_mode == LNK_DebugMode_GHash) { + config->debug_mode = LNK_DebugMode_Full; + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "GHASH is not supported, switching to FULL"); + } else if (debug_mode != LNK_DebugMode_Null) { + config->debug_mode = debug_mode; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid parameter \"%S\"", cmd->value_strings.first->string); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + } break; + + case LNK_CmdSwitch_DefaultLib: { + String8List default_lib_list = str8_list_copy(arena, &cmd->value_strings); + str8_list_concat_in_place(&config->input_default_lib_list, &default_lib_list); + } break; + + case LNK_CmdSwitch_Delay: { + if (cmd->value_strings.node_count == 0 || cmd->value_strings.node_count > 1) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } else { + String8 value = cmd->value_strings.first->string; + if (str8_match(value, str8_lit("unload"), StringMatchFlag_CaseInsensitive)) { + config->flags |= LNK_ConfigFlag_DelayUnload; + } else if (str8_match(value, str8_lit("nobind"), StringMatchFlag_CaseInsensitive)) { + config->flags &= ~LNK_ConfigFlag_DelayBind; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unknown parameter \"%S\"", value); + 
} + } + } break; + + case LNK_CmdSwitch_DelayLoad: { + String8List delay_load_dll_list = str8_list_copy(arena, &cmd->value_strings); + str8_list_concat_in_place(&config->delay_load_dll_list, &delay_load_dll_list); + } break; + + case LNK_CmdSwitch_Dll: { + config->file_characteristics |= PE_ImageFileCharacteristic_FILE_DLL; + } break; + + case LNK_CmdSwitch_DisallowLib: { + lnk_not_implemented("TODO: how is this switch different from /nodefaultlib?"); + } break; + + case LNK_CmdSwitch_DynamicBase: { + lnk_cmd_switch_set_flag_16(cmd->value_strings, cmd_switch, &config->dll_characteristics, PE_DllCharacteristic_DYNAMIC_BASE); + } break; + + case LNK_CmdSwitch_Entry: { + lnk_cmd_switch_parse_string_copy(arena, cmd->value_strings, cmd_switch, &config->user_entry_point_name); + config->entry_point_name = config->user_entry_point_name; + } break; + + case LNK_CmdSwitch_FastFail: { + // do nothing + } break; + + case LNK_CmdSwitch_FileAlign: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &config->file_align, LNK_ParseU64Flag_CheckPow2); + } break; + + case LNK_CmdSwitch_Fixed: { + lnk_cmd_switch_set_flag_64(cmd->value_strings, cmd_switch, &config->flags, LNK_ConfigFlag_Fixed); + } break; + + case LNK_CmdSwitch_FunctionPadMin: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &config->function_pad_min, LNK_ParseU64Flag_CheckUnder32bit); + } break; + + case LNK_CmdSwitch_Heap: { + Rng1U64 reserve_commit; + reserve_commit.v[0] = config->heap_reserve; + reserve_commit.v[1] = config->heap_commit; + if (lnk_cmd_switch_parse_tuple(cmd->value_strings, cmd_switch, &reserve_commit)) { + if (reserve_commit.v[0] >= reserve_commit.v[1]) { + U64 reserve_aligned = AlignPow2(reserve_commit.v[0], 4); + U64 commit_aligned = AlignPow2(reserve_commit.v[1], 4); +#if 0 + if (reserve_aligned != reserve_commit.v[0]) { + lnk_error_cmd_switch(LNK_WARNING_CMDL, cmd_switch, "reserve is not power of two, aligned to %u bytes", reserve_aligned); + } + if (commit_aligned != 
reserve_commit.v[1]) { + lnk_error_cmd_switch(LNK_WARNING_CMDL, cmd_switch, "commit is not power of two, aligned to %u bytes", commit_aligned); + } +#endif + config->heap_reserve = reserve_aligned; + config->heap_commit = commit_aligned; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "commit(%llu) is greater than reserve(%llu)", reserve_commit.v[1], reserve_commit.v[0]); + } + } + } break; + + case LNK_CmdSwitch_HighEntropyVa: { + lnk_cmd_switch_set_flag_16(cmd->value_strings, cmd_switch, &config->dll_characteristics, PE_DllCharacteristic_HIGH_ENTROPY_VA); + } break; + + case LNK_CmdSwitch_Ignore: { + U64 error_code; + if (lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &error_code, 0)) { + switch (error_code) { + case LNK_MsWarningCode_UnsuedDelayLoadDll: { + lnk_suppress_error(LNK_Warning_UnusedDelayLoadDll); + } break; + case LNK_MsWarningCode_MissingExternalTypeServer: { + lnk_suppress_error(LNK_Warning_MissingExternalTypeServer); + } break; + case LNK_MsWarningCode_SectionFlagsConflict: { + lnk_suppress_error(LNK_Warning_SectionFlagsConflict); + } break; + default: { + lnk_not_implemented("TODO: /IGNORE:%llu", error_code); + } break; + } + } + } break; + + case LNK_CmdSwitch_ImpLib: { + lnk_cmd_switch_parse_string_copy(arena, cmd->value_strings, cmd_switch, &config->imp_lib_name); + } break; + + case LNK_CmdSwitch_Include: { + String8List include_symbol_list = str8_list_copy(arena, &cmd->value_strings); + str8_list_concat_in_place(&config->include_symbol_list, &include_symbol_list); + } break; + + case LNK_CmdSwitch_Incremental: { + LNK_SwitchState state; + if (lnk_cmd_switch_parse_flag(cmd->value_strings, cmd_switch, &state)) { + if (state == LNK_SwitchState_Yes) { + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "incremental linkage is not supported"); + } + } + } break; + + case LNK_CmdSwitch_LargeAddressAware: { + lnk_cmd_switch_set_flag_16(cmd->value_strings, cmd_switch, &config->file_characteristics, 
PE_ImageFileCharacteristic_LARGE_ADDRESS_AWARE); + } break; + + case LNK_CmdSwitch_LibPath: { + String8List lib_dir_list = str8_list_copy(arena, &cmd->value_strings); + for (String8Node *dir_n = lib_dir_list.first; dir_n != 0; dir_n = dir_n->next) { + if (!os_folder_path_exists(dir_n->string)) { + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "path doesn't exist %S", dir_n->string); + } + } + str8_list_concat_in_place(&config->lib_dir_list, &lib_dir_list); + } break; + + case LNK_CmdSwitch_Machine: { + if (cmd->value_strings.node_count == 1) { + COFF_MachineType machine = coff_machine_from_string(cmd->value_strings.first->string); + if (machine != COFF_MachineType_UNKNOWN) { + config->machine = machine; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unknown parameter \"%S\"", cmd->value_strings.first->string); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + } break; + + case LNK_CmdSwitch_Manifest: { + if (cmd->value_strings.node_count == 1) { + String8List param_list = str8_split_by_string_chars(scratch.arena, cmd->value_strings.first->string, str8_lit(","), 0); + String8Array param_arr = str8_array_from_list(scratch.arena, ¶m_list); + if (param_arr.count > 0) { + if (str8_match(param_arr.v[0], str8_lit("embed"), StringMatchFlag_CaseInsensitive)) { + config->manifest_opt = LNK_ManifestOpt_Embed; + if (config->delete_manifest == LNK_SwitchState_Null) { + config->delete_manifest = 1; + } + + if (param_arr.count == 1) { + if (lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Dll)) { + config->manifest_resource_id = 2; + } else { + config->manifest_resource_id = 1; + } + } else if (param_arr.count > 1) { + // parse resource id + if (str8_match(param_arr.v[1], str8_lit("id="), StringMatchFlag_RightSideSloppy|StringMatchFlag_CaseInsensitive)) { + String8List res_id_list = str8_split_by_string_chars(scratch.arena, param_arr.v[1], str8_lit("="), 0); + String8Array res_id_arr = 
str8_array_from_list(scratch.arena, &res_id_list); + if (res_id_arr.count == 2) { + U64 resource_id; + if (try_u64_from_str8_c_rules(res_id_arr.v[1], &resource_id)) { + config->manifest_resource_id = resource_id; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse resource_id \"%S\"", res_id_arr.v[1]); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid syntax expected form ID=resource_id but got \"%S\"", param_arr.v[1]); + } + } else { + lnk_error_cmd_switch_invalid_param(LNK_Error_Cmdl, cmd_switch, param_arr.v[0]); + } + } else { + lnk_error_cmd_switch_invalid_param_count(LNK_Error_Cmdl, cmd_switch); + } + } else if (str8_match(param_arr.v[0], str8_lit("no"), StringMatchFlag_CaseInsensitive)) { + config->manifest_opt = LNK_ManifestOpt_No; + } else { + lnk_error_cmd_switch_invalid_param(LNK_Error_Cmdl, cmd_switch, param_arr.v[0]); + } + } else { + lnk_error_cmd_switch_invalid_param_count(LNK_Error_Cmdl, cmd_switch); + } + } else if (cmd->value_strings.node_count == 0) { + config->manifest_opt = LNK_ManifestOpt_Embed; + } + } break; + + case LNK_CmdSwitch_ManifestDependency: { + String8List manifest_dependency_list = str8_list_copy(arena, &cmd->value_strings); + str8_list_concat_in_place(&config->manifest_dependency_list, &manifest_dependency_list); + } break; + + case LNK_CmdSwitch_ManifestFile: { + lnk_cmd_switch_parse_string_copy(arena, cmd->value_strings, cmd_switch, &config->manifest_name); + } break; + + case LNK_CmdSwitch_ManifestInput: { + // see :manifest_input + } break; + + case LNK_CmdSwitch_ManifestUac: { + if (cmd->value_strings.node_count == 1) { + String8 uac = lnk_error_check_and_strip_quotes(LNK_Error_Cmdl, cmd_switch, cmd->value_strings.first->string); + String8List param_list = str8_split_by_string_chars(scratch.arena, uac, str8_lit(" "), 0); + String8Array param_arr = str8_array_from_list(scratch.arena, ¶m_list); + if (param_arr.count > 0) { + if (str8_match(str8_lit("level="), param_arr.v[0], 
StringMatchFlag_RightSideSloppy|StringMatchFlag_CaseInsensitive)) { + String8 level_param = param_arr.v[0]; + String8List level_list = str8_split_by_string_chars(scratch.arena, level_param, str8_lit("="), 0); + if (level_list.node_count == 2) { + if (str8_match(level_list.first->string, str8_lit("level"), StringMatchFlag_CaseInsensitive)) { + String8 level = level_list.last->string; + if (str8_match(level, str8_lit("'asInvoker'"), 0) || + str8_match(level, str8_lit("'highestAvailable'"), 0) || + str8_match(level, str8_lit("'requireAdministrator'"), 0)) { + // manifest level was parsed! + config->manifest_uac = 1; + config->manifest_level = push_str8_copy(arena, level); + if (param_arr.count > 1) { + String8 ui_access_param = param_arr.v[1]; + String8List ui_access_list = str8_split_by_string_chars(scratch.arena, ui_access_param, str8_lit("="), 0); + if (ui_access_list.node_count == 2) { + String8 ui_access = ui_access_list.last->string; + if (str8_match(ui_access, str8_lit("'true'"), 0) || + str8_match(ui_access, str8_lit("'false'"), 0)) { + // ui access was parsed! 
+ config->manifest_ui_access = push_str8_copy(arena, ui_access); + } else { + lnk_error_invalid_uac_ui_access_param(LNK_Error_Cmdl, cmd_switch, ui_access_param); + } + } else { + lnk_error_invalid_uac_ui_access_param(LNK_Error_Cmdl, cmd_switch, ui_access_param); + } + } + } else { + lnk_error_invalid_uac_level_param(LNK_Error_Cmdl, cmd_switch, level_param); + } + } else { + lnk_error_invalid_uac_level_param(LNK_Error_Cmdl, cmd_switch, level_param); + } + } else { + lnk_error_invalid_uac_level_param(LNK_Error_Cmdl, cmd_switch, level_param); + } + } else if (str8_match(str8_lit("no"), param_arr.v[0], StringMatchFlag_CaseInsensitive)) { + config->manifest_uac = 0; + } else { + lnk_error_cmd_switch_invalid_param(LNK_Error_Cmdl, cmd_switch, param_arr.v[0]); + } + } else { + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "empty param string"); + } + } else { + lnk_error_cmd_switch_invalid_param_count(LNK_Error_Cmdl, cmd_switch); + } + } break; + + case LNK_CmdSwitch_Natvis: { + // warn about invalid natvis extension + for (String8Node *node = cmd->value_strings.first; node != 0; node = node->next) { + String8 ext = str8_skip_last_dot(node->string); + if (!str8_match(ext, str8_lit("natvis"), StringMatchFlag_CaseInsensitive)) { + lnk_error_cmd_switch(LNK_Warning_InvalidNatvisFileExt, cmd_switch, "Visual Studio expects .natvis extension: \"%S\"", node->string); + } + } + + String8List natvis_list = str8_list_copy(arena, &cmd->value_strings); + str8_list_concat_in_place(&config->natvis_list, &natvis_list); + } break; + + case LNK_CmdSwitch_NoDefaultLib: { + String8List no_default_lib_list = str8_list_copy(arena, &cmd->value_strings); + str8_list_concat_in_place(&config->disallow_lib_list, &no_default_lib_list); + } break; + + case LNK_CmdSwitch_NoLogo: { + // we don't print logo + } break; + + case LNK_CmdSwitch_NxCompat: { + lnk_cmd_switch_set_flag_16(cmd->value_strings, cmd_switch, &config->dll_characteristics, PE_DllCharacteristic_NX_COMPAT); + } break; + + case 
LNK_CmdSwitch_Opt: { + for (String8Node *n = cmd->value_strings.first; n != 0; n = n->next) { + String8 param = n->string; + if (str8_match(param, str8_lit("ref"), StringMatchFlag_CaseInsensitive)) { + config->opt_ref = LNK_SwitchState_Yes; + } else if (str8_match(param, str8_lit("noref"), StringMatchFlag_CaseInsensitive)) { + config->opt_ref = LNK_SwitchState_No; + } else if (str8_match(param, str8_lit("icf"), StringMatchFlag_CaseInsensitive) || + str8_match(param, str8_lit("icf="), StringMatchFlag_CaseInsensitive | StringMatchFlag_RightSideSloppy)) { + String8List vals = str8_split_by_string_chars(scratch.arena, param, str8_lit("="), 0); + if (vals.node_count > 2) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "too many parameters for iteration"); + continue; + } + if (vals.node_count == 2) { + B32 is_parsed = try_u64_from_str8_c_rules(vals.last->string, &config->opt_iter_count); + if (!is_parsed) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse iterations \"%S\"", vals.last->string); + continue; + } + } + config->opt_icf = LNK_SwitchState_Yes; + } else if (str8_match(param, str8_lit("noicf"), StringMatchFlag_CaseInsensitive)) { + config->opt_icf = LNK_SwitchState_No; + } else if (str8_match(param, str8_lit("lbr"), StringMatchFlag_CaseInsensitive)) { + config->opt_lbr = LNK_SwitchState_Yes; + } else if (str8_match(param, str8_lit("nolibr"), StringMatchFlag_CaseInsensitive)) { + config->opt_lbr = LNK_SwitchState_No; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unknown option \"%S\"", param); + } + } + } break; + + case LNK_CmdSwitch_Out: { + lnk_cmd_switch_parse_string_copy(arena, cmd->value_strings, cmd_switch, &config->image_name); + } break; + + case LNK_CmdSwitch_Pdb: { + lnk_cmd_switch_parse_string_copy(arena, cmd->value_strings, cmd_switch, &config->pdb_name); + } break; + + case LNK_CmdSwitch_PdbPageSize: { + U64 page_size; + if (lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &page_size, 
LNK_ParseU64Flag_CheckPow2)) { + if (page_size >= MSF_MIN_PAGE_SIZE) { + if (page_size < MSF_MAX_PAGE_SIZE) { + config->pdb_page_size = page_size; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "page size must be <= %u bytes", MSF_MAX_PAGE_SIZE); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "page size must be >= %u bytes", MSF_MIN_PAGE_SIZE); + } + } + } break; + + case LNK_CmdSwitch_Release: { + if (cmd->value_strings.node_count == 0) { + config->flags |= LNK_ConfigFlag_WriteImageChecksum; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + } break; + + case LNK_CmdSwitch_Stack: { + Rng1U64 reserve_commit; + reserve_commit.v[0] = config->stack_reserve; + reserve_commit.v[1] = config->stack_commit; + if (lnk_cmd_switch_parse_tuple(cmd->value_strings, cmd_switch, &reserve_commit)) { + if (reserve_commit.v[0] >= reserve_commit.v[1]) { + U64 reserve_aligned = AlignPow2(reserve_commit.v[0], 4); + U64 commit_aligned = AlignPow2(reserve_commit.v[1], 4); +#if 0 + if (reserve_aligned != reserve_commit.v[0]) { + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "reserve is not power of two, aligned to %u", reserve_aligned); + } + if (commit_aligned != reserve_commit.v[1]) { + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "commit is not power of two, aligned to %u", commit_aligned); + } +#endif + config->stack_reserve = reserve_aligned; + config->stack_commit = commit_aligned; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "commit(%llu) is greater than reserve(%llu)", reserve_commit.v[1], reserve_commit.v[0]); + } + } + } break; + + case LNK_CmdSwitch_SubSystem: { + if (cmd->value_strings.node_count <= 2 && cmd->value_strings.node_count > 0) { + // set subsystem type + PE_WindowsSubsystem subsystem = pe_subsystem_from_string(cmd->value_strings.first->string); + if (subsystem != PE_WindowsSubsystem_UNKNOWN) { + config->subsystem = subsystem; + + // parse version 
(optional) + if (cmd->value_strings.node_count == 2) { + String8List value_strings = cmd->value_strings; + str8_list_pop_front(&value_strings); // pop subsystem parameter + lnk_cmd_switch_parse_version(value_strings, cmd_switch, &config->subsystem_ver); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid subsystem \"%S\"", cmd->value_strings.first->string); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + } break; + + case LNK_CmdSwitch_Time: { + } break; + + case LNK_CmdSwitch_TsAware: { + lnk_cmd_switch_set_flag_inv_64(cmd->value_strings, cmd_switch, &config->flags, LNK_ConfigFlag_NoTsAware); + } break; + + case LNK_CmdSwitch_Version: { + lnk_cmd_switch_parse_version(cmd->value_strings, cmd_switch, &config->image_ver); + } break; + + case LNK_CmdSwitch_Rad_Age: { + lnk_cmd_switch_parse_u32(cmd->value_strings, cmd_switch, &config->age, 0); + } break; + + case LNK_CmdSwitch_Rad_BuildInfo: { + lnk_print_build_info(); + os_abort(0); + } break; + + case LNK_CmdSwitch_Rad_CheckUnusedDelayLoadDll: { + lnk_cmd_switch_set_flag_64(cmd->value_strings, cmd_switch, &config->flags, LNK_ConfigFlag_CheckUnusedDelayLoadDll); + } break; + + case LNK_CmdSwitch_Rad_Debug: { + lnk_cmd_switch_parse_flag(cmd->value_strings, cmd_switch, &config->rad_debug); + } break; + case LNK_CmdSwitch_Rad_DebugName: { + lnk_cmd_switch_parse_string_copy(arena, cmd->value_strings, cmd_switch, &config->rad_debug_name); + } break; + + case LNK_CmdSwitch_Rad_DelayBind: { + lnk_cmd_switch_set_flag_64(cmd->value_strings, cmd_switch, &config->flags, LNK_ConfigFlag_DelayBind); + } break; + + case LNK_CmdSwitch_Rad_DeleteManifest: { + lnk_cmd_switch_parse_flag(cmd->value_strings, cmd_switch, &config->delete_manifest); + } break; + + case LNK_CmdSwitch_Rad_DoMerge: { + lnk_cmd_switch_set_flag_64(cmd->value_strings, cmd_switch, &config->flags, LNK_ConfigFlag_Merge); + } break; + + case LNK_CmdSwitch_Rad_EnvLib: { + 
lnk_cmd_switch_set_flag_64(cmd->value_strings, cmd_switch, &config->flags, LNK_ConfigFlag_EnvLib); + } break; + + case LNK_CmdSwitch_Rad_Exe: { + lnk_cmd_switch_set_flag_16(cmd->value_strings, cmd_switch, &config->file_characteristics, PE_ImageFileCharacteristic_EXE); + } break; + + case LNK_CmdSwitch_Rad_Guid: { + if (cmd->value_strings.node_count == 1) { + if (str8_match(cmd->value_strings.first->string, str8_lit("imageblake3"), StringMatchFlag_CaseInsensitive)) { + config->guid_type = Lnk_DebugInfoGuid_ImageBlake3; + } else if (str8_match(cmd->value_strings.first->string, str8_lit("random"), StringMatchFlag_CaseInsensitive)) { + config->guid = os_make_guid(); + } else { + OS_Guid guid; + if (os_try_guid_from_string(cmd->value_strings.first->string, &guid)) { + config->guid = guid; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse \"%S\"", cmd->value_strings.first->string); + } + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters, expected GUID formatted as following: \"0000000-0000-0000-0000-000000000000\""); + } + } break; + + case LNK_CmdSwitch_Rad_LargePages: { + if (cmd->value_strings.node_count == 0) { + OS_ProcessInfo *process_info = os_get_process_info(); + if (process_info->large_pages_allowed) { + arena_default_flags |= ArenaFlag_LargePages; + } else { + lnk_error_cmd_switch(LNK_Warning_LargePages, cmd_switch, "Large pages aren't enabled on this system."); +#if OS_WINDOWS + lnk_supplement_error("To enable large pages:"); + lnk_supplement_error("\t- Press Win+R and open \"gpedit.msc\""); + lnk_supplement_error("\t- Navigate to Local Computer Policy > Computer Configuration > Windows Settings > Security Settings > Local Policies > User Rights And Assignments"); + lnk_supplement_error("\t- Double-click on \"Lock pages in memory\""); + lnk_supplement_error("\t- Click \"Add User or Group...\""); + lnk_supplement_error("\t- Type in your user name"); + lnk_supplement_error("\t- Click Oks 
and reboot the machine"); +#endif + } + } else if (cmd->value_strings.node_count == 1) { + if (str8_match(cmd->value_strings.first->string, str8_lit("quiet"), StringMatchFlag_CaseInsensitive)) { + OS_ProcessInfo *process_info = os_get_process_info(); + if (process_info->large_pages_allowed) { + arena_default_flags |= ArenaFlag_LargePages; + } + } else if (str8_match(cmd->value_strings.first->string, str8_lit("no"), StringMatchFlag_CaseInsensitive)) { + arena_default_flags &= ~ArenaFlag_LargePages; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid parameter: \"%S\", expected NO or QUIET", cmd->value_strings.first->string); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + } break; + + case LNK_CmdSwitch_Rad_LinkVer: { + lnk_cmd_switch_parse_version(cmd->value_strings, cmd_switch, &config->link_ver); + } break; + + case LNK_CmdSwitch_Rad_Log: { + if (cmd->value_strings.node_count == 1) { + if (str8_match(cmd->value_strings.first->string, str8_lit("all"), StringMatchFlag_CaseInsensitive)) { + for (U64 ilog = 0; ilog < LNK_Log_Count; ilog += 1) { + lnk_set_log_status((LNK_LogType)ilog, 1); + } + } else { + LNK_LogType log_type = lnk_log_type_from_string(cmd->value_strings.first->string); + if (log_type == LNK_Log_Null) { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unknown parameter \"%S\"", cmd->value_strings.first->string); + } else { + lnk_set_log_status(log_type, 1); + } + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters, expected 1"); + } + } break; + + case LNK_CmdSwitch_Rad_MtPath: { + lnk_cmd_switch_parse_string_copy(arena, cmd->value_strings, cmd_switch, &config->mt_path); + } break; + + case LNK_CmdSwitch_Rad_OsVer: { + lnk_cmd_switch_parse_version(cmd->value_strings, cmd_switch, &config->os_ver); + } break; + + case LNK_CmdSwitch_Rad_PageSize: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &config->page_size, 0); + } 
break; + + case LNK_CmdSwitch_Rad_PathStyle: { + if (cmd->value_strings.node_count == 1) { + PathStyle path_style = path_style_from_string(str8_list_first(&cmd->value_strings)); + if (path_style != PathStyle_Null) { + config->path_style = path_style; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unable to parse parameter \"%S\"", cmd->value_strings.first->string); + } + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "invalid number of parameters"); + } + } break; + + case LNK_CmdSwitch_Rad_SectVirtOff: { + U64 sect_virt_off; + if (lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, §_virt_off, LNK_ParseU64Flag_CheckUnder32bit)) { + if (sect_virt_off >= 0x1000) { + config->section_virt_off = sect_virt_off; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "section virtual offset must be >= 0x1000"); + } + } + } break; + + case LNK_CmdSwitch_Rad_SuppressError: { + U64List error_code_list = {0}; + if (lnk_cmd_switch_parse_u64_list(scratch.arena, cmd->value_strings, cmd_switch, &error_code_list, 0)) { + for (U64Node *error_code_n = error_code_list.first; error_code_n != 0; error_code_n = error_code_n->next) { + if (error_code_n->data < LNK_Error_Count) { + lnk_suppress_error(error_code_n->data); + } else { + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "unknown error code %llu", error_code_n->data); + } + } + } + } break; + + case LNK_CmdSwitch_Rad_SymbolTableCapDefined: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &config->symbol_table_cap_defined, 0); + } break; + case LNK_CmdSwitch_Rad_SymbolTableCapInternal: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &config->symbol_table_cap_internal, 0); + } break; + case LNK_CmdSwitch_Rad_SymbolTableCapWeak: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &config->symbol_table_cap_weak, 0); + } break; + case LNK_CmdSwitch_Rad_SymbolTableCapLib: { + lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, 
&config->symbol_table_cap_lib, 0); + } break; + + case LNK_CmdSwitch_Rad_TargetOs: { + if (cmd->value_strings.node_count == 1) { + String8 os_string = str8_list_first(&cmd->value_strings); + OperatingSystem target_os = operating_system_from_string(os_string); + if (target_os != OperatingSystem_Null) { + config->target_os = target_os; + } else { + lnk_error_cmd_switch(LNK_Error_Cmdl, cmd_switch, "unknown operating system type %S", os_string); + } + } else { + lnk_error_cmd_switch(LNK_Warning_Cmdl, cmd_switch, "expected 1 parameter"); + } + } break; + + case LNK_CmdSwitch_Rad_TimeStamp: { + lnk_cmd_switch_parse_u32(cmd->value_strings, cmd_switch, &config->time_stamp, 0); + } break; + + case LNK_CmdSwitch_Rad_Version: { + fprintf(stdout, "%s\n", BUILD_TITLE); + os_abort(0); + } break; + + case LNK_CmdSwitch_Rad_Workers: { + U64 worker_count; + if (lnk_cmd_switch_parse_u64(cmd->value_strings, cmd_switch, &worker_count, 0)) { + config->worker_count = worker_count; + } + } break; + + case LNK_CmdSwitch_Help: { + lnk_print_help(); + os_abort(0); + } break; + } + } + + // :manifest_input + if (lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_ManifestInput)) { + switch (config->manifest_opt) { + case LNK_ManifestOpt_Null: { + lnk_error_cmd_switch(LNK_Error_Cmdl, LNK_CmdSwitch_ManifestInput, "missing /MANIFEST:EMBED"); + } break; + case LNK_ManifestOpt_No: { + lnk_error_cmd_switch(LNK_Warning_Cmdl, LNK_CmdSwitch_ManifestInput, "missing /MANIFEST:EMBED, ignoring inputs"); + } break; + case LNK_ManifestOpt_Embed: { + for (LNK_CmdOption *cmd = cmd_line.first_option; cmd != 0; cmd = cmd->next) { + LNK_CmdSwitchType cmd_switch = lnk_cmd_switch_type_from_string(cmd->string); + if (cmd_switch == LNK_CmdSwitch_ManifestInput) { + String8List manifest_list = str8_list_copy(arena, &cmd->value_strings); + str8_list_concat_in_place(&config->input_list[LNK_Input_Manifest], &manifest_list); + } + } + } break; + } + } + + // input files + for (String8Node *input_node = 
cmd_line.input_list.first; input_node != 0; input_node = input_node->next) { + String8 path = push_str8_copy(arena, input_node->string); + String8 ext = str8_skip_last_dot(path); + + // map file extension to input type + LNK_InputType input_type = lnk_input_type_from_string(ext); + + // do we support this file format? + if (input_type == LNK_Input_Null) { + lnk_error(LNK_Error_Cmdl, "unknown file format \"%S\"", path); + continue; + } + + // psuh file path + str8_list_push(arena, &config->input_list[input_type], path); + } + + // os version and subsystem are always same? + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Rad_OsVer)) { + config->os_ver = config->subsystem_ver; + } + + // don't emit bind table with /ALLOWBIND:NO + if (config->dll_characteristics & PE_DllCharacteristic_NO_BIND) { + config->flags &= ~LNK_ConfigFlag_DelayBind; + } + + // gather lib paths from enviroment string + if (config->flags & LNK_ConfigFlag_EnvLib) { +#if OS_WINDOWS + OS_ProcessInfo *process_info = os_get_process_info(); + for (String8Node *node = process_info->environment.first; node != 0; node = node->next) { + String8List var_list = str8_split_by_string_chars(scratch.arena, node->string, str8_lit("="), 0); + if (var_list.node_count != 2) { + continue; + } + String8 key = var_list.first->string; + String8 val = var_list.last->string; + if (str8_match(key, str8_lit("Lib"), StringMatchFlag_CaseInsensitive) || + str8_match(key, str8_lit("LibPath"), StringMatchFlag_CaseInsensitive)) { + String8List val_list = str8_split_by_string_chars(scratch.arena, val, str8_lit(";"), 0); + String8List val_list_copy = str8_list_copy(arena, &val_list); + str8_list_concat_in_place(&config->lib_dir_list, &val_list_copy); + } + } +#endif + } + + // set flags for /OPT + { + // these flags remove and merge inline functions and methods defined in class, + // and makes stepping tougher, in debug mode we don't link with these optimizations + // unless user specifically orverrides. 
+ if (config->debug_mode != LNK_DebugMode_None) { + if (config->opt_ref == LNK_SwitchState_Null) { + config->opt_ref = LNK_SwitchState_No; + } + if (config->opt_icf == LNK_SwitchState_Null) { + config->opt_icf = LNK_SwitchState_No; + } + } + + // by default enable all optimizations + if (config->opt_ref == LNK_SwitchState_Null) { + config->opt_ref = LNK_SwitchState_Yes; + } + if (config->opt_icf == LNK_SwitchState_Null) { + config->opt_icf = LNK_SwitchState_Yes; + } + if (config->opt_lbr == LNK_SwitchState_Null) { + config->opt_lbr = LNK_SwitchState_Yes; + } + } + + // error check base address flags + if (config->flags & LNK_ConfigFlag_Fixed) { + if (lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_DynamicBase)) { + B32 is_dynamic_base_set = !!(config->dll_characteristics & PE_DllCharacteristic_DYNAMIC_BASE); + if (is_dynamic_base_set) { + lnk_error(LNK_Error_IncomatibleCmdOptions, "unable to link with /FIXED and /DYNAMICBASE at the same time"); + } + } + } + + // set flags for /FIXED + if (config->flags & LNK_ConfigFlag_Fixed) { + config->file_characteristics |= PE_ImageFileCharacteristic_STRIPPED; + config->dll_characteristics &= ~PE_DllCharacteristic_DYNAMIC_BASE; + } + + // set flag for /guard + if (config->guard_flags != LNK_Guard_None) { + config->dll_characteristics |= PE_DllCharacteristic_GUARD_CF; + } + + // handle empty /OUT + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Out)) { + String8 name = str8_list_first(&config->input_list[LNK_Input_Obj]); + String8 ext = (config->file_characteristics & PE_ImageFileCharacteristic_FILE_DLL) ? 
str8_lit("dll") : str8_lit("exe"); + config->image_name = make_file_path_with_ext(scratch.arena, name, ext); + } + config->image_name = os_make_full_path(arena, config->image_name); + + // handle empty /PDB + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Pdb)) { + config->pdb_name = make_file_path_with_ext(arena, config->image_name, str8_lit("pdb")); + } + config->pdb_name = os_make_full_path(arena, config->pdb_name); + + // handle empty /RAD_DEBUG_NAME + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_Rad_DebugName)) { + config->rad_debug_name = make_file_name_with_ext(arena, config->image_name, str8_lit("rdi")); + } + config->rad_debug_name = os_make_full_path(arena, config->rad_debug_name); + + // handle empty /IMPLIB + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_ImpLib)) { + config->imp_lib_name = make_file_name_with_ext(arena, config->image_name, str8_lit("lib")); + } + config->imp_lib_name = os_make_full_path(arena, config->imp_lib_name); + + // handle empty /MANIFESTFILE + if (!lnk_cmd_line_has_switch(cmd_line, LNK_CmdSwitch_ManifestFile)) { + config->manifest_name = make_file_path_with_ext(arena, config->image_name, str8_lit("manifest")); + } + + if (lnk_get_log_status(LNK_Log_Debug)) { + String8 full_cmd_line = str8_list_join(scratch.arena, &raw_cmd_line, &(StringJoin){ .sep = str8_lit_comp(" ") }); + lnk_log(LNK_Log_Debug, "--------------------------------------------------------------------------------"); + lnk_log(LNK_Log_Debug, "Command Line: %S", full_cmd_line); + lnk_log(LNK_Log_Debug, "Work Dir : %S", config->work_dir); + lnk_log(LNK_Log_Debug, "--------------------------------------------------------------------------------"); + } + + scratch_end(scratch); + ProfEnd(); + return config; +} + +internal LNK_Config * +lnk_build_config(Arena *arena, int argc, char **argv) +{ + Temp scratch = scratch_begin(&arena, 1); + + String8List raw_cmd_line = os_string_list_from_argcv(arena, argc, argv); + +#if OS_WINDOWS + // remove exe name 
first argument + str8_list_pop_front(&raw_cmd_line); +#endif + + // init config + LNK_Config *config = lnk_config_from_cmd_line(arena, raw_cmd_line); + + scratch_end(scratch); + return config; +} + diff --git a/src/linker/lnk_config.h b/src/linker/lnk_config.h new file mode 100644 index 00000000..895b76cd --- /dev/null +++ b/src/linker/lnk_config.h @@ -0,0 +1,480 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef enum +{ + LNK_CmdSwitch_Null, + LNK_CmdSwitch_NotImplemented, + LNK_CmdSwitch_Deprecated, + + LNK_CmdSwitch_Align, + LNK_CmdSwitch_AllowBind, + LNK_CmdSwitch_AllowIsolation, + LNK_CmdSwitch_AlternateName, + LNK_CmdSwitch_AppContainer, + LNK_CmdSwitch_Base, + LNK_CmdSwitch_Debug, + LNK_CmdSwitch_DefaultLib, + LNK_CmdSwitch_Delay, + LNK_CmdSwitch_DelayLoad, + LNK_CmdSwitch_Dll, + LNK_CmdSwitch_DynamicBase, + LNK_CmdSwitch_Entry, + LNK_CmdSwitch_FastFail, + LNK_CmdSwitch_FileAlign, + LNK_CmdSwitch_Fixed, + LNK_CmdSwitch_FunctionPadMin, + LNK_CmdSwitch_Heap, + LNK_CmdSwitch_HighEntropyVa, + LNK_CmdSwitch_Ignore, + LNK_CmdSwitch_ImpLib, + LNK_CmdSwitch_Include, + LNK_CmdSwitch_Incremental, + LNK_CmdSwitch_LargeAddressAware, + LNK_CmdSwitch_LibPath, + LNK_CmdSwitch_Machine, + LNK_CmdSwitch_Manifest, + LNK_CmdSwitch_ManifestDependency, + LNK_CmdSwitch_ManifestFile, + LNK_CmdSwitch_ManifestInput, + LNK_CmdSwitch_ManifestUac, + LNK_CmdSwitch_Natvis, + LNK_CmdSwitch_NoDefaultLib, + LNK_CmdSwitch_NoLogo, + LNK_CmdSwitch_NxCompat, + LNK_CmdSwitch_Opt, + LNK_CmdSwitch_Out, + LNK_CmdSwitch_Pdb, + LNK_CmdSwitch_PdbPageSize, + LNK_CmdSwitch_Stack, + LNK_CmdSwitch_SubSystem, + LNK_CmdSwitch_Time, + LNK_CmdSwitch_TsAware, + + // -- NOT Implemented: + + LNK_CmdSwitch_AssemblyDebug, + LNK_CmdSwitch_AssemblyLinkResource, + LNK_CmdSwitch_AssemblyModule, + LNK_CmdSwitch_AssemblyResource, + LNK_CmdSwitch_ClrImageType, + LNK_CmdSwitch_ClrLoaderOptimization, + 
LNK_CmdSwitch_ClrSupportLastError, + LNK_CmdSwitch_ClrThreadAttribute, + LNK_CmdSwitch_ClrRunManagedCodeCheck, + LNK_CmdSwitch_ClrUnmanagedCheck, + LNK_CmdSwitch_Def, + LNK_CmdSwitch_DelaySign, + LNK_CmdSwitch_DependentLoadFlag, + LNK_CmdSwitch_Driver, + LNK_CmdSwitch_DisallowLib, + LNK_CmdSwitch_EmitVolatileMetadata, + LNK_CmdSwitch_ErrorReport, + LNK_CmdSwitch_Export, + LNK_CmdSwitch_ExportAdmin, + LNK_CmdSwitch_FastGenProfile, + LNK_CmdSwitch_Force, + LNK_CmdSwitch_Guard, + LNK_CmdSwitch_GenProfile, + LNK_CmdSwitch_IdlOut, + LNK_CmdSwitch_IgnoreIdl, + LNK_CmdSwitch_Ilk, + LNK_CmdSwitch_IntegrityCheck, + LNK_CmdSwitch_Kernel, + LNK_CmdSwitch_KeyContainer, + LNK_CmdSwitch_KeyFile, + LNK_CmdSwitch_LinkerRepro, + LNK_CmdSwitch_LinkerReproTarget, + LNK_CmdSwitch_Ltcg, + LNK_CmdSwitch_LtcgOut, + LNK_CmdSwitch_Map, + LNK_CmdSwitch_MapInfo, + LNK_CmdSwitch_Merge, + LNK_CmdSwitch_Midl, + LNK_CmdSwitch_NoAssembly, + LNK_CmdSwitch_NoEntry, + LNK_CmdSwitch_NoExp, + LNK_CmdSwitch_NoImpLib, + LNK_CmdSwitch_Order, + LNK_CmdSwitch_PdbStripped, + LNK_CmdSwitch_Profile, + LNK_CmdSwitch_Release, + LNK_CmdSwitch_SafeSeh, + LNK_CmdSwitch_Section, + LNK_CmdSwitch_SourceLink, + LNK_CmdSwitch_Stub, + LNK_CmdSwitch_SwapRun, + LNK_CmdSwitch_TlbId, + LNK_CmdSwitch_UserProfile, + LNK_CmdSwitch_Verbose, + LNK_CmdSwitch_Version, + LNK_CmdSwitch_Winmd, + LNK_CmdSwitch_WinmdDelaySign, + LNK_CmdSwitch_WinmdKeyContainer, + LNK_CmdSwitch_WinmdKeyFile, + LNK_CmdSwitch_WholeArchive, + LNK_CmdSwitch_Wx, + + LNK_CmdSwitch_Rad_Age, + LNK_CmdSwitch_Rad_BuildInfo, + LNK_CmdSwitch_Rad_CheckUnusedDelayLoadDll, + LNK_CmdSwitch_Rad_Debug, + LNK_CmdSwitch_Rad_DebugName, + LNK_CmdSwitch_Rad_DelayBind, + LNK_CmdSwitch_Rad_DeleteManifest, + LNK_CmdSwitch_Rad_DoMerge, + LNK_CmdSwitch_Rad_EnvLib, + LNK_CmdSwitch_Rad_Exe, + LNK_CmdSwitch_Rad_Guid, + LNK_CmdSwitch_Rad_LargePages, + LNK_CmdSwitch_Rad_LinkVer, + LNK_CmdSwitch_Rad_Log, + LNK_CmdSwitch_Rad_Logo, + LNK_CmdSwitch_Rad_MtPath, + LNK_CmdSwitch_Rad_OsVer, + 
LNK_CmdSwitch_Rad_PageSize,
+  LNK_CmdSwitch_Rad_PathStyle,
+  LNK_CmdSwitch_Rad_SectVirtOff,
+  LNK_CmdSwitch_Rad_SuppressError,
+  LNK_CmdSwitch_Rad_SymbolTableCapDefined,
+  LNK_CmdSwitch_Rad_SymbolTableCapInternal,
+  LNK_CmdSwitch_Rad_SymbolTableCapWeak,
+  LNK_CmdSwitch_Rad_SymbolTableCapLib,
+  LNK_CmdSwitch_Rad_TargetOs,
+  LNK_CmdSwitch_Rad_TimeStamp,
+  LNK_CmdSwitch_Rad_Version,
+  LNK_CmdSwitch_Rad_Workers,
+
+  LNK_CmdSwitch_Help,
+
+  LNK_CmdSwitch_Count
+} LNK_CmdSwitchType;
+
+// Three-valued switch state. LNK_Config stores it for opt_ref, opt_icf, opt_lbr,
+// delete_manifest and rad_debug.
+// NOTE(review): Null presumably means "not specified on the command line" - confirm against the parser.
+typedef enum
+{
+  LNK_SwitchState_Null,
+  LNK_SwitchState_No,
+  LNK_SwitchState_Yes
+} LNK_SwitchState;
+
+// Kinds of linker input. LNK_Input_Count sizes LNK_Config.input_list.
+typedef enum
+{
+  LNK_Input_Null,
+  LNK_Input_Obj,
+  LNK_Input_Lib,
+  LNK_Input_Res,
+  LNK_Input_Manifest,
+  LNK_Input_Count
+} LNK_InputType;
+
+// Bit flags stored in LNK_Config.flags (LNK_ConfigFlags is the U64 typedef right below).
+// NOTE(review): bit 7 is not assigned, the values jump from (1 << 6) to (1 << 8).
+enum
+{
+  LNK_ConfigFlag_Fixed = (1 << 0),
+  LNK_ConfigFlag_Merge = (1 << 1),
+  LNK_ConfigFlag_EnvLib = (1 << 2),
+  LNK_ConfigFlag_DelayUnload = (1 << 3),
+  LNK_ConfigFlag_DelayBind = (1 << 4),
+  LNK_ConfigFlag_CheckUnusedDelayLoadDll = (1 << 5),
+  LNK_ConfigFlag_NoTsAware = (1 << 6),
+  LNK_ConfigFlag_WriteImageChecksum = (1 << 8),
+  LNK_ConfigFlag_ManifestEmbed = (1 << 9),
+};
+typedef U64 LNK_ConfigFlags;
+
+// Debug info mode stored in LNK_Config.debug_mode; lnk_debug_mode_from_string
+// (declared further down in this header) produces a value of this type.
+typedef enum
+{
+  LNK_DebugMode_Null,
+  LNK_DebugMode_None,
+  LNK_DebugMode_FastLink,
+  LNK_DebugMode_GHash,
+  LNK_DebugMode_Full,
+} LNK_DebugMode;
+
+// Guard feature bits stored in LNK_Config.guard_flags (LNK_GuardFlags is the U32 typedef below).
+// LNK_Guard_All is the OR of the three individual bits.
+enum
+{
+  LNK_Guard_None = 0,
+  LNK_Guard_Cf = (1 << 0),
+  LNK_Guard_LongJmp = (1 << 1),
+  LNK_Guard_EhCont = (1 << 2),
+  LNK_Guard_All = LNK_Guard_Cf | LNK_Guard_LongJmp | LNK_Guard_EhCont
+};
+typedef U32 LNK_GuardFlags;
+
+// Manifest handling option stored in LNK_Config.manifest_opt.
+typedef enum
+{
+  LNK_ManifestOpt_Null,
+  LNK_ManifestOpt_Embed,
+  LNK_ManifestOpt_No
+} LNK_ManifestOpt;
+
+// Pair of string lists filled by lnk_parse_alt_name_directive / lnk_parse_alt_name_directive_list.
+// NOTE(review): presumably entry i of from_list is the alias for entry i of to_list - confirm.
+typedef struct LNK_AltNameList
+{
+  String8List from_list;
+  String8List to_list;
+} LNK_AltNameList;
+
+// Selects how LNK_Config.guid is obtained (stored in LNK_Config.guid_type).
+// NOTE(review): the second enumerator is spelled with an `Lnk_` prefix while every other
+// enumerator in this header uses `LNK_`.
+typedef enum
+{
+  LNK_DebugInfoGuid_Null,
+  Lnk_DebugInfoGuid_ImageBlake3,
+} LNK_DebugInfoGuidType;
+
+// Aggregated linker configuration. lnk_config_from_raw_cmd_line and lnk_build_config
+// (declared at the end of this header) return a pointer to this struct.
+typedef struct LNK_Config
+{
+  LNK_ConfigFlags flags;
+  LNK_DebugMode debug_mode;
+  LNK_SwitchState opt_ref;
+  LNK_SwitchState opt_icf;
+  LNK_SwitchState opt_lbr;
+  U64 opt_iter_count;
+  LNK_GuardFlags guard_flags;
+  LNK_DebugInfoGuidType guid_type;
+  OS_Guid guid;
+  COFF_TimeStamp time_stamp;
+  U32 age;
+  U64 section_virt_off;
+  U64 file_align;
+  U64 sect_align;
+  U64 stack_reserve;
+  U64 stack_commit;
+  U64 heap_reserve;
+  U64 heap_commit;
+  U64 user_base_addr;
+  U64 max_image_size;
+  U64 page_size;
+  U64 pdb_page_size;
+  U64 worker_count;
+  U64 function_pad_min;
+  U64 manifest_resource_id;
+  Version link_ver;
+  Version os_ver;
+  Version image_ver;
+  OperatingSystem target_os;
+  COFF_MachineType machine;
+  PE_WindowsSubsystem subsystem;
+  Version subsystem_ver;
+  PE_ImageFileCharacteristics file_characteristics;
+  PE_DllCharacteristics dll_characteristics;
+  String8 user_entry_point_name;
+  String8 entry_point_name;
+  String8List lib_dir_list;
+  PathStyle path_style;
+  LNK_ManifestOpt manifest_opt;
+  String8 work_dir;
+  String8 image_name;
+  String8 imp_lib_name;
+  String8List raw_cmd_line;
+  String8 pdb_name;
+  String8 mt_path;
+  // one list per LNK_InputType value
+  String8List input_list[LNK_Input_Count];
+  String8List input_default_lib_list;
+  String8List disallow_lib_list;
+  String8List delay_load_dll_list;
+  String8List natvis_list;
+  LNK_SwitchState delete_manifest;
+  String8 manifest_name;
+  B32 manifest_uac;
+  String8 manifest_level;
+  String8 manifest_ui_access;
+  String8List manifest_dependency_list;
+  LNK_SwitchState rad_debug;
+  String8 rad_debug_name;
+  String8List include_symbol_list;
+  LNK_AltNameList alt_name_list;
+  U64 symbol_table_cap_defined;
+  U64 symbol_table_cap_internal;
+  U64 symbol_table_cap_weak;
+  U64 symbol_table_cap_lib;
+} LNK_Config;
+
+// Validation flags accepted by lnk_cmd_switch_parse_u64 / lnk_cmd_switch_parse_u32
+// (both declared further down in this header).
+typedef enum
+{
+  LNK_ParseU64Flag_CheckUnder32bit = (1 << 0),
+  LNK_ParseU64Flag_CheckPow2 = (1 << 1),
+} LNK_ParseU64Flags;
+
+////////////////////////////////
+
+// Numeric diagnostic codes. For every enumerator named Lnk#### the value equals the number in its name.
+// NOTE(review): presumably these mirror the LNK#### numbers of MSVC link.exe - confirm.
+typedef enum
+{
+  LNK_MsErrorCode_Lnk1000 = 1000,
+  LNK_MsErrorCode_Lnk1103 = 1103,
+  LNK_MsErrorCode_Lnk1104 = 1104,
+  LNK_MsErrorCode_Lnk1106 = 1106,
+  LNK_MsErrorCode_Lnk1107 = 1107,
+
LNK_MsErrorCode_Lnk1112 = 1112,
+  LNK_MsErrorCode_Lnk1113 = 1113,
+  LNK_MsErrorCode_Lnk1120 = 1120,
+  LNK_MsErrorCode_Lnk1123 = 1123,
+  LNK_MsErrorCode_Lnk1127 = 1127,
+  LNK_MsErrorCode_Lnk1136 = 1136,
+  LNK_MsErrorCode_Lnk1140 = 1140,
+  LNK_MsErrorCode_Lnk1141 = 1141,
+  LNK_MsErrorCode_Lnk1143 = 1143,
+  LNK_MsErrorCode_Lnk1152 = 1152,
+  LNK_MsErrorCode_Lnk1158 = 1158,
+  LNK_MsErrorCode_Lnk1164 = 1164,
+  LNK_MsErrorCode_Lnk1166 = 1166,
+  LNK_MsErrorCode_Lnk1168 = 1168,
+  LNK_MsErrorCode_Lnk1169 = 1169,
+  LNK_MsErrorCode_Lnk1170 = 1170,
+  LNK_MsErrorCode_Lnk1179 = 1179,
+  LNK_MsErrorCode_Lnk1181 = 1181,
+  LNK_MsErrorCode_Lnk1189 = 1189,
+  LNK_MsErrorCode_Lnk1196 = 1196,
+  LNK_MsErrorCode_Lnk1200 = 1200,
+  LNK_MsErrorCode_Lnk1201 = 1201,
+  LNK_MsErrorCode_Lnk1211 = 1211,
+  LNK_MsErrorCode_Lnk1215 = 1215,
+  LNK_MsErrorCode_Lnk1218 = 1218,
+  LNK_MsErrorCode_Lnk1221 = 1221,
+  LNK_MsErrorCode_Lnk1223 = 1223,
+  LNK_MsErrorCode_Lnk1224 = 1224,
+  LNK_MsErrorCode_Lnk1237 = 1237,
+  LNK_MsErrorCode_Lnk1240 = 1240,
+  LNK_MsErrorCode_Lnk1241 = 1241,
+  LNK_MsErrorCode_Lnk1245 = 1245,
+  LNK_MsErrorCode_Lnk1248 = 1248,
+  LNK_MsErrorCode_Lnk1256 = 1256,
+  LNK_MsErrorCode_Lnk1264 = 1264,
+  LNK_MsErrorCode_Lnk1277 = 1277,
+  LNK_MsErrorCode_Lnk1282 = 1282,
+  LNK_MsErrorCode_Lnk1287 = 1287,
+  LNK_MsErrorCode_Lnk1296 = 1296,
+  LNK_MsErrorCode_Lnk1301 = 1301,
+  LNK_MsErrorCode_Lnk1302 = 1302,
+  LNK_MsErrorCode_Lnk1306 = 1306,
+  LNK_MsErrorCode_Lnk1309 = 1309,
+  LNK_MsErrorCode_Lnk1312 = 1312,
+  LNK_MsErrorCode_Lnk1313 = 1313,
+  LNK_MsErrorCode_Lnk1314 = 1314,
+  LNK_MsErrorCode_Lnk1318 = 1318,
+  LNK_MsErrorCode_Lnk1332 = 1332,
+  LNK_MsErrorCode_Lnk1352 = 1352,
+  LNK_MsErrorCode_Lnk1561 = 1561,
+  LNK_MsErrorCode_Lnk2001 = 2001,
+  LNK_MsErrorCode_Lnk2004 = 2004,
+  LNK_MsErrorCode_Lnk2005 = 2005,
+  LNK_MsErrorCode_Lnk2008 = 2008,
+  LNK_MsErrorCode_Lnk2011 = 2011,
+  LNK_MsErrorCode_Lnk2013 = 2013,
+  LNK_MsErrorCode_Lnk2017 = 2017,
+  LNK_MsErrorCode_Lnk2019 = 2019,
+  LNK_MsErrorCode_Lnk2020 = 2020,
+  LNK_MsErrorCode_Lnk2022 = 2022,
+  LNK_MsErrorCode_Lnk2023 = 2023,
+  LNK_MsErrorCode_Lnk2026 = 2026,
+  LNK_MsErrorCode_Lnk2027 = 2027,
+  LNK_MsErrorCode_Lnk2031 = 2031,
+  LNK_MsErrorCode_Lnk2033 = 2033,
+  LNK_MsErrorCode_Lnk2038 = 2038,
+  LNK_MsErrorCode_Lnk2039 = 2039,
+
+  // warning codes (LNK_MsWarningCode_* prefix, values 4001 and up); a few carry
+  // descriptive names instead of the Lnk#### pattern
+  LNK_MsWarningCode_Lnk4001 = 4001,
+  LNK_MsWarningCode_Lnk4002 = 4002,
+  LNK_MsWarningCode_Lnk4006 = 4006,
+  LNK_MsWarningCode_Lnk4010 = 4010,
+  LNK_MsWarningCode_Lnk4014 = 4014,
+  LNK_MsWarningCode_Lnk4020 = 4020,
+  LNK_MsWarningCode_Lnk4022 = 4022,
+  LNK_MsWarningCode_Lnk4039 = 4039,
+  LNK_MsWarningCode_Lnk4044 = 4044,
+  LNK_MsWarningCode_Lnk4049 = 4049,
+  LNK_MsWarningCode_Lnk4065 = 4065,
+  LNK_MsWarningCode_Lnk4070 = 4070,
+  LNK_MsWarningCode_Lnk4071 = 4071,
+  LNK_MsWarningCode_Lnk4073 = 4073,
+  LNK_MsWarningCode_Lnk4075 = 4075,
+  LNK_MsWarningCode_Lnk4076 = 4076,
+  LNK_MsWarningCode_SectionFlagsConflict = 4078,
+  LNK_MsWarningCode_Lnk4086 = 4086,
+  LNK_MsWarningCode_Lnk4092 = 4092,
+  LNK_MsWarningCode_Lnk4096 = 4096,
+  LNK_MsWarningCode_Lnk4098 = 4098,
+  LNK_MsWarningCode_MissingExternalTypeServer = 4099,
+  LNK_MsWarningCode_Lnk4102 = 4102,
+  LNK_MsWarningCode_Lnk4104 = 4104,
+  LNK_MsWarningCode_Lnk4105 = 4105,
+  LNK_MsWarningCode_Lnk4194 = 4194,
+  LNK_MsWarningCode_Lnk4197 = 4197,
+  // NOTE(review): "Unsued" looks like a typo for "Unused"; renaming it would touch every user of the enumerator
+  LNK_MsWarningCode_UnsuedDelayLoadDll = 4199,
+  LNK_MsWarningCode_Lnk4200 = 4200,
+  LNK_MsWarningCode_Lnk4204 = 4204,
+  LNK_MsWarningCode_Lnk4205 = 4205,
+  LNK_MsWarningCode_Lnk4206 = 4206,
+  LNK_MsWarningCode_Lnk4210 = 4210,
+  LNK_MsWarningCode_Lnk4216 = 4216,
+  LNK_MsWarningCode_Lnk4217 = 4217,
+  LNK_MsWarningCode_Lnk4219 = 4219,
+  LNK_MsWarningCode_Lnk4220 = 4220,
+  LNK_MsWarningCode_Lnk4221 = 4221,
+  LNK_MsWarningCode_Lnk4222 = 4222,
+  LNK_MsWarningCode_Lnk4224 = 4224,
+  LNK_MsWarningCode_Lnk4227 = 4227,
+  LNK_MsWarningCode_Lnk4229 = 4229,
+  LNK_MsWarningCode_Lnk4237 = 4237,
+  LNK_MsWarningCode_Lnk4247 = 4247,
+  LNK_MsWarningCode_Lnk4248 = 4248,
+  LNK_MsWarningCode_Lnk4253 = 4253,
+  LNK_MsWarningCode_Lnk4254 = 4254,
+  LNK_MsWarningCode_Lnk4286 = 4286,
+} LNK_MsErrorCode;
+
+////////////////////////////////
+// Enum <-> String
+
+internal String8 lnk_string_cmd_switch_type(LNK_CmdSwitchType type);
+internal LNK_CmdSwitchType lnk_cmd_switch_from_string(String8 string);
+internal LNK_InputType lnk_input_type_from_string(String8 string);
+internal LNK_DebugMode lnk_debug_mode_from_string(String8 string);
+
+////////////////////////////////
+// Command Line Helpers
+
+// the *f variants take a printf-style format (param_fmt) followed by its arguments
+internal LNK_CmdOption * lnk_cmd_line_push_option_if_not_presentf(Arena *arena, LNK_CmdLine *cmd_line, LNK_CmdSwitchType cmd_switch_type, char *param_fmt, ...);
+internal LNK_CmdOption * lnk_cmd_line_push_optionf(Arena *arena, LNK_CmdLine *cmd_line, LNK_CmdSwitchType cmd_switch_type, char *param_fmt, ...);
+internal B32 lnk_cmd_line_has_switch(LNK_CmdLine cmd_line, LNK_CmdSwitchType cmd_switch_type);
+
+////////////////////////////////
+// Errors
+
+internal void lnk_error_cmd_switch(LNK_ErrorCode code, LNK_CmdSwitchType cmd_switch, char *fmt, ...);
+internal void lnk_error_cmd_switch_invalid_param_count(LNK_ErrorCode code, LNK_CmdSwitchType cmd_switch);
+internal void lnk_error_cmd_switch_invalid_param(LNK_ErrorCode code, LNK_CmdSwitchType cmd_switch, String8 param);
+
+////////////////////////////////
+// Getters
+
+internal U64 lnk_get_base_addr(LNK_Config *config);
+internal Version lnk_get_default_subsystem_version(PE_WindowsSubsystem subsystem, COFF_MachineType machine);
+internal Version lnk_get_min_subsystem_version(PE_WindowsSubsystem subsystem, COFF_MachineType machine);
+internal String8 lnk_get_mt_path(Arena *arena);
+
+internal B32 lnk_do_debug_info(LNK_Config *config);
+
+////////////////////////////////
+// Specialized Parsers
+
+// parsed values are written through the trailing *_out pointer
+internal B32 lnk_cmd_switch_parse_version(String8List value_strings, LNK_CmdSwitchType cmd_switch, Version *ver_out);
+internal B32 lnk_cmd_switch_parse_tuple(String8List value_strings, LNK_CmdSwitchType cmd_switch, Rng1U64 *tuple_out);
+internal B32 lnk_cmd_switch_parse_u64(String8List value_strings, LNK_CmdSwitchType cmd_switch, U64 *value_out, LNK_ParseU64Flags flags); +internal B32 lnk_cmd_switch_parse_u32(String8List value_strings, LNK_CmdSwitchType cmd_switch, U32 *value_out, LNK_ParseU64Flags flags); +internal B32 lnk_cmd_switch_parse_flag(String8List value_strings, LNK_CmdSwitchType cmd_switch, LNK_SwitchState *value_out); +internal void lnk_cmd_switch_set_flag_inv_16(String8List value_strings, LNK_CmdSwitchType cmd_switch, U16 *flags, U16 bits); +internal void lnk_cmd_switch_set_flag_inv_64(String8List value_strings, LNK_CmdSwitchType cmd_switch, U64 *flags, U64 bits); +internal void lnk_cmd_switch_set_flag_16(String8List value_strings, LNK_CmdSwitchType cmd_switch, U16 *flags, U16 bits); +internal void lnk_cmd_switch_set_flag_32(String8List value_strings, LNK_CmdSwitchType cmd_switch, U32 *flags, U32 bits); +internal void lnk_cmd_switch_set_flag_64(String8List value_strings, LNK_CmdSwitchType cmd_switch, U64 *flags, U64 bits); +internal B32 lnk_cmd_switch_parse_string(String8List value_strings, LNK_CmdSwitchType cmd_switch, String8 *string_out); +internal void lnk_cmd_switch_parse_string_copy(Arena *arena, String8List value_strings, LNK_CmdSwitchType cmd_switch, String8 *string_out); + +internal B32 lnk_parse_alt_name_directive(Arena *arena, String8 input, LNK_AltNameList *list_out); +internal String8 * lnk_parse_alt_name_directive_list(Arena *arena, String8List list, LNK_AltNameList *list_out); + +//////////////////////////////// + +internal LNK_Config * lnk_config_from_raw_cmd_line(Arena *arena, String8List raw_cmd_line); +internal LNK_Config * lnk_build_config(Arena *arena, int argc, char **argv); + diff --git a/src/linker/lnk_debug_info.c b/src/linker/lnk_debug_info.c new file mode 100644 index 00000000..134ab865 --- /dev/null +++ b/src/linker/lnk_debug_info.c @@ -0,0 +1,5307 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license 
(https://opensource.org/license/mit/)
+
+////////////////////////////////
+
+// Thread-pool task: parses and merges all .debug$S chunks of one obj.
+//
+// task_id selects the obj, raw_task points at a LNK_ParseDebugSTaskData and arena receives
+// the parse allocations (the three names are not declared here; presumably they are
+// parameters introduced by THREAD_POOL_TASK_FUNC - confirm against its definition).
+// Every chunk is parsed with cv_parse_debug_s and concatenated into task->debug_s_arr[obj_idx].
+// A warning is reported when the merged result holds more than one string table or more than
+// one file checksum sub-section.
+internal
+THREAD_POOL_TASK_FUNC(lnk_parse_debug_s_task)
+{
+  U64 obj_idx = task_id;
+  LNK_ParseDebugSTaskData *task = raw_task;
+
+  LNK_Obj *obj = task->obj_arr[obj_idx];
+  LNK_ChunkList sect_list = task->sect_list_arr[obj_idx];
+  CV_DebugS *debug_s = &task->debug_s_arr[obj_idx];
+
+  for (LNK_ChunkNode *node = sect_list.first; node != 0; node = node->next) {
+    LNK_ChunkPtr chunk = node->data;
+    Assert(chunk->type == LNK_Chunk_Leaf);
+
+    // parse & merge sub sections
+    CV_DebugS ds = cv_parse_debug_s(arena, chunk->u.leaf);
+    cv_debug_s_concat_in_place(debug_s, &ds);
+  }
+
+  // The checks below inspect the merged result, so they run once after every chunk has been
+  // concatenated. Running them per chunk repeated the same warning for each chunk that
+  // followed the one introducing the duplicate.
+
+  // make sure there is one string table
+  String8List string_data_list = cv_sub_section_from_debug_s(*debug_s, CV_C13SubSectionKind_StringTable);
+  if (string_data_list.node_count > 1) {
+    // TODO: print section index
+    lnk_error_obj(LNK_Warning_IllData, obj, ".debug$S has %u string table sub-sections defined, picking first sub-section", string_data_list.node_count);
+  }
+
+  // make sure there is one file checksum table
+  String8List checksum_data_list = cv_sub_section_from_debug_s(*debug_s, CV_C13SubSectionKind_FileChksms);
+  if (checksum_data_list.node_count > 1) {
+    // TODO: print section index
+    lnk_error_obj(LNK_Warning_IllData, obj, ".debug$S has %u file checksum sub-sections defined, picking first sub-section", checksum_data_list.node_count);
+  }
+}
+
+// Parses the .debug$S chunk lists of obj_count objs in parallel.
+//
+// sect_list_arr[i] holds the chunks of obj_arr[i]. The returned array has obj_count entries
+// and is allocated (zeroed, via push_array) from arena->v[0]; entry i is filled by
+// lnk_parse_debug_s_task.
+internal CV_DebugS *
+lnk_parse_debug_s_sections(TP_Context *tp, TP_Arena *arena, U64 obj_count, LNK_Obj **obj_arr, LNK_ChunkList *sect_list_arr)
+{
+  ProfBeginFunction();
+
+  LNK_ParseDebugSTaskData task_data = {0};
+  task_data.obj_arr = obj_arr;
+  task_data.sect_list_arr = sect_list_arr;
+  task_data.debug_s_arr = push_array(arena->v[0], CV_DebugS, obj_count);
+
+  tp_for_parallel(tp, arena, obj_count, lnk_parse_debug_s_task, &task_data);
+
+  ProfEnd();
+  return task_data.debug_s_arr;
+}
+
+// Thread-pool task: validates the CodeView signature at the start of every type section
+// of one obj and strips it in place.
+//
+// For each non-empty entry of task->data_arr_arr[obj_idx]:
+//  - C13 signature: the entry is advanced past the signature
+//  - C6 / C7 / C11 or unknown signature: reported and the entry is emptied
+//  - entry shorter than a signature: reported and the entry is emptied
+internal
+THREAD_POOL_TASK_FUNC(lnk_check_debug_t_sig_and_get_data_task)
+{
+  U64 obj_idx = task_id;
+  LNK_CheckDebugTSigTaskData *task = raw_task;
+
+  String8Array data_arr = task->data_arr_arr[obj_idx];
+  LNK_Obj *obj = task->obj_arr[obj_idx];
+
+  for (String8 *data_ptr = &data_arr.v[0], *data_opl = data_arr.v + data_arr.count;
+       data_ptr < data_opl;
+       ++data_ptr) {
+    if (data_ptr->size == 0) {
+      continue;
+    }
+
+    if (data_ptr->size < sizeof(CV_Signature)) {
+      // TODO: print section index
+      lnk_error_obj(LNK_Error_IllData, obj, ".debug$T must have at least 4 bytes for CodeView signature");
+      // the section cannot hold a signature: drop it instead of reading past its end below
+      *data_ptr = str8(0,0);
+      continue;
+    }
+
+    CV_Signature *sig_ptr = (CV_Signature *)data_ptr->str;
+    switch (*sig_ptr) {
+    default: {
+      lnk_error_obj(LNK_Warning_IllData, obj, "unknown CodeView type signature in section (TODO: print section index)");
+      *data_ptr = str8(0,0);
+    } break;
+    case CV_Signature_C6: {
+      lnk_not_implemented("TODO: C6 types");
+      *data_ptr = str8(0,0);
+    } break;
+    case CV_Signature_C7: {
+      lnk_not_implemented("TODO: C7 types");
+      *data_ptr = str8(0,0);
+    } break;
+    case CV_Signature_C11: {
+      lnk_not_implemented("TODO: C11 types");
+      *data_ptr = str8(0,0);
+    } break;
+    case CV_Signature_C13: {
+      // skip the signature so that the entry starts at the first leaf
+      data_ptr->str += sizeof(CV_Signature);
+      data_ptr->size -= sizeof(CV_Signature);
+    } break;
+    }
+  }
+}
+
+// Thread-pool task: builds the CV_DebugT of one obj from its (signature-stripped) section data.
+internal
+THREAD_POOL_TASK_FUNC(lnk_parse_debug_t_task)
+{
+  ProfBeginFunction();
+  U64 obj_idx = task_id;
+  LNK_ParseDebugTTaskData *task = raw_task;
+  String8Array data_arr = task->data_arr_arr[obj_idx];
+  CV_DebugT *debug_t = &task->debug_t_arr[obj_idx];
+  *debug_t = cv_debug_t_from_data_arr(arena, data_arr, CV_LeafAlign);
+  ProfEnd();
+}
+
+// Converts the per-obj chunk lists to data arrays, validates and strips the CodeView
+// signatures, then parses the leaves of every obj in parallel.
+internal CV_DebugT *
+lnk_parse_debug_t_sections(TP_Context *tp, TP_Arena *arena, U64 obj_count, LNK_Obj **obj_arr, LNK_ChunkList *debug_t_list_arr)
+{
+  ProfBeginFunction();
+
+  // list -> array
+  String8Array *data_arr_arr = lnk_data_arr_from_chunk_ptr_list_arr(arena->v[0], debug_t_list_arr, obj_count);
+
+  // validate signatures
+  LNK_CheckDebugTSigTaskData check_sig;
+  check_sig.obj_arr = obj_arr;
+  check_sig.data_arr_arr =
data_arr_arr;
+  tp_for_parallel(tp, 0, obj_count, lnk_check_debug_t_sig_and_get_data_task, &check_sig);
+
+  // parse debug types
+  LNK_ParseDebugTTaskData parse;
+  parse.data_arr_arr = data_arr_arr;
+  parse.debug_t_arr = push_array_no_zero(arena->v[0], CV_DebugT, obj_count);
+  tp_for_parallel(tp, arena, obj_count, lnk_parse_debug_t_task, &parse);
+
+  ProfEnd();
+  return parse.debug_t_arr;
+}
+
+// Thread-pool task: parses one raw symbols sub-section into the symbol list reserved for it.
+internal
+THREAD_POOL_TASK_FUNC(lnk_parse_cv_symbols_task)
+{
+  LNK_ParseCVSymbolsTaskData *task = raw_task;
+  LNK_CodeViewSymbolsInput *input = &task->inputs[task_id];
+  cv_parse_symbol_sub_section(arena, input->symbol_list, 0, input->raw_symbols, CV_SymbolAlign);
+}
+
+// Wires objs that use a precompiled header (first leaf is LF_PRECOMP) to the obj whose
+// .debug$P provides the precompiled types.
+//
+// Returns an array of obj_count LNK_PchInfo allocated from arena. For a PCH user the entry
+// holds the type index range [start_index, start_index + leaf_count) and the index of the
+// providing obj; for every other obj both bounds are CV_MinComplexTypeIndex and the obj
+// index is 0.
+//
+// Side effects:
+//  - debug_t_arr[i] of every PCH user is advanced past its LF_PRECOMP leaf
+//  - the LF_ENDPRECOMP leaf header of every referenced .debug$P is rewritten to NOTYPE
+//
+// parsed_symbols[i] is read to compare the LF_PRECOMP signature with the signature of the
+// first symbol of the providing obj.
+internal LNK_PchInfo *
+lnk_setup_pch(Arena *arena, U64 obj_count, LNK_Obj *obj_arr, CV_DebugT *debug_t_arr, CV_DebugT *debug_p_arr, CV_SymbolListArray *parsed_symbols)
+{
+  Temp scratch = scratch_begin(&arena, 1);
+
+  String8 work_dir = os_get_current_path(scratch.arena);
+
+  // absolute obj path -> obj index, for objs that carry .debug$P
+  HashTable *debug_p_ht = hash_table_init(scratch.arena, obj_count);
+  // per obj: LF_ENDPRECOMP header to neutralize once all checks are done (0 when unused)
+  CV_LeafHeader **endprecomp_arr = push_array(scratch.arena, CV_LeafHeader *, obj_count);
+
+  for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) {
+    CV_DebugT *debug_p = &debug_p_arr[obj_idx];
+    CV_DebugT *debug_t = &debug_t_arr[obj_idx];
+
+    if (debug_t->count && debug_p->count) {
+      lnk_error_obj(LNK_Warning_MultipleDebugTAndDebugP,
+                    &obj_arr[obj_idx],
+                    "multiple sections with debug types detected, obj must have either .debug$T or .debug$P (using .debug$T for type server)");
+      continue;
+    }
+
+    if (debug_p->count) {
+      String8 obj_path = obj_arr[obj_idx].path;
+      obj_path = path_absolute_dst_from_relative_dst_src(scratch.arena, obj_path, work_dir);
+      if (hash_table_search_path(debug_p_ht, obj_path)) {
+        lnk_error_obj(LNK_Warning_DuplicateObjPath, &obj_arr[obj_idx], "duplicate obj path %S", obj_path);
+      } else {
+        hash_table_push_path_u64(scratch.arena, debug_p_ht, obj_path, obj_idx);
+      }
+    }
+  }
+
+  LNK_PchInfo* pch_arr = push_array_no_zero(arena, LNK_PchInfo, obj_count);
+  for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) {
+    CV_DebugT debug_t = debug_t_arr[obj_idx];
+    if (cv_debug_t_is_pch(debug_t)) {
+      CV_Leaf precomp_leaf = cv_debug_t_get_leaf(debug_t, 0);
+      CV_PrecompInfo precomp = cv_precomp_info_from_leaf(precomp_leaf);
+
+      String8 obj_path = path_absolute_dst_from_relative_dst_src(scratch.arena, precomp.obj_name, work_dir);
+
+      // map obj name in LF_PRECOMP to obj index
+      U64 debug_p_obj_idx = 0;
+      if (!hash_table_search_path_u64(debug_p_ht, obj_path, &debug_p_obj_idx)) {
+        lnk_error_obj(LNK_Error_PrecompObjNotFound, &obj_arr[obj_idx], "LF_PRECOMP references non-existent obj %S", obj_path);
+        lnk_exit(LNK_Error_PrecompObjNotFound);
+      }
+
+      CV_DebugT debug_p = debug_p_arr[debug_p_obj_idx];
+
+      // error check LF_PRECOMP
+      if (precomp.start_index > CV_MinComplexTypeIndex) {
+        lnk_error_obj(LNK_Warning_AtypicalStartIndex, &obj_arr[obj_idx], "atypical start index 0x%X in LF_PRECOMP", precomp.start_index);
+      }
+      if (precomp.start_index < CV_MinComplexTypeIndex) {
+        lnk_error_obj(LNK_Error_InvalidStartIndex, &obj_arr[obj_idx], "invalid start index 0x%X in LF_PRECOMP; must be >= 0x%X", precomp.start_index, CV_MinComplexTypeIndex);
+      }
+      if (precomp.leaf_count > debug_p.count) {
+        lnk_error_obj(LNK_Error_InvalidPrecompLeafCount, &obj_arr[obj_idx], "leaf count %u LF_PRECOMP exceeds leaf count %u in .debug$P in %S", precomp.leaf_count, debug_p.count, obj_arr[debug_p_obj_idx].path);
+      }
+
+      // LF_ENDPRECOMP is looked up at leaf index precomp.leaf_count, so that index has to be
+      // inside .debug$P before the leaf is fetched (leaf_count == count is out of range too)
+      if (precomp.leaf_count >= debug_p.count) {
+        lnk_error_obj(LNK_Error_EndprecompNotFound, &obj_arr[obj_idx], "unable to find LF_ENDPRECOMP @ 0x%X in %S", precomp.leaf_count, obj_arr[debug_p_obj_idx].path);
+        lnk_exit(LNK_Error_EndprecompNotFound);
+      }
+
+      // get LF_ENDPRECOMP
+      CV_Leaf endprecomp_leaf = cv_debug_t_get_leaf(debug_p, precomp.leaf_count);
+      CV_LeafEndPreComp *endprecomp = (CV_LeafEndPreComp*) endprecomp_leaf.data.str;
+
+      // error check LF_ENDPRECOMP
+      if (endprecomp_leaf.kind != CV_LeafKind_ENDPRECOMP) {
+        lnk_error_obj(LNK_Error_EndprecompNotFound, &obj_arr[obj_idx], "unable to find LF_ENDPRECOMP @ 0x%X in %S", precomp.leaf_count, obj_arr[debug_p_obj_idx].path);
+      }
+      if (endprecomp_leaf.data.size != sizeof(CV_LeafEndPreComp)) {
+        lnk_error_obj(LNK_Error_IllData, &obj_arr[obj_idx], "invalid size 0x%X for LF_ENDPRECOMP", endprecomp_leaf.data.size);
+      }
+      // the signature is only read when the record is large enough to contain it
+      if (endprecomp_leaf.data.size >= sizeof(CV_LeafEndPreComp) && endprecomp->sig != precomp.sig) {
+        lnk_error_obj(LNK_Error_PrecompSigMismatch, &obj_arr[obj_idx], "signature mismatch between LF_PRECOMP(0x%X) and LF_ENDPRECOMP(0x%X); precomp obj %S", precomp.sig, endprecomp->sig, obj_arr[debug_p_obj_idx].path);
+      }
+      { // check against S_OBJNAME sig in precompiled obj $$SYMBOLS
+        // lnk_make_code_view_input stores count == 0 and v == 0 for objs without symbol
+        // sub-sections, so v[0] may only be read when count is non-zero
+        CV_SymbolList symbol_list = {0};
+        if (parsed_symbols[debug_p_obj_idx].count > 0) {
+          symbol_list = parsed_symbols[debug_p_obj_idx].v[0];
+        }
+        if (symbol_list.count) {
+          CV_ObjInfo obj_info = cv_obj_info_from_symbol(symbol_list.first->data);
+          if (obj_info.sig != 0 && obj_info.sig != precomp.sig) {
+            // %S takes the String8 by value, like every other call in this function
+            lnk_error_obj(LNK_Error_PrecompSigMismatch, &obj_arr[obj_idx], "signature mismatch between LF_PRECOMP(0x%X) and S_OBJNAME(0x%X) in %S", precomp.sig, obj_info.sig, obj_arr[debug_p_obj_idx].path);
+          }
+        } else {
+          lnk_error_obj(LNK_Warning_PrecompObjSymbolsNotFound, &obj_arr[obj_idx], "symbols not found, unable to check LF_PRECOMP signature against S_OBJ");
+        }
+      }
+
+      // see :pch_check
+      LNK_PchInfo *pch = &pch_arr[obj_idx];
+      pch->ti_lo = precomp.start_index;
+      pch->ti_hi = precomp.start_index + precomp.leaf_count;
+      pch->debug_p_obj_idx = debug_p_obj_idx;
+
+      // [start_index, start_index+type_index_count)
+      // drop the leading LF_PRECOMP leaf from this obj's types
+      debug_t_arr[obj_idx].count -= 1;
+      debug_t_arr[obj_idx].v += 1;
+
+      // removal is deferred to the loop below so that other objs referencing the same
+      // .debug$P still find LF_ENDPRECOMP intact
+      endprecomp_arr[debug_p_obj_idx] = cv_debug_t_get_leaf_header(debug_p, precomp.leaf_count);
+    } else {
+      LNK_PchInfo *pch = &pch_arr[obj_idx];
+      pch->ti_lo = CV_MinComplexTypeIndex;
+      pch->ti_hi = CV_MinComplexTypeIndex;
+      pch->debug_p_obj_idx = 0; // :null_obj
+    }
+  }
+
+  // remove LF_ENDPRECOMP
+  for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) {
+    if (endprecomp_arr[obj_idx]) {
+      endprecomp_arr[obj_idx]->kind = CV_LeafKind_NOTYPE;
+      endprecomp_arr[obj_idx]->size = sizeof(CV_LeafKind);
+    }
+  }
+
+  scratch_end(scratch);
+  return pch_arr;
+}
+
+internal void
+lnk_do_debug_info_discard(CV_DebugS *debug_s_arr, CV_SymbolListArray *parsed_symbols, U64 obj_idx)
+{
+  // remove symbols
+  for (U64 i = 0; i < parsed_symbols[obj_idx].count; ++i) {
+    MemoryZeroStruct(&parsed_symbols[obj_idx].v[i]);
+  }
+
+  // remove inline sites
+  String8List *inlineelines_ptr = cv_sub_section_ptr_from_debug_s(&debug_s_arr[obj_idx], CV_C13SubSectionKind_InlineeLines);
+  MemoryZeroStruct(inlineelines_ptr);
+}
+
+// Thread-pool task: opens one external type server (PDB) and extracts its TPI and IPI leaves.
+//
+// task_id is the type server index. On success is_corrupted[ts_idx] is cleared and
+// external_ti_ranges[ts_idx] / external_leaves[ts_idx] (arrays of CV_TypeIndexSource_COUNT
+// entries) hold the TPI and IPI type index ranges and parsed leaves. On any failure
+// is_corrupted[ts_idx] stays 1 and both arrays are left zeroed.
+internal
+THREAD_POOL_TASK_FUNC(lnk_get_external_leaves_task)
+{
+  ProfBeginFunction();
+
+  LNK_GetExternalLeavesTask *task = raw_task;
+
+  U64 ts_idx = task_id;
+
+  // zero-initialized up front: previously the "MSF parsed but TPI/IPI failed" path left
+  // both arrays uninitialized, while only the "MSF failed" path cleared them
+  task->external_ti_ranges[ts_idx] = push_array(arena, Rng1U64, CV_TypeIndexSource_COUNT);
+  task->external_leaves[ts_idx] = push_array(arena, CV_DebugT, CV_TypeIndexSource_COUNT);
+  task->is_corrupted[ts_idx] = 1;
+
+  // TODO: pick TPI and IPI to flatten to make sure we don't waste compute on throw-away streams
+  MSF_Parsed *msf_parse = msf_parsed_from_data(arena, task->msf_data_arr[ts_idx]);
+
+  if (msf_parse) {
+    PDB_TypeServerParse tpi_parse, ipi_parse;
+    PDB_OpenTypeServerError tpi_error = pdb_type_server_parse_from_data(msf_parse->streams[PDB_FixedStream_Tpi], &tpi_parse);
+    PDB_OpenTypeServerError ipi_error = pdb_type_server_parse_from_data(msf_parse->streams[PDB_FixedStream_Ipi], &ipi_parse);
+
+    if (tpi_error == PDB_OpenTypeServerError_OK &&
+        ipi_error == PDB_OpenTypeServerError_OK) {
+      task->is_corrupted[ts_idx] = 0;
+
+      task->external_ti_ranges[ts_idx][CV_TypeIndexSource_NULL] = rng_1u64(0,0);
+      task->external_ti_ranges[ts_idx][CV_TypeIndexSource_TPI ] = tpi_parse.ti_range;
+      task->external_ti_ranges[ts_idx][CV_TypeIndexSource_IPI ] = ipi_parse.ti_range;
+
+      MemoryZeroStruct(&task->external_leaves[ts_idx][CV_TypeIndexSource_NULL]);
+      task->external_leaves[ts_idx][CV_TypeIndexSource_TPI] = cv_debug_t_from_data(arena, tpi_parse.leaf_data, PDB_LEAF_ALIGN);
+      task->external_leaves[ts_idx][CV_TypeIndexSource_IPI] = cv_debug_t_from_data(arena, ipi_parse.leaf_data, PDB_LEAF_ALIGN);
+    } else {
+      if (tpi_error != PDB_OpenTypeServerError_OK) {
+        lnk_error(LNK_Error_UnableToOpenTypeServer, "failed to open TPI in %S, reason %S", task->path_arr[ts_idx], pdb_string_from_open_type_server_error(tpi_error));
+      }
+      if (ipi_error != PDB_OpenTypeServerError_OK) {
+        lnk_error(LNK_Error_UnableToOpenTypeServer, "failed to open IPI in %S, reason %S", task->path_arr[ts_idx], pdb_string_from_open_type_server_error(ipi_error));
+      }
+    }
+  }
+
+  ProfEnd();
+}
+
+// Builds one CV_DebugT per obj from its .debug$T and .debug$P leaves.
+//
+// Entry i of the result (allocated from arena) is a copy of debug_t_arr[i] when that has
+// leaves, otherwise a copy of debug_p_arr[i] when that has leaves, otherwise zero.
+// An obj that carries both sections has already been reported by lnk_setup_pch with a
+// warning stating that .debug$T is used; .debug$T therefore takes precedence here instead
+// of asserting on that input.
+internal CV_DebugT *
+lnk_merge_debug_t_and_debug_p(Arena *arena, U64 obj_count, CV_DebugT *debug_t_arr, CV_DebugT *debug_p_arr)
+{
+  CV_DebugT *result = push_array_no_zero(arena, CV_DebugT, obj_count);
+  for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) {
+    CV_DebugT *debug_p = &debug_p_arr[obj_idx];
+    CV_DebugT *debug_t = &debug_t_arr[obj_idx];
+    if (debug_t->count) {
+      result[obj_idx] = *debug_t;
+    } else if (debug_p->count) {
+      result[obj_idx] = *debug_p;
+    } else {
+      MemoryZeroStruct(&result[obj_idx]);
+    }
+  }
+  return result;
+}
+
+internal LNK_CodeViewInput
+lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, String8List lib_dir_list, LNK_ObjList obj_list)
+{
+  ProfBegin("Extract CodeView");
+  Temp scratch = scratch_begin(0,0);
+
+  // obj list -> array
+  U64 obj_count = obj_list.count;
+  LNK_Obj **obj_arr = lnk_obj_arr_from_list(tp_arena->v[0], obj_list);
+
+  // gather debug info sections from objs
+  ProfBegin("Collect CodeView");
+  // TODO: fix memory leak, we need a Temp wrapper for pool arena
+  B32 collect_discarded_flag = 0;
+  LNK_ChunkList *debug_s_list_arr = lnk_collect_obj_chunks(tp, tp_arena, obj_count, obj_arr, str8_lit(".debug"), str8_lit("S"), collect_discarded_flag);
+  LNK_ChunkList *debug_p_list_arr = lnk_collect_obj_chunks(tp, tp_arena, obj_count, obj_arr, str8_lit(".debug"), str8_lit("P"), collect_discarded_flag);
+
LNK_ChunkList *debug_t_list_arr = lnk_collect_obj_chunks(tp, tp_arena, obj_count, obj_arr, str8_lit(".debug"), str8_lit("T"), collect_discarded_flag); + ProfEnd(); + + if (lnk_get_log_status(LNK_Log_Debug) || PROFILE_TELEMETRY) { + U64 total_debug_s_size = 0; + U64 total_debug_t_size = 0; + U64 total_debug_p_size = 0; + for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) { + for (LNK_ChunkNode *chunk = debug_s_list_arr[obj_idx].first; chunk != 0; chunk = chunk->next) { + total_debug_s_size += chunk->data->u.leaf.size; + } + for (LNK_ChunkNode *chunk = debug_t_list_arr[obj_idx].first; chunk != 0; chunk = chunk->next) { + total_debug_t_size += chunk->data->u.leaf.size; + } + for (LNK_ChunkNode *chunk = debug_p_list_arr[obj_idx].first; chunk != 0; chunk = chunk->next) { + total_debug_p_size += chunk->data->u.leaf.size; + } + } + String8 total_debug_s_size_string = str8_from_memory_size2(scratch.arena, total_debug_s_size); + String8 total_debug_t_size_string = str8_from_memory_size2(scratch.arena, total_debug_t_size); + String8 total_debug_p_size_string = str8_from_memory_size2(scratch.arena, total_debug_p_size); + if (lnk_get_log_status(LNK_Log_Debug)) { + lnk_log(LNK_Log_Debug, "[Total .debug$S Input Size %S]", total_debug_s_size_string); + lnk_log(LNK_Log_Debug, "[Total .debug$T Input Size %S]", total_debug_t_size_string); + lnk_log(LNK_Log_Debug, "[Total .debug$P Input Size %S]", total_debug_p_size_string); + } +#if PROFILE_TELEMETRY + tmMessage(0, TMMF_ICON_NOTE, "Total .debug$S Input Size: %.*s", str8_varg(total_debug_s_size_string)); + tmMessage(0, TMMF_ICON_NOTE, "Total .debug$T Input Size: %.*s", str8_varg(total_debug_t_size_string)); + tmMessage(0, TMMF_ICON_NOTE, "Total .debug$P Input Size: %.*s", str8_varg(total_debug_p_size_string)); +#endif + } + + // TODO: temp hack, remove when we have null obj with .debug$T + { + String8 raw_null_leaf = cv_serialize_leaf_ex(scratch.arena, CV_LeafKind_NOTYPE, str8(0,0), 1); + + String8List srl = {0}; + 
str8_serial_begin(scratch.arena, &srl); + str8_serial_push_u32(scratch.arena, &srl, CV_Signature_C13); + str8_serial_push_string(scratch.arena, &srl, raw_null_leaf); + String8 null_debug_data = str8_serial_end(tp_arena->v[0], &srl); + + LNK_Chunk *null_chunk = push_array(tp_arena->v[0], LNK_Chunk, 1); + null_chunk->type = LNK_Chunk_Leaf; + null_chunk->u.leaf = null_debug_data; + lnk_chunk_list_push(tp_arena->v[0], &debug_t_list_arr[0], null_chunk); + } + + ProfBegin("Parse CodeView"); + CV_DebugS *debug_s_arr = lnk_parse_debug_s_sections(tp, tp_arena, obj_count, obj_arr, debug_s_list_arr); + CV_DebugT *debug_p_arr = lnk_parse_debug_t_sections(tp, tp_arena, obj_count, obj_arr, debug_p_list_arr); + CV_DebugT *debug_t_arr = lnk_parse_debug_t_sections(tp, tp_arena, obj_count, obj_arr, debug_t_list_arr); + ProfEnd(); + + ProfBegin("Sort Type Servers"); + + U64 external_count = 0, internal_count = 0; + LNK_Obj *sorted_obj_arr = push_array_no_zero(tp_arena->v[0], LNK_Obj, obj_count); + CV_DebugS *sorted_debug_s_arr = push_array_no_zero(tp_arena->v[0], CV_DebugS, obj_count); + CV_DebugT *sorted_debug_t_arr = push_array_no_zero(tp_arena->v[0], CV_DebugT, obj_count); + CV_DebugT *sorted_debug_p_arr = push_array_no_zero(tp_arena->v[0], CV_DebugT, obj_count); + for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) { + B32 is_type_server = cv_debug_t_is_type_server(debug_t_arr[obj_idx]); + if (is_type_server) { + Assert(internal_count + external_count < obj_count); + U64 slot_idx = (obj_count - external_count - 1); + ++external_count; + + // TODO: report error: somehow obj was compiled with /Zi and /Yc + Assert(debug_p_arr[obj_idx].count == 0); + + sorted_obj_arr[slot_idx] = *obj_arr[obj_idx]; + sorted_debug_s_arr[slot_idx] = debug_s_arr[obj_idx]; + sorted_debug_t_arr[slot_idx] = debug_t_arr[obj_idx]; + MemoryZeroStruct(&sorted_debug_p_arr[slot_idx]); + } else { + Assert(internal_count + external_count < obj_count); + U64 slot_idx = internal_count; + ++internal_count; + + 
sorted_obj_arr[slot_idx] = *obj_arr[obj_idx]; + sorted_debug_s_arr[slot_idx] = debug_s_arr[obj_idx]; + sorted_debug_t_arr[slot_idx] = debug_t_arr[obj_idx]; + sorted_debug_p_arr[slot_idx] = debug_p_arr[obj_idx]; + } + } + + ProfEnd(); + + // setup pointers to arrays + LNK_Obj *internal_obj_arr = sorted_obj_arr; + LNK_Obj *external_obj_arr = sorted_obj_arr + internal_count; + CV_DebugS *internal_debug_s_arr = sorted_debug_s_arr; + CV_DebugS *external_debug_s_arr = sorted_debug_s_arr + internal_count; + CV_DebugT *internal_debug_t_arr = sorted_debug_t_arr; + CV_DebugT *external_debug_t_arr = sorted_debug_t_arr + internal_count; + CV_DebugT *internal_debug_p_arr = sorted_debug_p_arr; + CV_DebugT *external_debug_p_arr = sorted_debug_p_arr + internal_count; + + ProfBegin("Parse Symbols"); + + ProfBegin("Count Symbol Inputs"); + U64 internal_total_symbol_input_count = 0; + U64 external_total_symbol_input_count = 0; + for (U64 obj_idx = 0; obj_idx < internal_count; ++obj_idx) { + String8List raw_symbols = cv_sub_section_from_debug_s(internal_debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols); + internal_total_symbol_input_count += raw_symbols.node_count; + } + for (U64 obj_idx = 0; obj_idx < external_count; ++obj_idx) { + String8List raw_symbols = cv_sub_section_from_debug_s(external_debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols); + external_total_symbol_input_count += raw_symbols.node_count; + } + ProfEnd(); + + ProfBegin("Prepare Symbol Inputs"); + U64 total_symbol_input_count = internal_total_symbol_input_count + external_total_symbol_input_count; + LNK_CodeViewSymbolsInput *symbol_inputs = push_array_no_zero(tp_arena->v[0], LNK_CodeViewSymbolsInput, total_symbol_input_count); + CV_SymbolListArray *parsed_symbols = push_array_no_zero(tp_arena->v[0], CV_SymbolListArray, obj_count); + { + CV_SymbolList *reserved_lists = push_array(tp_arena->v[0], CV_SymbolList, total_symbol_input_count); + for (U64 obj_idx = 0, input_idx = 0; obj_idx < obj_count; ++obj_idx) { + 
String8List raw_symbols = cv_sub_section_from_debug_s(sorted_debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols); + + // init parse output + if (raw_symbols.node_count > 0) { + parsed_symbols[obj_idx].count = raw_symbols.node_count; + parsed_symbols[obj_idx].v = reserved_lists + input_idx; + } else { + parsed_symbols[obj_idx].count = 0; + parsed_symbols[obj_idx].v = 0; + } + + // init worker input + for (String8Node *data_n = raw_symbols.first; data_n != 0; data_n = data_n->next, ++input_idx) { + Assert(input_idx < total_symbol_input_count); + LNK_CodeViewSymbolsInput *in = &symbol_inputs[input_idx]; + in->obj_idx = obj_idx; + in->symbol_list = &reserved_lists[input_idx]; + in->raw_symbols = data_n->string; + } + } + } + ProfEnd(); + + ProfBegin("Symbol Parse"); + LNK_ParseCVSymbolsTaskData task = {0}; + task.inputs = symbol_inputs; + tp_for_parallel(tp, tp_arena, total_symbol_input_count, lnk_parse_cv_symbols_task, &task); + ProfEnd(); + + // TODO: do we rely on this behaviour? + // + // :zero_out_symbol_sub_section + ProfBegin("Zero-out Symbols Sub-sections"); + for (U64 i = 0; i < obj_count; ++i) { + CV_DebugS *debug_s = &sorted_debug_s_arr[i]; + String8List *symbols_ptr = cv_sub_section_ptr_from_debug_s(debug_s, CV_C13SubSectionKind_Symbols); + MemoryZeroStruct(symbols_ptr); + } + ProfEnd(); + + ProfEnd(); + + CV_SymbolListArray *internal_parsed_symbols = parsed_symbols; + CV_SymbolListArray *external_parsed_symbols = parsed_symbols + internal_count; + LNK_CodeViewSymbolsInput *internal_symbol_inputs = symbol_inputs; + LNK_CodeViewSymbolsInput *external_symbol_inputs = symbol_inputs + internal_count; + + LNK_PchInfo *pch_arr = lnk_setup_pch(tp_arena->v[0], + internal_count, + internal_obj_arr, + internal_debug_t_arr, + internal_debug_p_arr, + internal_parsed_symbols); + + CV_DebugT *merged_debug_t_p_arr = lnk_merge_debug_t_and_debug_p(tp_arena->v[0], internal_count, internal_debug_t_arr, internal_debug_p_arr); + + ProfBegin("Analyze & Read External Type Server 
Files"); + String8Array type_server_path_arr; + Rng1U64 **external_ti_ranges; + CV_DebugT **external_leaves; + U64 *obj_to_ts_idx_arr = push_array_no_zero(tp_arena->v[0], U64, external_count); + U64List *ts_to_obj_arr = push_array(tp_arena->v[0], U64List, external_count); + { + HashTable *type_server_path_ht = hash_table_init(scratch.arena, 256); + HashTable *ignored_path_ht = hash_table_init(scratch.arena, 256); + String8List type_server_path_list; MemoryZeroStruct(&type_server_path_list); + + // push null + str8_list_pushf(scratch.arena, &type_server_path_list, ""); + + for (U64 obj_idx = 0; obj_idx < external_count; ++obj_idx) { + // first leaf always type server + CV_DebugT debug_t = external_debug_t_arr[obj_idx]; + CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, 0); + CV_TypeServerInfo ts = cv_type_server_info_from_leaf(leaf); + + // search disk for type server + String8List match_list = os_file_search(scratch.arena, lib_dir_list, ts.name); + + // chop file name from path and search on it + // + // TODO: check if ts.name is a path and in that case do file search + if (match_list.node_count == 0) { + String8 file_name = str8_skip_last_slash(ts.name); + match_list = os_file_search(scratch.arena, lib_dir_list, file_name); + } + + B32 do_debug_info_discard = 0; + + // too many matches? + if (match_list.node_count > 1) { + if (!hash_table_search_path(ignored_path_ht, ts.name)) { + hash_table_push_path_u64(scratch.arena, ignored_path_ht, ts.name, 0); + lnk_error_obj(LNK_Warning_MultipleExternalTypeServers, obj_arr[obj_idx], "located multiple external type servers:"); + lnk_supplement_error_list(match_list); + } + do_debug_info_discard = 1; + } + // no match? 
+ else if (match_list.node_count == 0) { + if (!hash_table_search_path(ignored_path_ht, ts.name)) { + hash_table_push_string_u64(scratch.arena, ignored_path_ht, ts.name, 0); + lnk_error_obj(LNK_Warning_MissingExternalTypeServer, obj_arr[obj_idx], "unable to open external type server %S", ts.name); + } + do_debug_info_discard = 1; + } + + // external type server is missing, discard parts of debug info that need types + if (do_debug_info_discard) { + lnk_do_debug_info_discard(external_debug_s_arr, external_parsed_symbols, obj_idx); + continue; + } + + String8 path = match_list.first->string; + { + struct HT_Value { + CV_TypeServerInfo ts; + LNK_Obj *obj; + U64 ts_idx; + }; + + // was this type server queued? + KeyValuePair *is_path_queued = hash_table_search_path(type_server_path_ht, path); + if (is_path_queued) { + struct HT_Value *present = is_path_queued->value_raw; + + // make sure type servers sigs match + if (MemoryMatchStruct(&ts.sig, &present->ts.sig)) { + // wire obj to type server data + obj_to_ts_idx_arr[obj_idx] = present->ts_idx; + + // wire type server to obj + u64_list_push(tp_arena->v[0], &ts_to_obj_arr[present->ts_idx], obj_idx); + } else { + lnk_error_obj(LNK_Error_ExternalTypeServerConflict, + obj_arr[obj_idx], + "external type server signature conflicts with type server loaded from '%S'", + present->obj->path); + } + } else { + U64 ts_idx = type_server_path_list.node_count; + path = push_str8_copy(tp_arena->v[0], path); + str8_list_push(scratch.arena, &type_server_path_list, path); + + // wire obj to type server + obj_to_ts_idx_arr[obj_idx] = ts_idx; + + // wire type server to obj + u64_list_push(tp_arena->v[0], &ts_to_obj_arr[ts_idx], obj_idx); + + // fill out value + struct HT_Value *value = push_array(scratch.arena, struct HT_Value, 1); + value->ts = ts; + value->obj = obj_arr[obj_idx]; + value->ts_idx = ts_idx; + + // update hash table + hash_table_push_path_raw(scratch.arena, type_server_path_ht, path, value); + } + } + } + + // read type 
servers from disk in parallel + type_server_path_arr = str8_array_from_list(tp_arena->v[0], &type_server_path_list); + + { + ProfBegin("Read External Type Servers"); + String8Array msf_data_arr = os_data_from_file_path_parallel(tp, scratch.arena, type_server_path_arr); + ProfEnd(); + + ProfBeginDynamic("Open External Type Servers [Count %llu]", type_server_path_arr.count); + LNK_GetExternalLeavesTask task; + task.path_arr = type_server_path_arr.v; + task.msf_data_arr = msf_data_arr.v; + task.external_ti_ranges = push_array_no_zero(tp_arena->v[0], Rng1U64 *, msf_data_arr.count); + task.external_leaves = push_array_no_zero(tp_arena->v[0], CV_DebugT *, msf_data_arr.count); + task.is_corrupted = push_array_no_zero(scratch.arena, B8, msf_data_arr.count); + tp_for_parallel(tp, tp_arena, msf_data_arr.count, lnk_get_external_leaves_task, &task); + ProfEnd(); + + String8List unopen_type_server_list; MemoryZeroStruct(&unopen_type_server_list); + + // discard debug info that depends on the missing type server + for (U64 ts_idx = 1; ts_idx < msf_data_arr.count; ++ts_idx) { + if (task.is_corrupted[ts_idx]) { + U64List obj_idx_list = ts_to_obj_arr[ts_idx]; + for (U64Node *node = obj_idx_list.first; node != 0; node = node->next) { + lnk_do_debug_info_discard(external_debug_s_arr, external_parsed_symbols, node->data); + } + } + } + + // format error + for (U64 ts_idx = 1; ts_idx < msf_data_arr.count; ++ts_idx) { + if (task.is_corrupted[ts_idx]) { + U64List obj_idx_list = ts_to_obj_arr[ts_idx]; + str8_list_pushf(scratch.arena, &unopen_type_server_list, "\t%S\n", type_server_path_arr.v[ts_idx]); + str8_list_pushf(scratch.arena, &unopen_type_server_list, "\t\tDependent obj(s):\n"); + for (U64Node *obj_idx_node = obj_idx_list.first; obj_idx_node != 0; obj_idx_node = obj_idx_node->next) { + String8 obj_path = external_obj_arr[obj_idx_node->data].path; + str8_list_pushf(scratch.arena, &unopen_type_server_list, "\t\t\t%S\n", obj_path); + } + } + } + if 
(unopen_type_server_list.node_count) { + String8List error_msg_list = { 0 }; + str8_list_pushf(scratch.arena, &error_msg_list, "unable to open external type server(s):\n"); + str8_list_concat_in_place(&error_msg_list, &unopen_type_server_list); + String8 error_msg = str8_list_join(scratch.arena, &error_msg_list, 0); + lnk_error(LNK_Error_UnableToOpenTypeServer, "%S", error_msg); + } + + // output + external_ti_ranges = task.external_ti_ranges; + external_leaves = task.external_leaves; + } + } + ProfEnd(); + + // fill out result + LNK_CodeViewInput cv = {0}; + cv.count = obj_count; + cv.internal_count = internal_count; + cv.external_count = external_count; + cv.type_server_count = type_server_path_arr.count; + cv.type_server_path_arr = type_server_path_arr.v; + cv.ts_to_obj_arr = ts_to_obj_arr; + cv.obj_arr = sorted_obj_arr; + cv.pch_arr = pch_arr; + cv.debug_s_arr = sorted_debug_s_arr; + cv.debug_p_arr = sorted_debug_p_arr; + cv.debug_t_arr = sorted_debug_t_arr; + cv.merged_debug_t_p_arr = merged_debug_t_p_arr; + cv.total_symbol_input_count = total_symbol_input_count; + cv.symbol_inputs = symbol_inputs; + cv.parsed_symbols = parsed_symbols; + cv.internal_obj_arr = internal_obj_arr; + cv.external_obj_arr = external_obj_arr; + cv.internal_debug_s_arr = internal_debug_s_arr; + cv.external_debug_s_arr = external_debug_s_arr; + cv.internal_debug_t_arr = internal_debug_t_arr; + cv.external_debug_t_arr = external_debug_t_arr; + cv.internal_debug_p_arr = internal_debug_p_arr; + cv.external_debug_p_arr = external_debug_p_arr; + cv.internal_total_symbol_input_count = internal_total_symbol_input_count; + cv.internal_symbol_inputs = internal_symbol_inputs; + cv.internal_parsed_symbols = internal_parsed_symbols; + cv.external_total_symbol_input_count = external_total_symbol_input_count; + cv.external_symbol_inputs = external_symbol_inputs; + cv.external_parsed_symbols = external_parsed_symbols; + cv.external_ti_ranges = external_ti_ranges; + cv.external_leaves = 
external_leaves; + cv.external_obj_to_ts_idx_arr = obj_to_ts_idx_arr; + cv.external_obj_range = rng_1u64(internal_count, internal_count + external_count); + + scratch_end(scratch); + ProfEnd(); + return cv; +} + +//////////////////////////////// +// Leaf Deduper + + +internal LNK_LeafRef +lnk_leaf_ref(U32 enc_loc_idx, U32 enc_leaf_idx) +{ + LNK_LeafRef ref; + ref.enc_loc_idx = enc_loc_idx; + ref.enc_leaf_idx = enc_leaf_idx; + return ref; +} + +internal LNK_LeafRef +lnk_obj_leaf_ref(U32 obj_idx, U32 leaf_idx) +{ + return lnk_leaf_ref(obj_idx, leaf_idx); +} + +internal LNK_LeafRef +lnk_ts_leaf_ref(CV_TypeIndexSource ti_source, U32 ts_idx, U32 leaf_idx) +{ + ts_idx |= LNK_LeafRefFlag_LocIdxExternal; + + if (ti_source == CV_TypeIndexSource_IPI) { + leaf_idx |= LNK_LeafRefFlag_LeafIdxIPI; + } + + return lnk_leaf_ref(ts_idx, leaf_idx); +} + +internal int +lnk_leaf_ref_compare(LNK_LeafRef a, LNK_LeafRef b) +{ + int cmp = 0; + if (a.enc_loc_idx < b.enc_loc_idx) { + cmp = -1; + } else if (a.enc_loc_idx > b.enc_loc_idx) { + cmp = +1; + } else { + if (a.enc_leaf_idx < b.enc_leaf_idx) { + cmp = -1; + } else if (a.enc_leaf_idx > b.enc_leaf_idx) { + cmp = +1; + } + } + return cmp; +} + +internal int +lnk_leaf_ref_is_before(void *raw_a, void *raw_b) +{ + LNK_LeafRef **a = raw_a; + LNK_LeafRef **b = raw_b; + int is_before; + if ((*a)->enc_loc_idx == (*b)->enc_loc_idx) { + is_before = (*a)->enc_leaf_idx < (*b)->enc_leaf_idx; + } else { + is_before = (*a)->enc_loc_idx < (*b)->enc_loc_idx; + } + return is_before; +} + +internal LNK_LeafLocType +lnk_loc_type_from_leaf_ref(LNK_LeafRef leaf_ref) +{ + if (leaf_ref.enc_loc_idx & LNK_LeafRefFlag_LocIdxExternal) { + return LNK_LeafLocType_External; + } + return LNK_LeafLocType_Internal; +} + +internal LNK_LeafLocType +lnk_loc_type_from_obj_idx(LNK_CodeViewInput *input, U64 obj_idx) +{ + if (input->external_obj_range.min <= obj_idx && obj_idx < input->external_obj_range.max) { + return LNK_LeafLocType_External; + } + return 
LNK_LeafLocType_Internal; +} + +internal U64 +lnk_loc_idx_from_obj_idx(LNK_CodeViewInput *input, U64 obj_idx) +{ + if (input->external_obj_range.min <= obj_idx && obj_idx < input->external_obj_range.max) { + return input->external_obj_to_ts_idx_arr[obj_idx - input->external_obj_range.min]; + } + return obj_idx; +} + +internal CV_TypeIndex +lnk_ti_lo_from_leaf_ref(LNK_CodeViewInput *input, LNK_LeafRef leaf_ref) +{ + CV_TypeIndex ti_lo; + + LNK_LeafLocType loc_type = lnk_loc_type_from_leaf_ref(leaf_ref); + switch (loc_type) { + case LNK_LeafLocType_Internal: { + ti_lo = CV_MinComplexTypeIndex; + } break; + case LNK_LeafLocType_External: { + U64 ts_idx = leaf_ref.enc_loc_idx & ~LNK_LeafRefFlag_LocIdxExternal; + CV_TypeIndexSource ti_source = (leaf_ref.enc_loc_idx & LNK_LeafRefFlag_LeafIdxIPI) ? CV_TypeIndexSource_IPI : CV_TypeIndexSource_TPI; + ti_lo = input->external_ti_ranges[ts_idx][ti_source].min; + } break; + default: ti_lo = 0; break; + } + + return ti_lo; +} + +internal CV_TypeIndex +lnk_ti_lo_from_loc(LNK_CodeViewInput *input, LNK_LeafLocType loc_type, U64 loc_idx, CV_TypeIndexSource ti_source) +{ + CV_TypeIndex ti_lo = 0; + if (loc_type == LNK_LeafLocType_Internal) { + ti_lo = CV_MinComplexTypeIndex; + } else if (loc_type == LNK_LeafLocType_External) { + ti_lo = input->external_ti_ranges[loc_idx][ti_source].min; + } + return ti_lo; +} + +internal String8 +lnk_data_from_leaf_ref(LNK_CodeViewInput *input, LNK_LeafRef leaf_ref) +{ + String8 data; + + LNK_LeafLocType loc_type = lnk_loc_type_from_leaf_ref(leaf_ref); + switch (loc_type) { + case LNK_LeafLocType_Internal: { + U32 obj_idx = leaf_ref.enc_loc_idx & ~LNK_LeafRefFlag_LocIdxExternal; + U32 leaf_idx = leaf_ref.enc_leaf_idx; + CV_DebugT debug_t = input->merged_debug_t_p_arr[obj_idx]; + data = cv_debug_t_get_raw_leaf(debug_t, leaf_idx); + } break; + + case LNK_LeafLocType_External: { + U64 ts_idx = leaf_ref.enc_loc_idx & ~LNK_LeafRefFlag_LocIdxExternal; + U64 leaf_idx = leaf_ref.enc_leaf_idx & 
~LNK_LeafRefFlag_LeafIdxIPI; + CV_TypeIndexSource ti_source = leaf_ref.enc_leaf_idx & LNK_LeafRefFlag_LeafIdxIPI ? CV_TypeIndexSource_IPI : CV_TypeIndexSource_TPI; + CV_DebugT debug_t = input->external_leaves[ts_idx][ti_source]; + data = cv_debug_t_get_raw_leaf(debug_t, leaf_idx); + } break; + + default: data = str8(0,0); break; + } + + return data; +} + +internal CV_TypeIndex +lnk_type_index_from_leaf_ref(LNK_CodeViewInput *input, LNK_LeafRef leaf_ref) +{ + CV_TypeIndex type_index = 0; + LNK_LeafLocType loc_type = lnk_loc_type_from_leaf_ref(leaf_ref); + switch (loc_type) { + case LNK_LeafLocType_Internal: { + LNK_PchInfo pch_info = input->pch_arr[leaf_ref.enc_loc_idx]; + type_index = pch_info.ti_hi + leaf_ref.enc_leaf_idx; + } break; + case LNK_LeafLocType_External: { + CV_TypeIndex lo = lnk_ti_lo_from_leaf_ref(input, leaf_ref); + type_index = lo + leaf_ref.enc_leaf_idx & ~LNK_LeafRefFlag_LeafIdxIPI; + } break; + default: InvalidPath; + } + return type_index; +} + +internal CV_Leaf +lnk_cv_leaf_from_leaf_ref(LNK_CodeViewInput *input, LNK_LeafRef leaf_ref) +{ + String8 raw_leaf = lnk_data_from_leaf_ref(input, leaf_ref); + CV_Leaf leaf; + cv_deserial_leaf(raw_leaf, 0, 1, &leaf); + return leaf; +} + +internal U128 +lnk_hash_from_leaf_ref(LNK_LeafHashes *hashes, LNK_LeafRef leaf_ref) +{ + LNK_LeafLocType loc_type; + CV_TypeIndexSource ti_source; + if (leaf_ref.enc_loc_idx & LNK_LeafRefFlag_LocIdxExternal) { + loc_type = LNK_LeafLocType_External; + ti_source = (leaf_ref.enc_leaf_idx & LNK_LeafRefFlag_LeafIdxIPI) ? 
CV_TypeIndexSource_IPI : CV_TypeIndexSource_TPI; + } else { + loc_type = LNK_LeafLocType_Internal; + ti_source = CV_TypeIndexSource_TPI; + } + + U32 loc_idx = leaf_ref.enc_loc_idx & ~LNK_LeafRefFlag_LocIdxExternal; + U32 leaf_idx = leaf_ref.enc_leaf_idx & ~LNK_LeafRefFlag_LeafIdxIPI; + U128 hash = hashes->v[loc_type][loc_idx][ti_source].v[leaf_idx]; + + return hash; +} + +internal LNK_LeafRef +lnk_leaf_ref_from_loc_idx_and_ti(LNK_CodeViewInput *input, + LNK_LeafLocType loc_type, + CV_TypeIndexSource ti_source, + U64 loc_idx, + CV_TypeIndex obj_ti) +{ + LNK_LeafRef leaf_ref; + + switch (loc_type) { + case LNK_LeafLocType_External: { + U64 ts_idx = loc_idx; + + CV_TypeIndex ti_lo = input->external_ti_ranges[ts_idx][ti_source].min; + Assert(obj_ti >= ti_lo); + + // encode leaf index for type server + leaf_ref = lnk_ts_leaf_ref(ti_source, ts_idx, obj_ti - ti_lo); + } break; + + case LNK_LeafLocType_Internal: { + U64 obj_idx = loc_idx; + + LNK_PchInfo pch = input->pch_arr[obj_idx]; + if (obj_ti < pch.ti_lo) { + CV_TypeIndex ti_lo = CV_MinComplexTypeIndex; + Assert(obj_ti >= ti_lo); + leaf_ref = lnk_obj_leaf_ref(obj_idx, obj_ti - ti_lo); + } + // PCH indirection + else if (obj_ti < pch.ti_hi) { + // we don't support nested precompiled types + Assert(input->pch_arr[pch.debug_p_obj_idx].debug_p_obj_idx == /* null_obj: */ 0); + Assert(input->pch_arr[pch.debug_p_obj_idx].ti_lo == input->pch_arr[pch.debug_p_obj_idx].ti_hi); + leaf_ref = lnk_obj_leaf_ref(pch.debug_p_obj_idx, obj_ti - pch.ti_lo); + } else { + leaf_ref = lnk_obj_leaf_ref(obj_idx, pch.ti_lo + (obj_ti - pch.ti_hi) - CV_MinComplexTypeIndex); + } + } break; + + default: leaf_ref = lnk_leaf_ref(0, 0); break; + } + + return leaf_ref; +} + +internal B32 +lnk_match_leaf_ref(LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafRef a, LNK_LeafRef b) +{ + B32 are_same = 0; + + U128 a_hash = lnk_hash_from_leaf_ref(hashes, a); + U128 b_hash = lnk_hash_from_leaf_ref(hashes, b); + + if (u128_match(a_hash, b_hash)) { + 
// Structural comparison of two leaves. Returns 1 only when:
//   - the precomputed hashes of a and b match,
//   - the raw leaf headers agree on kind and size,
//   - all bytes outside the embedded type indices are identical, and
//   - every embedded pair of type indices matches: recursively through this
//     function when both indices are at or above their location's low type
//     index, by plain value comparison otherwise.
// `arena` is used for temporary allocations only; they are released before
// the function returns.
internal B32
lnk_match_leaf_ref_deep(Arena *arena, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafRef a, LNK_LeafRef b)
{
  B32 are_equal = 0;

  U128 a_hash = lnk_hash_from_leaf_ref(hashes, a);
  U128 b_hash = lnk_hash_from_leaf_ref(hashes, b);

  // cheap reject: leaves with different hashes are never compared byte-wise
  if (u128_match(a_hash, b_hash)) {
    String8 a_raw_leaf = lnk_data_from_leaf_ref(input, a);
    String8 b_raw_leaf = lnk_data_from_leaf_ref(input, b);

    // NOTE(review): headers are dereferenced without checking that the raw
    // leaf is non-empty -- presumably both refs always resolve to valid data.
    CV_LeafHeader *a_header = (CV_LeafHeader *) a_raw_leaf.str;
    CV_LeafHeader *b_header = (CV_LeafHeader *) b_raw_leaf.str;

    if (a_header->kind == b_header->kind && a_header->size == b_header->size) {
      CV_Leaf a_leaf = cv_leaf_from_string(a_raw_leaf);
      CV_Leaf b_leaf = cv_leaf_from_string(b_raw_leaf);

      Temp temp = temp_begin(arena);

      // type index offsets are computed from leaf a only and then applied to
      // both leaves (kind and size were checked to be equal above)
      CV_TypeIndexInfoList ti_info_list = cv_get_leaf_type_index_offsets(temp.arena, a_leaf.kind, a_leaf.data);
      String8Array a_raw_data_arr = cv_get_data_around_type_indices(temp.arena, ti_info_list, a_leaf.data);
      String8Array b_raw_data_arr = cv_get_data_around_type_indices(temp.arena, ti_info_list, b_leaf.data);

      are_equal = 1;

      // compare the bytes that surround the type indices
      for (U64 i = 0; i < a_raw_data_arr.count; ++i) {
        String8 a_chunk = a_raw_data_arr.v[i];
        String8 b_chunk = b_raw_data_arr.v[i];
        Assert(a_chunk.size == b_chunk.size);
        are_equal = str8_match(a_chunk, b_chunk, 0);
        if (!are_equal) {
          // skips the type index comparison but still reaches temp_end below
          goto skip_type_index_compare;
        }
      }

      CV_TypeIndex a_ti_lo = lnk_ti_lo_from_leaf_ref(input, a);
      CV_TypeIndex b_ti_lo = lnk_ti_lo_from_leaf_ref(input, b);
      AssertAlways(a_ti_lo == b_ti_lo);

      // compare the type indices themselves
      for (CV_TypeIndexInfo *ti_info = ti_info_list.first; ti_info != 0; ti_info = ti_info->next) {
        CV_TypeIndex *a_ti_ptr = (CV_TypeIndex *) (a_leaf.data.str + ti_info->offset);
        CV_TypeIndex *b_ti_ptr = (CV_TypeIndex *)(b_leaf.data.str + ti_info->offset);

        if (*a_ti_ptr >= a_ti_lo && *b_ti_ptr >= b_ti_lo) {
          // NOTE(review): the shift assumes LNK_LeafRefFlag_LocIdxExternal is
          // bit 31 and that the shifted value equals the LNK_LeafLocType
          // enumerator -- confirm; lnk_loc_type_from_leaf_ref decodes the same
          // flag without relying on the bit position.
          LNK_LeafLocType a_loc_type = (a.enc_loc_idx & LNK_LeafRefFlag_LocIdxExternal) >> 31;
          LNK_LeafLocType b_loc_type = (b.enc_loc_idx & LNK_LeafRefFlag_LocIdxExternal) >> 31;

          U64 a_loc_idx = a.enc_loc_idx & ~LNK_LeafRefFlag_LocIdxExternal;
          U64 b_loc_idx = b.enc_loc_idx & ~LNK_LeafRefFlag_LocIdxExternal;

          // sub leaves live in the same location as their parent leaf
          LNK_LeafRef a_sub_leaf_ref = lnk_leaf_ref_from_loc_idx_and_ti(input, a_loc_type, ti_info->source, a_loc_idx, *a_ti_ptr);
          LNK_LeafRef b_sub_leaf_ref = lnk_leaf_ref_from_loc_idx_and_ti(input, b_loc_type, ti_info->source, b_loc_idx, *b_ti_ptr);

          are_equal = lnk_match_leaf_ref_deep(arena, input, hashes, a_sub_leaf_ref, b_sub_leaf_ref);
          if (!are_equal) {
            break;
          }
        }
        // compare simple leaves
        else {
          are_equal = *a_ti_ptr == *b_ti_ptr;
          if (!are_equal) {
            break;
          }
        }
      }

skip_type_index_compare:;
      temp_end(temp);
    }
  }

  return are_equal;
}
&leaf.kind, sizeof leaf.kind); + + // hash bytes around indices + { + Temp temp = temp_begin(arena); + String8Array raw_data_arr = cv_get_data_around_type_indices(temp.arena, ti_info_list, leaf.data); + for (U64 i = 0; i < raw_data_arr.count; ++i) { + blake3_hasher_update(&hasher, raw_data_arr.v[i].str, raw_data_arr.v[i].size); + } + temp_end(temp); + } + + // mix-in sub leaf hashes + for (CV_TypeIndexInfo *ti_n = ti_info_list.first; ti_n != 0; ti_n = ti_n->next) { + CV_TypeIndex sub_ti = *(CV_TypeIndex *) (leaf.data.str + ti_n->offset); + + // is type index complex? + if (sub_ti >= ti_ranges[ti_n->source].min) { + // Mostly leaves are laid out as DAG and we can get to sub leaf hash through index lookup, + // however MASM doesn't follow DAG rule, for example: + // + // Engine\Source\Developer\Windows\LiveCoding\Private\External\LC_JumpToSelf.asm + // .debug$T (No. 4): + // LF_PROCEDURE (0x1000) [0008-0014] + // Return type: 3 + // Call Convention: Near C + // Function Attribs: NULL + // Argumnet Count: 0 + // Argument List Type: 1001 + // LF_ARGLIST (0x1001) [0018-001C] + // Types 0 + // LF_LABEL (0x1002) [0020-0024] + // $UNDEFINED: E + // + // Note: LF_ARGLIST(0x1001) > LF_PROCEDURE(0x1000) + // + // Luckily we don't have many leaves that break DAG rule and we can skip without + // much memory and perf penalty (In Ancient Game we skip 7 leaves) + if (sub_ti < curr_ti) { + LNK_LeafRef sub_leaf_ref = lnk_leaf_ref_from_loc_idx_and_ti(input, loc_type, ti_n->source, loc_idx, sub_ti); + + // query sub hash + U128 sub_hash = lnk_hash_from_leaf_ref(hashes, sub_leaf_ref); + + // make sure sub hash was computed (:zero_hash_array) + Assert(!u128_match(sub_hash, u128_zero())); + + // mix-in sub hash + blake3_hasher_update(&hasher, &sub_hash, sizeof sub_hash); + } else { + Temp scratch = scratch_begin(0,0); + String8 leaf_kind_str = cv_string_from_leaf_kind(leaf.kind); + String8 leaf_info = push_str8f(scratch.arena, "LF_%S(type_index: 0x%x) forward refs member type index 0x%x 
// Hashes every not-yet-hashed leaf reachable from the type indices embedded
// in `data`, using an explicit stack in place of recursion.
//
//   arena        - temporary storage for stack frames and offset lists;
//                  released before returning
//   ti_ranges    - per type index source range; an index at or above .min is
//                  treated as complex and asserted to be below .max
//   leaves       - per type index source leaf table the indices refer to
//   hashes       - results are written to hashes->v[loc_type][loc_idx]
//   ti_info_list - offsets of the type indices inside `data`
//   data         - bytes that contain the root type indices
//
// A hash slot that is all zeroes means "not hashed yet" (:zero_hash_array);
// leaves whose slot is already non-zero are skipped.
internal void
lnk_hash_cv_leaf_deep(Arena               *arena,
                      LNK_CodeViewInput   *input,
                      Rng1U64             *ti_ranges,
                      CV_DebugT           *leaves,
                      LNK_LeafHashes      *hashes,
                      LNK_LeafLocType      loc_type,
                      U32                  loc_idx,
                      CV_TypeIndexInfoList ti_info_list,
                      String8              data)
{
  Temp temp = temp_begin(arena);

  // one frame per leaf whose sub leaves are still being visited
  struct stack_s {
    struct stack_s      *next;
    CV_TypeIndexInfoList ti_info_list; // all type index offsets of this frame's data
    CV_TypeIndexInfo    *ti_info;      // iterator: next offset to visit
    CV_Leaf              leaf;         // leaf being hashed (zeroed for the root frame)
    String8              data;         // bytes the offsets point into
    CV_TypeIndex         ti;           // type index of `leaf`
    CV_TypeIndexSource   ti_source;    // source of `ti`
  };

  // set up root frame
  struct stack_s *root_frame = push_array_no_zero(temp.arena, struct stack_s, 1);
  root_frame->next           = 0;
  root_frame->ti_info_list   = ti_info_list;
  root_frame->ti_info        = ti_info_list.first;
  root_frame->data           = data;
  root_frame->ti             = 0;
  root_frame->ti_source      = CV_TypeIndexSource_NULL;
  MemoryZeroStruct(&root_frame->leaf);

  U128Array *curr_hashes = hashes->v[loc_type][loc_idx];

  struct stack_s *stack = root_frame;
  while (stack) {
    while (stack->ti_info) {
      CV_TypeIndexInfo *curr_ti_info = stack->ti_info;

      // advance iterator
      stack->ti_info = stack->ti_info->next;

      // get type index info
      CV_TypeIndex *ti_ptr = (CV_TypeIndex *) (stack->data.str + curr_ti_info->offset);

      // is index complex?
      if (*ti_ptr >= ti_ranges[curr_ti_info->source].min) {
        // TODO: handle malformed index
        AssertAlways(*ti_ptr < ti_ranges[curr_ti_info->source].max);
        U64 ti_idx = (*ti_ptr - ti_ranges[curr_ti_info->source].min);

        // was leaf hashed?
        if (MemoryIsZeroStruct(&curr_hashes[curr_ti_info->source].v[ti_idx])) { // :zero_hash_array
          CV_Leaf leaf = cv_debug_t_get_leaf(leaves[curr_ti_info->source], ti_idx);

          // find index offsets
          CV_TypeIndexInfoList sub_ti_info_list = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data);

          // do we have sub leaves?
          if (sub_ti_info_list.count) {
            // fill out new frame
            struct stack_s *frame = push_array_no_zero(temp.arena, struct stack_s, 1);
            frame->next           = 0;
            frame->ti_info_list   = sub_ti_info_list;
            frame->ti_info        = sub_ti_info_list.first;
            frame->leaf           = leaf;
            frame->data           = leaf.data;
            frame->ti             = *ti_ptr;
            frame->ti_source      = curr_ti_info->source;

            // recurse to sub leaf
            // (the parent's iterator was already advanced above, so it resumes
            // with its next type index once this frame is popped)
            SLLStackPush(stack, frame);
            break;
          } else {
            // leaf has no type indices of its own: hash it right away
            curr_hashes[curr_ti_info->source].v[ti_idx] = lnk_hash_cv_leaf(temp.arena,
                                                                             input,
                                                                             hashes,
                                                                             loc_type,
                                                                             loc_idx,
                                                                             ti_ranges,
                                                                             CV_TypeIndex_Max,
                                                                             leaf,
                                                                             sub_ti_info_list);
          }
        }
      }
    }

    // no more type indices, pop frame
    // (false right after a push: the new top frame still has offsets to visit)
    if (!stack->ti_info) {

      if (stack != root_frame) {
        // sub leaves are hashed we can now hash parent leaf
        Temp temp2 = temp_begin(temp.arena);
        U64 leaf_idx = stack->ti - ti_ranges[stack->ti_source].min;
        curr_hashes[stack->ti_source].v[leaf_idx] = lnk_hash_cv_leaf(temp2.arena,
                                                                      input,
                                                                      hashes,
                                                                      loc_type,
                                                                      loc_idx,
                                                                      ti_ranges,
                                                                      CV_TypeIndex_Max,
                                                                      stack->leaf,
                                                                      stack->ti_info_list);
        temp_end(temp2);
      }

      SLLStackPop(stack);
    }
  }

  temp_end(temp);
}
u128_mod64(new_hash, leaf_ht->cap); + U64 idx = best_idx; + + do { + retry:; + LNK_LeafBucket *curr_bucket = leaf_ht->bucket_arr[idx]; + + if (curr_bucket == 0) { + LNK_LeafBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&leaf_ht->bucket_arr[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was inserted + is_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } else if (lnk_match_leaf_ref(input, hashes, curr_bucket->leaf_ref, new_bucket->leaf_ref)) { + int leaf_cmp = lnk_leaf_ref_compare(curr_bucket->leaf_ref, new_bucket->leaf_ref); + + if (leaf_cmp <= 0) { + // are we inserting bucket that was already inserterd? + Assert(leaf_cmp < 0); + + result = new_bucket; + + is_inserted_or_updated = 1; + + // don't need to update, more recent leaf is in the bucket + break; + } + + LNK_LeafBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&leaf_ht->bucket_arr[idx], new_bucket, curr_bucket); + if (compare_bucket == curr_bucket) { + result = compare_bucket; + + is_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... 
+ goto retry; + } + + // advance + idx = (idx + 1) % leaf_ht->cap; + } while (idx != best_idx); + + Assert(is_inserted_or_updated); + + return result; +} + +internal LNK_LeafBucket * +lnk_leaf_hash_table_search(LNK_LeafHashTable *ht, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafRef leaf_ref) +{ + LNK_LeafBucket *match = 0; + + U128 hash = lnk_hash_from_leaf_ref(hashes, leaf_ref); + U64 best_bucket_idx = u128_mod64(hash, ht->cap); + U64 bucket_idx = best_bucket_idx; + + do { + LNK_LeafBucket *bucket = ht->bucket_arr[bucket_idx]; + + if (bucket == 0) { + break; + } + + if (lnk_match_leaf_ref(input, hashes, bucket->leaf_ref, leaf_ref)) { + match = bucket; + break; + } + + bucket_idx = (bucket_idx + 1) % ht->cap; + } while (bucket_idx != best_bucket_idx); + + return match; +} + +//////////////////////////////// + +internal +THREAD_POOL_TASK_FUNC(lnk_count_per_source_leaf_task) +{ + ProfBeginFunction(); + + LNK_CountPerSourceLeafTask *task = raw_task; + LNK_LeafRangeList leaf_range_list = task->leaf_ranges_per_task[task_id]; + + for (LNK_LeafRange *leaf_range = leaf_range_list.first; leaf_range != 0; leaf_range = leaf_range->next) { + CV_DebugT debug_t = *leaf_range->debug_t; + for (U64 leaf_idx = leaf_range->range.min; leaf_idx < leaf_range->range.max; ++leaf_idx) { + CV_LeafHeader *leaf_header = cv_debug_t_get_leaf_header(debug_t, leaf_idx); + CV_TypeIndexSource leaf_source = cv_type_index_source_from_leaf_kind(leaf_header->kind); + task->count_arr_arr[leaf_source][task_id] += 1; + } + } + + ProfEnd(); +} + +internal void +lnk_cv_debug_t_count_leaves_per_source(TP_Context *tp, U64 count, CV_DebugT *debug_t_arr, U64 per_source_count_arr[CV_TypeIndexSource_COUNT]) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + ProfBegin("Compute Per Task Ranges"); + U64 per_task_leaf_count = 10000; + LNK_LeafRangeList *leaf_ranges_per_task = push_array(scratch.arena, LNK_LeafRangeList, tp->worker_count); + for (U64 i = 0, task_weight = 0, task_id = 0; i 
< count; ++i) {
+    CV_DebugT *debug_t = &debug_t_arr[i];
+    for (U64 k = 0; k < debug_t->count; k += per_task_leaf_count) {
+      U64 cap = per_task_leaf_count - task_weight;
+
+      LNK_LeafRange *leaf_range = push_array(scratch.arena, LNK_LeafRange, 1);
+      leaf_range->range = rng_1u64(k, Min(k + cap, debug_t->count));
+      leaf_range->debug_t = debug_t;
+
+      LNK_LeafRangeList *list = &leaf_ranges_per_task[task_id];
+      SLLQueuePush(list->first, list->last, leaf_range);
+      ++list->count;
+
+      task_weight += dim_1u64(leaf_range->range);
+      if (task_weight >= per_task_leaf_count) {
+        task_id = (task_id + 1) % tp->worker_count;
+        task_weight = 0;
+      }
+    }
+  }
+  ProfEnd();
+
+
+  LNK_CountPerSourceLeafTask task;
+  task.leaf_ranges_per_task = leaf_ranges_per_task;
+  task.count_arr_arr = push_matrix_u64(scratch.arena, CV_TypeIndexSource_COUNT, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_count_per_source_leaf_task, &task);
+
+  for (U64 i = 0; i < CV_TypeIndexSource_COUNT; ++i) {
+    per_source_count_arr[i] += sum_array_u64(tp->worker_count, task.count_arr_arr[i]);
+  }
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Thread-pool task: hashes every leaf of one leaf array.
+//
+// task_id selects the obj; the leaf array is task->debug_t_arr[obj_idx] and
+// the resulting hashes are written to the obj's internal-location hash array.
+// Temporaries come from the worker's fixed arena and are released after each
+// leaf.
+internal
+THREAD_POOL_TASK_FUNC(lnk_hash_debug_t_task)
+{
+  ProfBeginFunction();
+
+  U64 obj_idx = task_id;
+  LNK_LeafHasherTask *task = raw_task;
+
+  Arena *fixed_arena = task->fixed_arenas[worker_id];
+  CV_DebugT debug_t = task->debug_t_arr[obj_idx];
+  U128Array out_hashes = task->hashes->v[LNK_LeafLocType_Internal][obj_idx][CV_TypeIndexSource_TPI];
+
+  // every type index source gets the same range: [pch ti_lo, pch ti_hi + leaf count)
+  Rng1U64 ti_ranges[CV_TypeIndexSource_COUNT];
+  for (U64 ti_source = 0; ti_source < ArrayCount(ti_ranges); ++ti_source) {
+    ti_ranges[ti_source] = rng_1u64(task->input->pch_arr[obj_idx].ti_lo, task->input->pch_arr[obj_idx].ti_hi + debug_t.count);
+  }
+
+  for (U64 leaf_idx = 0; leaf_idx < debug_t.count; ++leaf_idx) {
+    Temp temp = temp_begin(fixed_arena);
+
+    // :debug_zero_hash_assert make sure we don't write same hash more than once
+    //Assert(MemoryIsZeroStruct(&out_hashes.v[leaf_idx]));
+
+    CV_TypeIndex curr_ti = lnk_type_index_from_leaf_ref(task->input, lnk_leaf_ref(obj_idx, leaf_idx));
+    CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, leaf_idx);
+    CV_TypeIndexInfoList ti_info_list = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data);
+
+    out_hashes.v[leaf_idx] = lnk_hash_cv_leaf(temp.arena,
+                                              task->input,
+                                              task->hashes,
+                                              LNK_LeafLocType_Internal,
+                                              obj_idx,
+                                              ti_ranges,
+                                              curr_ti,
+                                              leaf,
+                                              ti_info_list);
+
+    temp_end(temp);
+  }
+
+  ProfEnd();
+}
+
+// Thread-pool task: hashes the type server leaves that one external obj
+// (task_id) references from its parsed symbols and from its inlinee-lines
+// sub-sections. Hashing is delegated to lnk_hash_cv_leaf_deep.
+internal
+THREAD_POOL_TASK_FUNC(lnk_hash_type_server_leaves_task)
+{
+  ProfBeginFunction();
+
+  LNK_LeafHasherTask *task = raw_task;
+  U64 obj_idx = task_id;
+
+  LNK_CodeViewInput *input = task->input;
+  LNK_LeafHashes *hashes = task->hashes;
+
+  CV_SymbolListArray parsed_symbols = input->external_parsed_symbols[obj_idx];
+  CV_DebugS debug_s = input->external_debug_s_arr[obj_idx];
+  U64 ts_idx = input->external_obj_to_ts_idx_arr[obj_idx];
+  CV_DebugT *leaves = input->external_leaves[ts_idx];
+  Rng1U64 *ti_ranges = input->external_ti_ranges[ts_idx];
+
+  // hash leaves referenced in symbols
+  for (U64 i = 0; i < parsed_symbols.count; ++i) {
+    CV_SymbolList symbol_list = parsed_symbols.v[i];
+    for (CV_SymbolNode *symnode = symbol_list.first; symnode != 0; symnode = symnode->next) {
+      Temp temp = temp_begin(task->fixed_arenas[worker_id]);
+      CV_TypeIndexInfoList ti_info_list = cv_get_symbol_type_index_offsets(temp.arena, symnode->data.kind, symnode->data.data);
+      lnk_hash_cv_leaf_deep(temp.arena, task->input, ti_ranges, leaves, hashes, LNK_LeafLocType_External, ts_idx, ti_info_list, symnode->data.data);
+      temp_end(temp);
+    }
+  }
+
+  // hash leaves referenced in inlinees
+  String8List inline_data_list = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_InlineeLines);
+  for (String8Node *inline_data_node = inline_data_list.first; inline_data_node != 0; inline_data_node = inline_data_node->next) {
+    Temp temp = temp_begin(task->fixed_arenas[worker_id]);
+    CV_TypeIndexInfoList ti_info_list = cv_get_inlinee_type_index_offsets(temp.arena, inline_data_node->string);
+    lnk_hash_cv_leaf_deep(temp.arena, task->input, ti_ranges, leaves, hashes, LNK_LeafLocType_External, ts_idx, ti_info_list, inline_data_node->string);
+    temp_end(temp);
+  }
+
+  ProfEnd();
+}
+
+// Thread-pool task: feeds every leaf of one obj (task_id) into the leaf hash
+// table that matches the leaf's type index source.
+//
+// A bucket is pushed on `arena` only when none is pending; the pending bucket
+// is kept for the next leaf when lnk_leaf_hash_table_insert_or_update hands
+// the same pointer back, and dropped otherwise.
+internal
+THREAD_POOL_TASK_FUNC(lnk_leaf_dedup_internal_task)
+{
+  LNK_LeafDedupInternal *task = raw_task;
+  U64 obj_idx = task_id;
+  CV_DebugT debug_t = task->debug_t_arr[obj_idx];
+
+  // NOTE(review): obj_idx is U64 but is formatted with %X - confirm what ProfBeginDynamic expects
+  ProfBeginDynamic("Leaf Dedup Task 0x%X [Leaf Count %u]", obj_idx, task->debug_t_arr[obj_idx].count);
+
+  LNK_LeafBucket *bucket = 0;
+  for (U64 leaf_idx = 0; leaf_idx < debug_t.count; ++leaf_idx) {
+    CV_LeafHeader *leaf_header = cv_debug_t_get_leaf_header(debug_t, leaf_idx);
+    CV_TypeIndexSource ti_source = cv_type_index_source_from_leaf_kind(leaf_header->kind);
+    LNK_LeafHashTable *leaf_ht = &task->leaf_ht_arr[ti_source];
+
+    LNK_LeafRef leaf_ref = lnk_obj_leaf_ref(obj_idx, leaf_idx);
+    U128 leaf_hash = lnk_hash_from_leaf_ref(task->hashes, leaf_ref);
+
+    if (bucket == 0) {
+      bucket = push_array_no_zero(arena, LNK_LeafBucket, 1);
+    }
+    bucket->leaf_ref = leaf_ref;
+
+    LNK_LeafBucket *inserted_or_updated = lnk_leaf_hash_table_insert_or_update(leaf_ht, task->input, task->hashes, leaf_hash, bucket);
+
+    // the table did not hand our bucket back, allocate a fresh one next time
+    if (inserted_or_updated != bucket) {
+      bucket = 0;
+    }
+  }
+
+  ProfEnd();
+}
+
+// Thread-pool task: feeds the leaves of one type server (task_id) that belong
+// to task->dedup_ti_source into the matching leaf hash table. Leaves whose
+// hash is all zeroes are skipped (:zero_hash_check). Bucket reuse follows the
+// same rule as lnk_leaf_dedup_internal_task.
+internal
+THREAD_POOL_TASK_FUNC(lnk_leaf_dedup_external_task)
+{
+  ProfBeginFunction();
+
+  LNK_LeafDedupExternal *task = raw_task;
+  U64 ts_idx = task_id;
+
+  LNK_CodeViewInput *input = task->input;
+  LNK_LeafHashTable *leaf_ht = &task->leaf_ht_arr[task->dedup_ti_source];
+  U128Array hashes = task->hashes->external_hashes[ts_idx][task->dedup_ti_source];
+  U64 leaf_count = dim_1u64(input->external_ti_ranges[ts_idx][task->dedup_ti_source]);
+
+  LNK_LeafBucket *bucket = 0;
+
+  for (U64 leaf_idx = 0; leaf_idx < leaf_count; ++leaf_idx) {
+    if (!MemoryIsZeroStruct(&hashes.v[leaf_idx])) { // :zero_hash_check
+      LNK_LeafRef leaf_ref = lnk_ts_leaf_ref(task->dedup_ti_source, ts_idx, leaf_idx);
+      U128 leaf_hash = lnk_hash_from_leaf_ref(task->hashes, leaf_ref);
+
+      if (bucket == 0) {
+        bucket = push_array_no_zero(arena, LNK_LeafBucket, 1);
+      }
+      bucket->leaf_ref = leaf_ref;
+
+      LNK_LeafBucket *inserted_or_updated = lnk_leaf_hash_table_insert_or_update(leaf_ht, task->input, task->hashes, leaf_hash, bucket);
+
+      if (inserted_or_updated != bucket) {
+        bucket = 0;
+      }
+    }
+  }
+
+  ProfEnd();
+}
+
+// Thread-pool task: counts non-null slots of the hash table inside
+// task->range_arr[task_id] and accumulates the total into
+// task->count_arr[task_id].
+internal
+THREAD_POOL_TASK_FUNC(lnk_count_present_buckets_task)
+{
+  ProfBeginFunction();
+  LNK_GetPresentBucketsTask *task = raw_task;
+  for (U64 bucket_idx = task->range_arr[task_id].min; bucket_idx < task->range_arr[task_id].max; ++bucket_idx) {
+    if (task->ht->bucket_arr[bucket_idx] != 0) {
+      task->count_arr[task_id] += 1;
+    }
+  }
+  ProfEnd();
+}
+
+// Thread-pool task: copies the non-null slots of the hash table inside
+// task->range_arr[task_id] to task->result.v, starting at
+// task->offset_arr[task_id].
+internal
+THREAD_POOL_TASK_FUNC(lnk_get_present_buckets_task)
+{
+  ProfBeginFunction();
+
+  LNK_GetPresentBucketsTask *task = raw_task;
+
+  Rng1U64 range = task->range_arr[task_id];
+  U64 cursor = task->offset_arr[task_id];
+  LNK_LeafHashTable *ht = task->ht;
+
+  for (U64 bucket_idx = range.min; bucket_idx < range.max; ++bucket_idx) {
+    if (ht->bucket_arr[bucket_idx]) {
+      task->result.v[cursor++] = ht->bucket_arr[bucket_idx];
+    }
+  }
+
+  ProfEnd();
+}
+
+// Gathers every occupied slot of `ht` into a dense array pushed on `arena`.
+//
+// Two parallel passes over the same per-worker ranges: the first counts
+// occupied slots, the second writes them at offsets derived from those counts.
+internal LNK_LeafBucketArray
+lnk_present_bucket_array_from_leaf_hash_table(TP_Context *tp, Arena *arena, LNK_LeafHashTable *ht)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena, 1);
+
+  LNK_GetPresentBucketsTask task = {0};
+  task.ht = ht;
+  task.count_arr = push_array(scratch.arena, U64, tp->worker_count);
+  task.range_arr = tp_divide_work(scratch.arena, ht->cap, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_count_present_buckets_task, &task);
+
+  LNK_LeafBucketArray result;
+  result.count = sum_array_u64(tp->worker_count, task.count_arr);
+  result.v = push_array_no_zero(arena, LNK_LeafBucket *, result.count);
+
+  task.result = result;
+  task.offset_arr = offsets_from_counts_array_u64(scratch.arena, task.count_arr, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_get_present_buckets_task, &task);
+
+  scratch_end(scratch);
+  ProfEnd();
+  return result;
+}
+
+// Thread-pool task: histogram step of the radix sort.
+//
+// Zeroes task->counts_arr[task_id] and counts, for the buckets in
+// task->ranges[task_id], how often each digit value occurs. The digit depends
+// on task->pass_idx: passes 0-2 read enc_leaf_idx, passes 3-5 read enc_loc_idx.
+// Digit extraction here must stay in sync with lnk_loc_idx_radix_sort_task.
+internal
+THREAD_POOL_TASK_FUNC(lnk_leaf_ref_histo_task)
+{
+  ProfBeginFunction();
+
+  LNK_LeafRadixSortTask *task = raw_task;
+  Rng1U64 range = task->ranges[task_id];
+  U32 *counts_ptr = task->counts_arr[task_id];
+
+  U32 loc_idx_bit_count_0 = task->loc_idx_bit_count_0;
+  U32 loc_idx_bit_count_1 = task->loc_idx_bit_count_1;
+  U32 loc_idx_bit_count_2 = task->loc_idx_bit_count_2;
+
+  MemoryZeroTyped(task->counts_arr[task_id], task->counts_max);
+
+  switch (task->pass_idx) {
+  case 0: {
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 leaf_digit0 = BitExtract(bucket->leaf_ref.enc_leaf_idx, 10, 0);
+      ++counts_ptr[leaf_digit0];
+    }
+  } break;
+  case 1: {
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 leaf_digit1 = BitExtract(bucket->leaf_ref.enc_leaf_idx, 11, 10);
+      ++counts_ptr[leaf_digit1];
+    }
+  } break;
+  case 2: {
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 leaf_digit2 = BitExtract(bucket->leaf_ref.enc_leaf_idx, 11, 21 - 1); // don't take into account IPI flag
+      ++counts_ptr[leaf_digit2];
+    }
+  } break;
+
+  case 3: {
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 digit0 = BitExtract(bucket->leaf_ref.enc_loc_idx, loc_idx_bit_count_0, 0);
+      ++counts_ptr[digit0];
+    }
+  } break;
+  case 4: {
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 digit1 = BitExtract(bucket->leaf_ref.enc_loc_idx, loc_idx_bit_count_1, loc_idx_bit_count_0);
+      ++counts_ptr[digit1];
+    }
+  } break;
+  case 5: {
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 digit2 = BitExtract(bucket->leaf_ref.enc_loc_idx, loc_idx_bit_count_2, loc_idx_bit_count_0 + loc_idx_bit_count_1);
+
+      // the external flag becomes the top bit of the last digit
+      U64 loc_bit = !!(bucket->leaf_ref.enc_loc_idx & LNK_LeafRefFlag_LocIdxExternal);
+      digit2 |= loc_bit << loc_idx_bit_count_2;
+
+      ++counts_ptr[digit2];
+    }
+  } break;
+  default: InvalidPath;
+  }
+
+  ProfEnd();
+}
+
+// Thread-pool task: scatter step of the radix sort.
+//
+// Expects task->counts_arr[task_id] to hold, per digit value, the destination
+// offset for this task (the caller turns counts into offsets). Each bucket in
+// task->ranges[task_id] is written to task->dst at its digit's offset, which
+// is then advanced.
+internal
+THREAD_POOL_TASK_FUNC(lnk_loc_idx_radix_sort_task)
+{
+  ProfBeginFunction();
+
+  LNK_LeafRadixSortTask *task = raw_task;
+  Rng1U64 range = task->ranges[task_id];
+  U32 *counts_ptr = task->counts_arr[task_id];
+  U32 loc_idx_bit_count_0 = task->loc_idx_bit_count_0;
+  U32 loc_idx_bit_count_1 = task->loc_idx_bit_count_1;
+  U32 loc_idx_bit_count_2 = task->loc_idx_bit_count_2;
+
+  switch (task->pass_idx) {
+  //
+  // Sort items on leaf index
+  //
+  case 0: {
+    ProfBegin("Leaf Sort Low");
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 leaf_digit0 = BitExtract(bucket->leaf_ref.enc_leaf_idx, 10, 0);
+      task->dst[counts_ptr[leaf_digit0]++] = bucket;
+    }
+    ProfEnd();
+  } break;
+  case 1: {
+    ProfBegin("Leaf Sort Mid");
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 leaf_digit1 = BitExtract(bucket->leaf_ref.enc_leaf_idx, 11, 10);
+      task->dst[counts_ptr[leaf_digit1]++] = bucket;
+    }
+    ProfEnd();
+  } break;
+  case 2: {
+    ProfBegin("Leaf Sort High");
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 leaf_digit2 = BitExtract(bucket->leaf_ref.enc_leaf_idx, 11, 21 - 1); // don't take into account IPI flag
+      task->dst[counts_ptr[leaf_digit2]++] = bucket;
+    }
+    ProfEnd();
+  } break;
+
+  //
+  // Sort items on obj and type server index
+  //
+  case 3: {
+    ProfBegin("Loc Sort Low");
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 digit0 = BitExtract(bucket->leaf_ref.enc_loc_idx, loc_idx_bit_count_0, 0);
+      task->dst[counts_ptr[digit0]++] = bucket;
+    }
+    ProfEnd();
+  } break;
+  case 4: {
+    ProfBegin("Loc Sort Mid");
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 digit1 = BitExtract(bucket->leaf_ref.enc_loc_idx, loc_idx_bit_count_1, loc_idx_bit_count_0);
+      task->dst[counts_ptr[digit1]++] = bucket;
+    }
+    ProfEnd();
+  } break;
+  case 5: {
+    ProfBegin("Loc Sort High");
+    for (U64 i = range.min; i < range.max; ++i) {
+      LNK_LeafBucket *bucket = task->src[i];
+      U64 digit2 = BitExtract(bucket->leaf_ref.enc_loc_idx, loc_idx_bit_count_2, loc_idx_bit_count_0 + loc_idx_bit_count_1);
+
+      // the external flag becomes the top bit of the last digit
+      U64 loc_bit = !!(bucket->leaf_ref.enc_loc_idx & LNK_LeafRefFlag_LocIdxExternal);
+      digit2 |= loc_bit << loc_idx_bit_count_2;
+
+      Assert(counts_ptr[digit2] != max_U32);
+      task->dst[counts_ptr[digit2]++] = bucket;
+    }
+    ProfEnd();
+  } break;
+
+  default: InvalidPath;
+  }
+
+  ProfEnd();
+}
+
+// Sorts `arr` in place.
+//
+// Arrays with more than 140000 entries go through a parallel six-pass radix
+// sort: three passes over enc_leaf_idx digits followed by three passes over
+// enc_loc_idx digits. Each pass builds per-worker histograms, converts them to
+// write offsets, and scatters from src to dst. Smaller arrays are handed to
+// radsort with lnk_leaf_ref_is_before.
+//
+// obj_count and type_server_count only size the enc_loc_idx digits.
+internal void
+lnk_leaf_bucket_array_sort(TP_Context *tp, LNK_LeafBucketArray arr, U64 obj_count, U64 type_server_count)
+{
+  Temp scratch = scratch_begin(0,0);
+
+#if PROFILE_TELEMETRY
+  String8 leaf_count_string = str8_from_count(scratch.arena, arr.count);
+  String8 obj_count_string = str8_from_count(scratch.arena, obj_count);
+  String8 type_server_count_string = str8_from_count(scratch.arena, type_server_count);
+  ProfBeginDynamic("Leaf Sort [Leaf Count: %.*s, Obj Count: %.*s, Type Server Count: %.*s]", str8_varg(leaf_count_string), str8_varg(obj_count_string), str8_varg(type_server_count_string));
+#endif
+
+  if (arr.count > 140000) {
+    ProfBegin("Radix");
+
+    U32 loc_idx_max_bits = 32 - clz32(Max(obj_count, type_server_count));
+
+    // split the location bits into three digits: the high digit takes up to
+    // 10 bits, the middle and low digits up to 11 bits each
+    LNK_LeafRadixSortTask task = {0};
+    task.loc_idx_bit_count_0 = Clamp(0, (S32)loc_idx_max_bits - 21, 11);
+    task.loc_idx_bit_count_1 = Clamp(0, (S32)loc_idx_max_bits - 10, 11);
+    task.loc_idx_bit_count_2 = Clamp(0, (S32)loc_idx_max_bits, 10);
+    task.counts_max = (1 << 11);
+    task.loc_idx_max = arr.count;
+    task.ranges = tp_divide_work(scratch.arena, arr.count, tp->worker_count);
+    task.dst = push_array_no_zero(scratch.arena, LNK_LeafBucket *, arr.count);
+    task.src = arr.v;
+
+    ProfBegin("Push Counts");
+    task.counts_arr = push_array_no_zero(scratch.arena, U32 *, tp->worker_count);
+    for (U64 i = 0; i < tp->worker_count; ++i) {
+      // zero-out happens in histogram step
+      task.counts_arr[i] = push_array_no_zero(scratch.arena, U32, task.counts_max);
+    }
+    ProfEnd();
+
+    for (task.pass_idx = 0; task.pass_idx < 6; ++task.pass_idx) {
+      ProfBeginDynamic("Pass: %u", task.pass_idx);
+
+      ProfBegin("Histo");
+      tp_for_parallel(tp, 0, tp->worker_count, lnk_leaf_ref_histo_task, &task);
+      ProfEnd();
+
+      // digit-major, worker-minor prefix sum: within one digit value the
+      // workers' output ranges follow each other in worker order
+      ProfBegin("Counts -> Offsets");
+      {
+        U64 digit_cursor = 0;
+        for (U64 digit_idx = 0; digit_idx < task.counts_max; ++digit_idx) {
+          for (U64 task_id = 0; task_id < tp->worker_count; ++task_id) {
+            U64 count = task.counts_arr[task_id][digit_idx];
+            task.counts_arr[task_id][digit_idx] = digit_cursor;
+            digit_cursor += count;
+          }
+        }
+        Assert(digit_cursor == arr.count);
+      }
+      ProfEnd();
+
+      ProfBegin("Sort");
+      tp_for_parallel(tp, 0, tp->worker_count, lnk_loc_idx_radix_sort_task, &task);
+      Swap(LNK_LeafBucket **, task.src, task.dst);
+      ProfEnd();
+
+      ProfEnd();
+    }
+
+    // Every pass ends with a swap, so the sorted output of the last pass is in
+    // task.src. With an even pass count that is arr.v already; otherwise the
+    // result lives in the scratch buffer and has to be copied back.
+    if (task.src != arr.v) {
+      MemoryCopyTyped(arr.v, task.src, arr.count);
+    }
+
+#if 0
+    for (U64 i = 1; i < arr.count; ++i) {
+      AssertAlways(arr.v[i-1]->leaf_ref.enc_loc_idx <= arr.v[i]->leaf_ref.enc_loc_idx);
+      if (arr.v[i-1]->leaf_ref.enc_loc_idx == arr.v[i]->leaf_ref.enc_loc_idx) {
+        AssertAlways(arr.v[i-1]->leaf_ref.enc_leaf_idx <= arr.v[i]->leaf_ref.enc_leaf_idx);
+      }
+    }
+#endif
+
+    ProfEnd();
+  } else {
+    ProfBegin("Radsort");
+    radsort(arr.v, arr.count, lnk_leaf_ref_is_before);
+    ProfEnd();
+  }
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_assign_type_indices_task)
+{
+  // Thread-pool task: gives every bucket in task->range_arr[task_id] the type
+  // index task->min_type_index + its position in the bucket array.
+  LNK_AssignTypeIndicesTask *task = raw_task;
+  Rng1U64 range = task->range_arr[task_id];
+  for (U64 i = range.min; i < range.max; ++i) {
+    LNK_LeafBucket *bucket = task->bucket_arr.v[i];
+    bucket->type_index = task->min_type_index + i;
+  }
+}
+
+// Assigns consecutive type indices, starting at min_type_index, to the buckets
+// of bucket_arr in array order. Work is split evenly across the pool workers.
+internal void
+lnk_assign_type_indices(TP_Context *tp, LNK_LeafBucketArray bucket_arr, CV_TypeIndex min_type_index)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0,0);
+
+  LNK_AssignTypeIndicesTask task;
+  task.range_arr = tp_divide_work(scratch.arena, bucket_arr.count, tp->worker_count);
+  task.bucket_arr = bucket_arr;
+  task.min_type_index = min_type_index;
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_assign_type_indices_task, &task);
+
+  ProfEnd();
+  scratch_end(scratch);
+}
+
+// Thread-pool task: rewrites the type indices stored inside the symbols of one
+// symbol input (task_id).
+//
+// Indices below the location's low bound for their source are left untouched.
+// Every other index is mapped to a leaf ref, looked up in the leaf hash table
+// of its source, and overwritten with the bucket's final type index.
+internal
+THREAD_POOL_TASK_FUNC(lnk_patch_symbols_task)
+{
+  LNK_PatchSymbolTypesTask *task = raw_task;
+  Arena *fixed_arena = task->arena_arr[worker_id];
+  LNK_CodeViewSymbolsInput symbol_input = task->input->symbol_inputs[task_id];
+
+  LNK_LeafLocType loc_type = lnk_loc_type_from_obj_idx(task->input, symbol_input.obj_idx);
+  U64 loc_idx = lnk_loc_idx_from_obj_idx(task->input, symbol_input.obj_idx);
+
+  CV_TypeIndex ti_lo_arr[CV_TypeIndexSource_COUNT];
+  ti_lo_arr[CV_TypeIndexSource_NULL] = lnk_ti_lo_from_loc(task->input, loc_type, loc_idx, CV_TypeIndexSource_NULL);
+  ti_lo_arr[CV_TypeIndexSource_TPI ] = lnk_ti_lo_from_loc(task->input, loc_type, loc_idx, CV_TypeIndexSource_TPI);
+  ti_lo_arr[CV_TypeIndexSource_IPI ] = lnk_ti_lo_from_loc(task->input, loc_type, loc_idx, CV_TypeIndexSource_IPI);
+
+  for (CV_SymbolNode *symnode = symbol_input.symbol_list->first; symnode != 0; symnode = symnode->next) {
+    Temp temp = temp_begin(fixed_arena);
+
+    // find type index offsets in symbol
+    CV_TypeIndexInfoList ti_list = cv_get_symbol_type_index_offsets(temp.arena, symnode->data.kind, symnode->data.data);
+
+    // overwrite type indices in symbol
+    for (CV_TypeIndexInfo *ti_info = ti_list.first; ti_info != 0; ti_info = ti_info->next) {
+      CV_TypeIndex *ti_ptr = (CV_TypeIndex *) (symnode->data.data.str + ti_info->offset);
+      if (*ti_ptr >= ti_lo_arr[ti_info->source]) {
+        LNK_LeafHashTable *leaf_ht = &task->leaf_ht_arr[ti_info->source];
+        LNK_LeafRef leaf_ref = lnk_leaf_ref_from_loc_idx_and_ti(task->input, loc_type, ti_info->source, loc_idx, *ti_ptr);
+        LNK_LeafBucket *leaf_bucket = lnk_leaf_hash_table_search(leaf_ht, task->input, task->hashes, leaf_ref);
+
+        // we overwrite section memory directly
+        *ti_ptr = leaf_bucket->type_index;
+      }
+    }
+
+    temp_end(temp);
+  }
+}
+
+// Runs lnk_patch_symbols_task over all input->total_symbol_input_count symbol
+// inputs. Each worker gets a fixed-size arena sized for the type index info
+// list of a symbol of at most max_U16 bytes.
+internal void
+lnk_patch_symbols(TP_Context *tp,
+                  LNK_CodeViewInput *input,
+                  LNK_LeafHashes *hashes,
+                  LNK_LeafHashTable *leaf_ht_arr)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0,0);
+
+  U64 max_ti_list_size = sizeof(CV_TypeIndexInfo) * (max_U16 / sizeof(CV_TypeIndex));
+
+  LNK_PatchSymbolTypesTask task = {0};
+  task.input = input;
+  task.hashes = hashes;
+  task.leaf_ht_arr = leaf_ht_arr;
+  task.arena_arr = alloc_fixed_size_arena_array(scratch.arena, tp->worker_count, max_ti_list_size, max_ti_list_size);
+  tp_for_parallel(tp, 0, input->total_symbol_input_count, lnk_patch_symbols_task, &task);
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Thread-pool task: rewrites the type indices stored in the inlinee-lines
+// sub-sections of one obj (task_id), using the same lookup as
+// lnk_patch_symbols_task.
+internal
+THREAD_POOL_TASK_FUNC(lnk_patch_inlines_task)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  LNK_PatchInlinesTask *task = raw_task;
+
+  U64 loc_idx = lnk_loc_idx_from_obj_idx(task->input, task_id);
+  LNK_LeafLocType loc_type = lnk_loc_type_from_obj_idx(task->input, task_id);
+  String8List inline_data_list = cv_sub_section_from_debug_s(task->debug_s_arr[task_id], CV_C13SubSectionKind_InlineeLines);
+
+  for (String8Node *inline_data_node = inline_data_list.first; inline_data_node != 0; inline_data_node = inline_data_node->next) {
+    Temp temp = temp_begin(scratch.arena);
+
+    // get indices offsets
+    CV_TypeIndexInfoList ti_info_list = cv_get_inlinee_type_index_offsets(temp.arena, inline_data_node->string);
+
+    for (CV_TypeIndexInfo *ti_info = ti_info_list.first; ti_info != 0; ti_info = ti_info->next) {
+      CV_TypeIndex *ti_ptr = (CV_TypeIndex *) (inline_data_node->string.str + ti_info->offset);
+      CV_TypeIndex ti_lo = lnk_ti_lo_from_loc(task->input, loc_type, loc_idx, ti_info->source);
+      if (*ti_ptr >= ti_lo) {
+        LNK_LeafRef leaf_ref = lnk_leaf_ref_from_loc_idx_and_ti(task->input, loc_type, ti_info->source, loc_idx, *ti_ptr);
+        LNK_LeafBucket *leaf_bucket = lnk_leaf_hash_table_search(&task->leaf_ht_arr[ti_info->source], task->input, task->hashes, leaf_ref);
+
+        // patch index
+        *ti_ptr = leaf_bucket->type_index;
+      }
+    }
+
+    temp_end(temp);
+  }
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Runs lnk_patch_inlines_task once per obj, for obj_count objs.
+internal void
+lnk_patch_inlines(TP_Context *tp,
+                  LNK_CodeViewInput *input,
+                  LNK_LeafHashes *hashes,
+                  LNK_LeafHashTable *leaf_ht_arr,
+                  U64 obj_count,
+                  CV_DebugS *debug_s_arr)
+{
+  ProfBeginFunction();
+
+  LNK_PatchInlinesTask task = {0};
+  task.input = input;
+  task.hashes = hashes;
+  task.leaf_ht_arr = leaf_ht_arr;
+  task.debug_s_arr = debug_s_arr;
+  tp_for_parallel(tp, 0, obj_count, lnk_patch_inlines_task, &task);
+
+  ProfEnd();
+}
+
+// Thread-pool task: rewrites the type indices that the leaves in
+// task->range_arr[task_id] hold on other leaves, replacing each with the final
+// type index of the referenced leaf's bucket.
+//
+// The fixed arena is picked by task_id rather than worker_id; the caller
+// launches exactly tp->worker_count tasks, so both index the same array.
+internal
+THREAD_POOL_TASK_FUNC(lnk_patch_leaves_task)
+{
+  ProfBeginFunction();
+
+  LNK_PatchLeavesTask *task = raw_task;
+  Rng1U64 range = task->range_arr[task_id];
+
+  for (U64 bucket_idx = range.min; bucket_idx < range.max; ++bucket_idx) {
+    Temp temp = temp_begin(task->fixed_arena_arr[task_id]);
+
+    LNK_LeafBucket *bucket = task->bucket_arr[bucket_idx];
+
+    U64 loc_idx = bucket->leaf_ref.enc_loc_idx & ~LNK_LeafRefFlag_LocIdxExternal;
+    LNK_LeafLocType loc_type = lnk_loc_type_from_leaf_ref(bucket->leaf_ref);
+    CV_TypeIndex ti_lo = lnk_ti_lo_from_leaf_ref(task->input, bucket->leaf_ref);
+    String8 raw_leaf = lnk_data_from_leaf_ref(task->input, bucket->leaf_ref);
+    CV_Leaf leaf = cv_leaf_from_string(raw_leaf);
+
+    // get type indices offsets
+    CV_TypeIndexInfoList ti_info_list = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data);
+    for (CV_TypeIndexInfo *ti_info = ti_info_list.first; ti_info != 0; ti_info = ti_info->next) {
+      CV_TypeIndex *ti_ptr = (CV_TypeIndex *) (leaf.data.str + ti_info->offset);
+      if (*ti_ptr >= ti_lo) {
+        LNK_LeafHashTable *leaf_ht = &task->leaf_ht_arr[ti_info->source];
+        LNK_LeafRef sub_leaf_ref = lnk_leaf_ref_from_loc_idx_and_ti(task->input, loc_type, ti_info->source, loc_idx, *ti_ptr);
+        LNK_LeafBucket *sub_leaf_bucket = lnk_leaf_hash_table_search(leaf_ht, task->input, task->hashes, sub_leaf_ref);
+
+        // patch index
+        *ti_ptr = sub_leaf_bucket->type_index;
+      }
+    }
+
+    temp_end(temp);
+  }
+
+  ProfEnd();
+}
+
+// Runs lnk_patch_leaves_task over bucket_arr, split into tp->worker_count
+// ranges, with a 1 MB fixed arena per task.
+internal void
+lnk_patch_leaves(TP_Context *tp, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafHashTable *leaf_ht_arr, LNK_LeafBucketArray bucket_arr)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0,0);
+
+  LNK_PatchLeavesTask task;
+  task.input = input;
+  task.hashes = hashes;
+  task.leaf_ht_arr = leaf_ht_arr;
+  task.bucket_arr = bucket_arr.v;
+  task.range_arr = tp_divide_work(scratch.arena, bucket_arr.count, tp->worker_count);
+  task.fixed_arena_arr = alloc_fixed_size_arena_array(scratch.arena, tp->worker_count, MB(1), MB(1));
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_patch_leaves_task, &task);
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Thread-pool task: stores, for every bucket in task->range_arr[task_id], the
+// pointer to its raw leaf data in task->raw_leaf_arr at the same index.
+internal
+THREAD_POOL_TASK_FUNC(lnk_unbucket_raw_leaves_task)
+{
+  LNK_UnbucketRawLeavesTask *task = raw_task;
+  Rng1U64 range = task->range_arr[task_id];
+  for (U64 i = range.min; i < range.max; ++i) {
+    String8 raw_leaf = lnk_data_from_leaf_ref(task->input, task->bucket_arr[i]->leaf_ref);
+    task->raw_leaf_arr[i] = raw_leaf.str;
+  }
+}
+
+// Builds a CV_DebugT whose entries point at the raw leaf data of the buckets
+// in bucket_arr, in array order. The pointer array is pushed on `arena`.
+internal CV_DebugT
+lnk_unbucket_leaf_array(TP_Context *tp, Arena *arena, LNK_CodeViewInput *input, LNK_LeafBucketArray bucket_arr)
+{
+  ProfBeginDynamic("Unbucket Leaves [Count %llu]", bucket_arr.count);
+  Temp scratch = scratch_begin(&arena, 1);
+
+  LNK_UnbucketRawLeavesTask task = {0};
+  task.input = input;
+  task.bucket_arr = bucket_arr.v;
+  task.raw_leaf_arr = push_array_no_zero(arena, U8 *, bucket_arr.count);
+  task.range_arr = tp_divide_work(scratch.arena, bucket_arr.count, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_unbucket_raw_leaves_task, &task);
+
+  CV_DebugT debug_t = {0};
+  debug_t.count = bucket_arr.count;
+  debug_t.v = task.raw_leaf_arr;
+
+  scratch_end(scratch);
+  ProfEnd();
+  return debug_t;
+}
+
+// Thread-pool task: converts the *_ID procedure symbols of one symbol input
+// (task_id) to their final kinds.
+//
+// For the 32-bit *_ID procedure kinds, itype is first looked up in
+// task->ipi_types and replaced with the itype stored in the referenced
+// FUNC_ID / MFUNC_ID leaf. Then every *_ID kind is replaced with its non-ID
+// counterpart.
+internal
+THREAD_POOL_TASK_FUNC(lnk_post_process_cv_symbols_task)
+{
+  LNK_PostProcessCvSymbolsTask *task = raw_task;
+  LNK_CodeViewSymbolsInput symbol_input = task->symbol_inputs[task_id];
+
+  for (CV_SymbolNode *symnode = symbol_input.symbol_list->first; symnode != 0; symnode = symnode->next) {
+    CV_Symbol *symbol = &symnode->data;
+
+    // Test the DPC *_ID* kind here: the non-ID kind is what the switch below
+    // produces, and only the ID variants are resolved through ipi_types.
+    if (symbol->kind == CV_SymKind_LPROC32_ID || symbol->kind == CV_SymKind_GPROC32_ID || symbol->kind == CV_SymKind_LPROC32_DPC_ID) {
+      CV_SymProc32 *proc32 = (CV_SymProc32 *) symbol->data.str;
+      if (proc32->itype >= task->ipi_min_type_index) {
+        if ((proc32->itype - task->ipi_min_type_index) < task->ipi_types.count) {
+          U64 leaf_idx = proc32->itype - task->ipi_min_type_index;
+          CV_Leaf leaf = cv_debug_t_get_leaf(task->ipi_types, leaf_idx);
+
+          if (leaf.kind == CV_LeafKind_FUNC_ID) {
+            if (leaf.data.size >= sizeof(CV_LeafFuncId)) {
+              proc32->itype = ((CV_LeafFuncId *) leaf.data.str)->itype;
+            } else {
+              Assert(!"TODO: error handle corrupt leaf");
+            }
+          } else if (leaf.kind == CV_LeafKind_MFUNC_ID) {
+            if (leaf.data.size >= sizeof(CV_LeafMFuncId)) {
+              proc32->itype = ((CV_LeafMFuncId *) leaf.data.str)->itype;
+            } else {
+              Assert(!"TODO: error handle corrupt leaf");
+            }
+          } else {
+            Assert(!"TODO: erorr handle unexpected leaf type");
+          }
+        } else {
+          // negated so the assert fires; a bare string literal is always true
+          Assert(!"TODO: error handle corrupted type index");
+        }
+      } else {
+        // TODO: in some cases destructors don't have a type, need a repro
+      }
+    }
+
+    // convert symbol to final type
+    switch (symbol->kind) {
+    case CV_SymKind_LPROC32_ID:     symbol->kind = CV_SymKind_LPROC32;     break;
+    case CV_SymKind_GPROC32_ID:     symbol->kind = CV_SymKind_GPROC32;     break;
+    case CV_SymKind_LPROC32_DPC_ID: symbol->kind = CV_SymKind_LPROC32_DPC; break;
+    case CV_SymKind_LPROCMIPS_ID:   symbol->kind = CV_SymKind_LPROCMIPS;   break;
+    case CV_SymKind_GPROCMIPS_ID:   symbol->kind = CV_SymKind_GPROCMIPS;   break;
+    case CV_SymKind_LPROCIA64_ID:   symbol->kind = CV_SymKind_LPROCIA64;   break;
+    case CV_SymKind_GPROCIA64_ID:   symbol->kind = CV_SymKind_GPROCIA64;   break;
+    case CV_SymKind_PROC_ID_END:    symbol->kind = CV_SymKind_END;         break;
+    }
+  }
+}
+
+// Merges the type leaves of all objs and type servers into one TPI and one
+// IPI leaf array and rewrites every type index that refers to them.
+//
+// Steps, in order:
+//   1. hash internal leaves (.debug$P, then .debug$T) and the type server
+//      leaves that external objs reference
+//   2. size one leaf hash table per type index source (leaf count * 1.3)
+//   3. dedup internal leaves, then external TPI and IPI leaves
+//   4. gather the surviving buckets, sort them, assign type indices starting
+//      at CV_MinComplexTypeIndex
+//   5. patch indices in symbols, inline sites, and leaves
+//   6. unbucket into CV_DebugT arrays and convert *_ID symbols
+//
+// Returns an array of CV_TypeIndexSource_COUNT CV_DebugT pushed on
+// tp_temp->v[0]; only the TPI and IPI slots are assigned.
+internal CV_DebugT *
+lnk_import_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input)
+{
+  ProfBegin("Import Types");
+
+  ProfBegin("Hash Leaves");
+  LNK_LeafHashes *hashes = push_array(tp_temp->v[0], LNK_LeafHashes, 1);
+  {
+    Temp scratch = scratch_begin(tp_temp->v, tp_temp->count);
+
+    // push internal hash arrays
+    //
+    // TPI and IPI leaves in .debug$T are stored in one array (we don't move them
+    // to respective arrays before this point to save on memory move)
+    ProfBegin("Push Internal Hash Arrays");
+    hashes->internal_hashes = push_array_no_zero(tp_temp->v[0], U128Array *, input->internal_count);
+    for (U64 obj_idx = 0; obj_idx < input->internal_count; ++obj_idx) {
+      CV_DebugT debug_t = input->merged_debug_t_p_arr[obj_idx];
+
+      U128Array arr = {0};
+      arr.count = debug_t.count;
+      arr.v = push_array_no_zero(tp_temp->v[0], U128, debug_t.count);
+      // :debug_zero_hash_assert
+#if BUILD_DEBUG
+      MemoryZeroTyped(arr.v, arr.count);
+#endif
+
+      // every source slot of an internal obj aliases the same hash array
+      hashes->internal_hashes[obj_idx] = push_array(tp_temp->v[0], U128Array, CV_TypeIndexSource_COUNT);
+      for (U64 ti_source = 0; ti_source < CV_TypeIndexSource_COUNT; ++ti_source) {
+        hashes->internal_hashes[obj_idx][ti_source] = arr;
+      }
+    }
+    ProfEnd();
+
+    // push external hash arrays
+    ProfBegin("Push External Hash Arrays");
+    hashes->external_hashes = push_array_no_zero(tp_temp->v[0], U128Array *, input->type_server_count);
+    for (U64 ts_idx = 0; ts_idx < input->type_server_count; ++ts_idx) {
+      hashes->external_hashes[ts_idx] = push_array_no_zero(tp_temp->v[0], U128Array, CV_TypeIndexSource_COUNT);
+      for (U64 ti_source = 0; ti_source < CV_TypeIndexSource_COUNT; ++ti_source) {
+        U64 leaf_count = dim_1u64(input->external_ti_ranges[ts_idx][ti_source]);
+        hashes->external_hashes[ts_idx][ti_source].count = leaf_count;
+        hashes->external_hashes[ts_idx][ti_source].v = push_array(tp_temp->v[0], U128, leaf_count); // :zero_hash_check
+      }
+    }
+    ProfEnd();
+
+    LNK_LeafHasherTask task = {0};
+    task.input = input;
+    task.hashes = hashes;
+    task.fixed_arenas = alloc_fixed_size_arena_array(scratch.arena, tp->worker_count, MB(1), MB(1));
+
+    // hash .debug$P first so we can mix in hashes for precompiled sub leaves when hashing leaves in .debug$T
+    ProfBeginDynamic("Hash .debug$P [Count: %llu]", input->internal_count);
+    task.debug_t_arr = input->internal_debug_p_arr;
+    tp_for_parallel(tp, 0, input->internal_count, lnk_hash_debug_t_task, &task);
+    ProfEnd();
+
+#if PROFILE_TELEMETRY
+    String8 count_string = str8_from_count(scratch.arena, input->internal_count);
+    ProfBegin("Hash .debug$T [Count: %.*s]", str8_varg(count_string));
+#endif
+    task.debug_t_arr = input->internal_debug_t_arr;
+    tp_for_parallel(tp, 0, input->internal_count, lnk_hash_debug_t_task, &task);
+    ProfEnd();
+
+    // NOTE(review): count_string is declared under PROFILE_TELEMETRY only and
+    // holds internal_count, while this loop runs over external_count - confirm
+    ProfBegin("Hash Type Server Leaves [Count: %.*s]", str8_varg(count_string));
+    tp_for_parallel(tp, 0, input->external_count, lnk_hash_type_server_leaves_task, &task);
+    ProfEnd();
+
+    scratch_end(scratch);
+  }
+  ProfEnd();
+
+  ProfBegin("Leaf Hash Table Init");
+  LNK_LeafHashTable leaf_ht_arr[CV_TypeIndexSource_COUNT] = { 0 };
+  U64 internal_per_source_count[CV_TypeIndexSource_COUNT] = { 0 };
+  U64 external_per_source_count[CV_TypeIndexSource_COUNT] = { 0 };
+  {
+    // count internal leaves
+    lnk_cv_debug_t_count_leaves_per_source(tp, input->internal_count, input->internal_debug_p_arr, internal_per_source_count);
+    lnk_cv_debug_t_count_leaves_per_source(tp, input->internal_count, input->internal_debug_t_arr, internal_per_source_count);
+
+    // count external leaves
+    for (U64 ts_idx = 0; ts_idx < input->type_server_count; ++ts_idx) {
+      for (U64 ti_source = 0; ti_source < CV_TypeIndexSource_COUNT; ++ti_source) {
+        external_per_source_count[ti_source] += dim_1u64(input->external_ti_ranges[ts_idx][ti_source]);
+      }
+    }
+
+    // push buckets per source
+    for (U64 ti_source = 0; ti_source < CV_TypeIndexSource_COUNT; ++ti_source) {
+      U64 bucket_cap = 0;
+      bucket_cap += internal_per_source_count[ti_source];
+      bucket_cap += external_per_source_count[ti_source];
+      bucket_cap = (U64) ((F64) bucket_cap * 1.3);
+
+      #if PROFILE_TELEMETRY
+      tmMessage(0, TMMF_ICON_NOTE, "%.*s Bucket Count: %llu", str8_varg(cv_string_from_type_index_source(ti_source)), bucket_cap);
+      #endif
+
+      leaf_ht_arr[ti_source].cap = bucket_cap;
+      leaf_ht_arr[ti_source].bucket_arr = push_array(tp_temp->v[0], LNK_LeafBucket *, bucket_cap);
+    }
+  }
+  ProfEnd();
+
+#if PROFILE_TELEMETRY
+  String8 obj_count_string = str8_from_count(tp_temp->v[0], input->internal_count);
+  String8 tpi_count_string = str8_from_count(tp_temp->v[0], internal_per_source_count[CV_TypeIndexSource_TPI]);
+  String8 ipi_count_string = str8_from_count(tp_temp->v[0], internal_per_source_count[CV_TypeIndexSource_IPI]);
+  ProfBeginDynamic("Internal Leaf Dedup [Obj Count: %.*s, TPI: %.*s, IPI: %.*s]",
+                   str8_varg(obj_count_string),
+                   str8_varg(tpi_count_string),
+                   str8_varg(ipi_count_string));
+#endif
+  {
+
+    LNK_LeafDedupInternal task;
+    task.input = input;
+    task.hashes = hashes;
+    task.leaf_ht_arr = leaf_ht_arr;
+
+    ProfBegin("Dedup .debug$P");
+    task.debug_t_arr = input->internal_debug_p_arr;
+    tp_for_parallel(tp, tp_temp, input->internal_count, lnk_leaf_dedup_internal_task, &task);
+    ProfEnd();
+
+    ProfBegin("Dedup .debug$T");
+    task.debug_t_arr = input->internal_debug_t_arr;
+    tp_for_parallel(tp, tp_temp, input->internal_count, lnk_leaf_dedup_internal_task, &task);
+    ProfEnd();
+  }
+  ProfEnd();
+
+  ProfBeginDynamic("External Leaf Import [Type Server Count: %llu, Dependent Obj Count: %llu]", input->type_server_count, input->external_count);
+  {
+    LNK_LeafDedupExternal task = {0};
+    task.input = input;
+    task.hashes = hashes;
+    task.leaf_ht_arr = leaf_ht_arr;
+
+    ProfBeginDynamic("Dedup TPI [Leaf Count %llu]", external_per_source_count[CV_TypeIndexSource_TPI]);
+    task.dedup_ti_source = CV_TypeIndexSource_TPI;
+    tp_for_parallel(tp, tp_temp, input->type_server_count, lnk_leaf_dedup_external_task, &task);
+    ProfEnd();
+
+    ProfBeginDynamic("Dedup IPI [Leaf Count %llu]", external_per_source_count[CV_TypeIndexSource_IPI]);
+    task.dedup_ti_source = CV_TypeIndexSource_IPI;
+    tp_for_parallel(tp, tp_temp, input->type_server_count, lnk_leaf_dedup_external_task, &task);
+    ProfEnd();
+  }
+  ProfEnd();
+
+  // extract present buckets from the hash tables
+  LNK_LeafBucketArray tpi_arr = lnk_present_bucket_array_from_leaf_hash_table(tp, tp_temp->v[0], &leaf_ht_arr[CV_TypeIndexSource_TPI]);
+  LNK_LeafBucketArray ipi_arr = lnk_present_bucket_array_from_leaf_hash_table(tp, tp_temp->v[0], &leaf_ht_arr[CV_TypeIndexSource_IPI]);
+
+  // sort output leaves based on { location index, leaf index } to guarantee determinism
+  lnk_leaf_bucket_array_sort(tp, ipi_arr, input->internal_count, input->type_server_count);
+  lnk_leaf_bucket_array_sort(tp, tpi_arr, input->internal_count, input->type_server_count);
+
+  // assign type indices to each bucket
+  lnk_assign_type_indices(tp, tpi_arr, CV_MinComplexTypeIndex);
+  lnk_assign_type_indices(tp, ipi_arr, CV_MinComplexTypeIndex);
+
+  // patch indices in symbols, inline sites, and leaves
+  lnk_patch_symbols(tp, input, hashes, leaf_ht_arr);
+  lnk_patch_inlines(tp, input, hashes, leaf_ht_arr, input->count, input->debug_s_arr);
+  lnk_patch_leaves(tp, input, hashes, leaf_ht_arr, tpi_arr);
+  lnk_patch_leaves(tp, input, hashes, leaf_ht_arr, ipi_arr);
+
+  CV_DebugT tpi_types = lnk_unbucket_leaf_array(tp, tp_temp->v[0], input, tpi_arr);
+  CV_DebugT ipi_types = lnk_unbucket_leaf_array(tp, tp_temp->v[0], input, ipi_arr);
+
+  ProfBegin("Post Process CV Symbols");
+  {
+    LNK_PostProcessCvSymbolsTask task = {0};
+    task.ipi_min_type_index = CV_MinComplexTypeIndex;
+    task.ipi_types = ipi_types;
+    task.symbol_inputs = input->symbol_inputs;
+    task.parsed_symbols = input->parsed_symbols;
+    tp_for_parallel(tp, 0, input->total_symbol_input_count, lnk_post_process_cv_symbols_task, &task);
+  }
+  ProfEnd();
+
+  CV_DebugT *types = push_array(tp_temp->v[0], CV_DebugT, CV_TypeIndexSource_COUNT);
+  types[CV_TypeIndexSource_TPI] = tpi_types;
+  types[CV_TypeIndexSource_IPI] = ipi_types;
+
+  ProfEnd();
+  return types;
+}
+
+////////////////////////////////
+// PDB Builder
+
+// Thread-pool task: moves the GSI symbols of one obj (task_id) out of its
+// parsed symbol lists and into task->gsi_list_arr[obj_idx].
+//
+// Global symbols are always moved; typedefs are moved only at scope depth 0.
+// Symbols of kind 0x1176 are dropped. Globals end up ahead of typedefs in the
+// GSI list.
+internal
+THREAD_POOL_TASK_FUNC(lnk_filter_out_gsi_symbols_task)
+{
+  U64 obj_idx = task_id;
+  LNK_ProcessSymDataTaskData *task = raw_task;
+  CV_SymbolList *gsi_list = &task->gsi_list_arr[obj_idx];
+  CV_SymbolListArray parsed_symbols = task->parsed_symbols[obj_idx];
+
+  CV_SymbolList global_list = {0};
+  CV_SymbolList typedef_list = {0};
+  for (U64 i = 0; i < parsed_symbols.count; ++i) {
+    CV_SymbolList *list = &parsed_symbols.v[i];
+    U64 depth = 0;
+    // next is fetched up front because curr may be unlinked below
+    for (CV_SymbolNode *curr = list->first, *next; curr != 0; curr = next) {
+      next = curr->next;
+
+      if (cv_is_global_symbol(curr->data.kind)) {
+        cv_symbol_list_remove_node(list, curr);
+        cv_symbol_list_push_node(&global_list, curr);
+      } else if (cv_is_typedef(curr->data.kind)) {
+        if (depth == 0) {
+          cv_symbol_list_remove_node(list, curr);
+          cv_symbol_list_push_node(&typedef_list, curr);
+        }
+      }
+      // Undocumented symbol that appears only in objs.
+      // MSVC removes these symbols from output.
+      //
+      // LLD-link replaces symbol with S_SKIP:
+      // https://github.com/llvm/llvm-project/blob/main/lld/COFF/PDB.cpp#L575
+      else if (curr->data.kind == 0x1176) {
+        cv_symbol_list_remove_node(list, curr);
+      }
+
+      if (cv_is_scope_symbol(curr->data.kind)) {
+        ++depth;
+      } else if (cv_is_end_symbol(curr->data.kind)) {
+        Assert(depth > 0);
+        --depth;
+      }
+    }
+  }
+
+  // collect GSI symbols
+  Assert(gsi_list->count == 0);
+  cv_symbol_list_concat_in_place(gsi_list, &global_list);
+  cv_symbol_list_concat_in_place(gsi_list, &typedef_list);
+}
+
+// Thread-pool task: builds procedure reference symbols for every parsed symbol
+// list of one obj (task_id) and appends them to the obj's GSI list.
+internal
+THREAD_POOL_TASK_FUNC(lnk_make_proc_refs_task)
+{
+  ProfBeginFunction();
+
+  U64 obj_idx = task_id;
+  LNK_ProcessSymDataTaskData *task = raw_task;
+  PDB_DbiModule *mod = task->mod_arr[obj_idx];
+  CV_SymbolList *gsi_list = &task->gsi_list_arr[obj_idx];
+  CV_SymbolListArray parsed_symbols = task->parsed_symbols[obj_idx];
+
+  for (U64 i = 0; i < parsed_symbols.count; ++i) {
+    CV_SymbolList list = parsed_symbols.v[i];
+    CV_SymbolList proc_refs = cv_make_proc_refs(arena, mod->imod, list);
+    cv_symbol_list_concat_in_place(gsi_list, &proc_refs);
+  }
+
+  ProfEnd();
+}
+
+// Thread-pool task: serializes the parsed symbols of one obj (task_id) into a
+// single buffer that starts with the C13 signature, and pushes that buffer to
+// task->symbol_data_arr[obj_idx].
+//
+// The first loop sizes the buffer while patching symbol tree offsets; the
+// second writes each symbol and records its offset in the buffer.
+internal
+THREAD_POOL_TASK_FUNC(lnk_process_sym_data_task)
+{
+  ProfBeginFunction();
+
+  U64 obj_idx = task_id;
+  LNK_ProcessSymDataTaskData *task = raw_task;
+  CV_SymbolListArray parsed_symbols = task->parsed_symbols[obj_idx];
+
+  static CV_Signature MODULE_SYMBOL_SIGNATURE = CV_Signature_C13;
+
+  ProfBegin("Compute Buffer Size");
+  U64 buffer_size = sizeof(MODULE_SYMBOL_SIGNATURE);
+  for (U64 i = 0; i < parsed_symbols.count; ++i) {
+    CV_SymbolList list = parsed_symbols.v[i];
+    U64 data_size = cv_patch_symbol_tree_offsets(list, buffer_size, PDB_SYMBOL_ALIGN);
+    buffer_size += data_size;
+  }
+  ProfEnd();
+
+  // alloc buffer
+  U8 *buffer = push_array_no_zero(arena, U8, buffer_size);
+  U64 buffer_cursor = 0;
+
+  // MS Symbol and Type Information p.4:
+  //  "The first four bytes of the $$SYMBOLS segment is used as a signature to specify the version of
+  //  the Symbol and Type OMF contained in the $$SYMBOLS segment."
+  CV_Signature *sig_ptr = (CV_Signature *) (buffer + buffer_cursor);
+  *sig_ptr = MODULE_SYMBOL_SIGNATURE;
+  buffer_cursor += sizeof(*sig_ptr);
+
+  ProfBegin("Serialize Symbols");
+  for (U64 i = 0; i < parsed_symbols.count; ++i) {
+    CV_SymbolList list = parsed_symbols.v[i];
+    for (CV_SymbolNode *symbol_n = list.first; symbol_n != 0; symbol_n = symbol_n->next) {
+      symbol_n->data.offset = buffer_cursor;
+      buffer_cursor += cv_serialize_symbol_to_buffer(buffer, buffer_cursor, buffer_size, &symbol_n->data, PDB_SYMBOL_ALIGN);
+    }
+  }
+  ProfEnd();
+
+  // output
+  Assert(task->symbol_data_arr[obj_idx].total_size == 0);
+  str8_list_push(arena, &task->symbol_data_arr[obj_idx], str8(buffer, buffer_size));
+
+  ProfEnd();
+}
+
+// Placeholder for C11 debug data: returns one zeroed String8List per obj and
+// ignores every other argument.
+internal LNK_ProcessedCodeViewC11Data
+lnk_process_c11_data(TP_Context *tp, TP_Arena *arena, U64 obj_count, CV_DebugS *debug_s_arr, U64 string_data_base_offset, CV_StringHashTable string_ht, MSF_Context *msf, PDB_DbiModule **mod_arr)
+{
+  // TODO: handle c11 data
+  String8List *data_list_arr = push_array(arena->v[0], String8List, obj_count);
+  // zero-init so no field of the returned struct is left indeterminate
+  LNK_ProcessedCodeViewC11Data result = {0};
+  result.data_list_arr = data_list_arr;
+  return result;
+}
+
+// Thread-pool task: merges the C13 debug data of one obj (task_id) into the
+// C13 data already stored in its DBI module.
+//
+// String offsets in the obj's checksum entries are patched first. When the
+// module already holds checksum data, the obj's line and frame data are
+// patched with that data's size as the checksum base. The merged sub-sections
+// are serialized without a signature and stored, together with the obj's
+// source file names, in the task's output arrays.
+internal
+THREAD_POOL_TASK_FUNC(lnk_process_c13_data_task)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena,1);
+
+  U64 obj_idx = task_id;
+  LNK_ProcessC13DataTask *task = raw_task;
+  CV_DebugS debug_s = task->debug_s_arr[obj_idx];
+
+  // parse checksum data
+  String8List checksum_data = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_FileChksms);
+  CV_ChecksumList checksum_list = cv_c13_parse_checksum_data_list(scratch.arena, checksum_data);
+
+  // get strings sub-section
+  String8 string_data = cv_string_table_from_debug_s(debug_s);
+
+  // collect source file names from checksum headers
+  String8List source_file_names_list = cv_c13_collect_source_file_names(arena, checksum_list, string_data);
+
+  // relocate checksum data
+  cv_c13_patch_string_offsets_in_checksum_list(checksum_list, string_data, task->string_data_base_offset, task->string_ht);
+
+  // get module sub-sections
+  PDB_DbiModule *mod = task->dbi_mod_arr[obj_idx];
+  String8 mod_c13_data = dbi_module_read_c13_data(scratch.arena, task->msf, mod);
+  CV_DebugS mod_debug_s = cv_parse_debug_s_c13(scratch.arena, mod_c13_data);
+
+  // relocate line and frame data
+  String8List *mod_checksum_data = cv_sub_section_ptr_from_debug_s(&mod_debug_s, CV_C13SubSectionKind_FileChksms);
+  U64 checksum_base = mod_checksum_data->total_size;
+  B32 is_checksum_patch_needed = checksum_base > 0;
+  if (is_checksum_patch_needed) {
+    String8List line_data = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_Lines);
+    String8List frame_data = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_FrameData);
+    cv_c13_patch_checksum_offsets_in_line_data_list(line_data, checksum_base);
+    cv_c13_patch_checksum_offsets_in_frame_data_list(frame_data, checksum_base);
+  }
+
+  // push obj c13 data to module
+  cv_debug_s_concat_in_place(&mod_debug_s, &debug_s);
+
+  // serialize c13 data
+  B32 include_sig = 0;
+  String8List c13_data = cv_data_c13_from_debug_s(arena, &mod_debug_s, include_sig);
+
+  // store for later pass
+  task->c13_data_arr[obj_idx] = c13_data;
+  task->source_file_names_list_arr[obj_idx] = source_file_names_list;
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+internal LNK_ProcessedCodeViewC13Data
+lnk_process_c13_data(TP_Context *tp, TP_Arena *arena, U64 obj_count, CV_DebugS *debug_s_arr, U64 string_data_base_offset, CV_StringHashTable string_ht, MSF_Context *msf, PDB_DbiModule **mod_arr)
+{
+  ProfBeginFunction();
+
+  LNK_ProcessC13DataTask task = {0};
+  task.debug_s_arr = debug_s_arr;
+  task.msf = msf;
+  task.dbi_mod_arr = mod_arr;
+  task.c13_data_arr = push_array_no_zero(arena->v[0], String8List, obj_count);
+  task.source_file_names_list_arr = push_array_no_zero(arena->v[0], String8List, obj_count);
+  task.string_data_base_offset =
string_data_base_offset; + task.string_ht = string_ht; + tp_for_parallel(tp, arena, obj_count, lnk_process_c13_data_task, &task); + + // fill out result + LNK_ProcessedCodeViewC13Data result = {0}; + result.data_list_arr = task.c13_data_arr; + result.source_file_names_list_arr = task.source_file_names_list_arr; + + ProfEnd(); + return result; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_write_module_data_task) +{ + U64 obj_idx = task_id; + LNK_WriteModuleDataTask *task = raw_task; + + PDB_DbiModule *mod = task->mod_arr[obj_idx]; + String8List sym_data = task->symbol_data_arr[obj_idx]; + String8List c11_data = task->c11_data_list_arr[obj_idx]; + String8List c13_data = task->c13_data_list_arr[obj_idx]; + String8List globrefs = task->globrefs_arr[obj_idx]; + + U32 sym_data_size32 = safe_cast_u32(sym_data.total_size); + U32 c11_data_size32 = safe_cast_u32(c11_data.total_size); + U32 c13_data_size32 = safe_cast_u32(c13_data.total_size); + U32 globrefs_size32 = safe_cast_u32(globrefs.total_size); + + // layout module data + String8List module_data = {0}; + str8_list_concat_in_place(&module_data, &sym_data); + str8_list_concat_in_place(&module_data, &c11_data); + str8_list_concat_in_place(&module_data, &c13_data); + str8_list_concat_in_place(&module_data, &globrefs); + + // make stream has enough memory so it doens't trigger memory allocations in MSF + // during multi-thread write + MSF_UInt stream_pos = msf_stream_get_pos(task->msf, mod->sn); + if (stream_pos != 0) { + Assert(!"stream must be at start position"); + } + MSF_UInt stream_cap = msf_stream_get_cap(task->msf, mod->sn); + if (stream_cap < module_data.total_size) { + Assert(!"not enough bytes in destination stream to copy module data"); + } + + // write data + B32 is_write_ok = msf_stream_write_list(task->msf, mod->sn, module_data); + + // update module data sizes + if (is_write_ok) { + mod->sym_data_size = sym_data_size32; + mod->c11_data_size = c11_data_size32; + mod->c13_data_size = c13_data_size32; + 
mod->globrefs_size = globrefs_size32; + } else { + // TODO: error handle + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_cv_symbol_ptr_array_hasher) +{ + LNK_CvSymbolPtrArrayHasher *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + for (U64 symbol_idx = range.min; symbol_idx < range.max; ++symbol_idx) { + task->hash_arr[symbol_idx] = XXH3_64bits(task->arr[symbol_idx]->data.data.str, task->arr[symbol_idx]->data.data.size); + } +} + +internal U64 * +lnk_hash_cv_symbol_ptr_arr(TP_Context *tp, Arena *arena, CV_SymbolPtrArray arr) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + LNK_CvSymbolPtrArrayHasher task = {0}; + task.hash_arr = push_array_no_zero(arena, U64, arr.count); + task.arr = arr.v; + task.range_arr = tp_divide_work(scratch.arena, arr.count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_cv_symbol_ptr_array_hasher, &task); + + scratch_end(scratch); + ProfEnd(); + return task.hash_arr; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_push_dbi_sec_contrib_task) +{ + U64 obj_idx = task_id; + LNK_PushDbiSecContribTaskData *task = raw_task; + LNK_Section **sect_id_map = task->sect_id_map; + PDB_DbiModule *mod = task->mod_arr[obj_idx]; + LNK_Obj *obj = &task->obj_arr[obj_idx]; + PDB_DbiSectionContribList *dst_list = &task->sc_list[obj_idx]; + + // TODO: use chunked lists for SC + + // TODO: put back unused nodes + PDB_DbiSectionContribNode *sc_arr = push_array_no_zero(arena, PDB_DbiSectionContribNode, obj->sect_count); + U64 sc_count = 0; + + for (U64 chunk_idx = 0; chunk_idx < obj->sect_count; ++chunk_idx) { + LNK_Chunk *chunk = &obj->chunk_arr[chunk_idx]; + + if (!chunk || lnk_chunk_is_discarded(chunk)) { + continue; + } + + LNK_Section *sect = lnk_sect_from_chunk_ref(task->sect_id_map, chunk->ref); + if (!sect->has_layout) { + continue; + } + + // query chunk info + ISectOff chunk_sc = lnk_sc_from_chunk_ref (sect_id_map, chunk->ref); + String8 chunk_data = lnk_data_from_chunk_ref (sect_id_map, chunk->ref); + 
LNK_Section *chunk_sect = lnk_sect_from_chunk_ref (sect_id_map, chunk->ref); + U64 chunk_size = lnk_file_size_from_chunk_ref(sect_id_map, chunk->ref); + + // compute chunk CRC + U32 data_crc = update_crc32(0, chunk_data.str, chunk_data.size); + U32 reloc_crc = 0; // TODO: compute CRC for relocations block + + // fill out SC + PDB_DbiSectionContribNode *sc = sc_arr + sc_count++; + sc->data.base.sec = safe_cast_u16(chunk_sc.isect); + sc->data.base.pad0 = 0; + sc->data.base.sec_off = chunk_sc.off; + sc->data.base.size = safe_cast_u32(chunk_size); + sc->data.base.flags = chunk_sect->flags; + sc->data.base.mod = mod->imod; + sc->data.base.pad1 = 0; + sc->data.data_crc = data_crc; + sc->data.reloc_crc = reloc_crc; + + dbi_sec_contrib_list_push_node(dst_list, sc); + } + + // Mod1::fUpdateSecContrib + if (sc_count > 0) { + for (U64 sc_idx = 0; sc_idx < sc_count; ++sc_idx) { + if (sc_arr[sc_idx].data.base.flags & COFF_SectionFlag_CNT_CODE) { + mod->first_sc = sc_arr[sc_idx].data; + break; + } + } + } +} + +//////////////////////////////// + +internal +THREAD_POOL_TASK_FUNC(lnk_build_pdb_public_symbols_task) +{ + U64 bucket_idx = task_id; + LNK_BuildPublicSymbolsTaskData *task = raw_task; + + LNK_Section **sect_id_map = task->sect_id_map; + LNK_SymbolScopeIndex scope_idx = task->scope_idx; + LNK_SymbolList bucket = task->bucket_arr[scope_idx][bucket_idx]; + CV_SymbolList *pub_list = &task->pub_list_arr[bucket_idx]; + + for (LNK_SymbolNode *symbol_node = bucket.first; symbol_node != 0; symbol_node = symbol_node->next) { + LNK_Symbol *symbol = symbol_node->data; + + if (LNK_Symbol_IsDefined(symbol->type)) { + LNK_DefinedSymbol *defined_symbol = &symbol->u.defined; + if (defined_symbol->value_type == LNK_DefinedSymbolValue_Chunk) { + CV_Pub32Flags flags = 0; + if (defined_symbol->flags & LNK_DefinedSymbolFlag_IsFunc || defined_symbol->flags & LNK_DefinedSymbolFlag_IsThunk) { + flags |= CV_Pub32Flag_Function; + } + + U64 symbol_off = lnk_sect_off_from_symbol(sect_id_map, 
symbol); + U64 symbol_isect = lnk_isect_from_symbol(sect_id_map, symbol); + + U32 symbol_off32 = safe_cast_u32(symbol_off); + U16 symbol_isect16 = safe_cast_u16(symbol_isect); + + CV_SymbolNode *pub_node = cv_symbol_list_push(arena, pub_list); + pub_node->data = cv_make_pub32(arena, flags, symbol_off32, symbol_isect16, symbol->name); + } + } + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_gsi_hash_cv_list_task) +{ + ProfBeginFunction(); + + LNK_BuildPublicSymbolsTaskData *task = raw_task; + Rng1U64 range = task->symbol_ranges[task_id]; + + for (U64 symbol_idx = range.min; symbol_idx < range.max; ++symbol_idx) { + CV_Symbol *symbol = &task->symbols.v[symbol_idx]->data; + String8 name = cv_name_from_symbol(symbol->kind, symbol->data); + task->hashes[symbol_idx] = gsi_hash(task->gsi, name); + } + + ProfEnd(); +} + +internal void +lnk_build_pdb_public_symbols(TP_Context *tp, + TP_Arena *arena, + LNK_SymbolTable *symtab, + LNK_Section **sect_id_map, + PDB_PsiContext *psi, + LNK_SymbolScopeIndex scope_idx) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + ProfBegin("Make Public Symbols"); + LNK_BuildPublicSymbolsTaskData task = {0}; + task.sect_id_map = sect_id_map; + task.scope_idx = scope_idx; + task.bucket_arr = symtab->buckets; + task.pub_list_arr = push_array(scratch.arena, CV_SymbolList, symtab->bucket_count[scope_idx]); + tp_for_parallel(tp, arena, symtab->bucket_count[scope_idx], lnk_build_pdb_public_symbols_task, &task); + ProfEnd(); + + CV_SymbolPtrArray symbols = cv_symbol_ptr_array_from_list(scratch.arena, tp, symtab->bucket_count[scope_idx], task.pub_list_arr); + + ProfBegin("GSI Push"); + gsi_push_many_arr(tp, psi->gsi, symbols.count, symbols.v); + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); +} + +internal String8List +lnk_build_pdb(TP_Context *tp, + TP_Arena *tp_arena, + OS_Guid guid, + COFF_MachineType machine, + COFF_TimeStamp time_stamp, + U32 age, + U64 page_size, + String8 pdb_name, + String8List lib_dir_list, + 
String8List natvis_list, + LNK_SymbolTable *symtab, + LNK_Section **sect_id_map, + U64 obj_count, + LNK_Obj *obj_arr, + CV_DebugS *debug_s_arr, + U64 total_symbol_input_count, + LNK_CodeViewSymbolsInput *symbol_inputs, + CV_SymbolListArray *parsed_symbols, + CV_DebugT types[CV_TypeIndexSource_COUNT]) +{ + ProfBegin("PDB"); + Temp scratch = scratch_begin(tp_arena->v, tp_arena->count); + + ProfBegin("Setup PDB Context"); + PDB_Context *pdb = pdb_alloc(page_size, machine, time_stamp, age, guid); + ProfEnd(); + + // move patched type data + // + // leaf data is stored in g_file_arena which has linker's life-time + // and this way we skip redundant leaf copy to the type server to make things faster + pdb_type_server_push_parallel(tp, pdb->type_servers[CV_TypeIndexSource_IPI], types[CV_TypeIndexSource_IPI]); + pdb_type_server_push_parallel(tp, pdb->type_servers[CV_TypeIndexSource_TPI], types[CV_TypeIndexSource_TPI]); + + ProfBegin("Collect Symbols for GSI"); + CV_SymbolList *gsi_list_arr = push_array(scratch.arena, CV_SymbolList, obj_count); + { + LNK_ProcessSymDataTaskData task = {0}; + task.gsi_list_arr = gsi_list_arr; + task.parsed_symbols = parsed_symbols; + tp_for_parallel(tp, 0, obj_count, lnk_filter_out_gsi_symbols_task, &task); + } + ProfEnd(); + + ProfBegin("Reserve DBI Modules"); + PDB_DbiModule **mod_arr = push_array(tp_arena->v[0], PDB_DbiModule *, obj_count); + for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) { + LNK_Obj *obj = obj_arr + obj_idx; + mod_arr[obj_idx] = dbi_push_module(pdb->dbi, obj->path, obj->lib_path); + + // we don't support symbol append + Assert(mod_arr[obj_idx]->sn == MSF_INVALID_STREAM_NUMBER); + } + ProfEnd(); + + ProfBegin("Build String Table"); + CV_StringHashTable string_ht = cv_dedup_string_tables(tp_arena, tp, obj_count, debug_s_arr); + cv_string_hash_table_assign_buffer_offsets(tp, string_ht); + U64 string_data_base_offset = pdb->info->strtab.size; + pdb_strtab_add_cv_string_hash_table(&pdb->info->strtab, string_ht); + 
ProfEnd(); + + ProfBegin("Build DBI Modules"); + { + TP_Temp temp = tp_temp_begin(tp_arena); + { + ProfBegin("Reloc Module Data"); + + ProfBegin("Serialize Symbols"); + String8List *serialized_symbol_data = push_array(scratch.arena, String8List, obj_count); + { + LNK_ProcessSymDataTaskData task = {0}; + task.symbol_inputs = symbol_inputs; + task.parsed_symbols = parsed_symbols; + task.mod_arr = mod_arr; + task.symbol_data_arr = serialized_symbol_data; + tp_for_parallel(tp, tp_arena, obj_count, lnk_process_sym_data_task, &task); + } + ProfEnd(); + + LNK_ProcessedCodeViewC11Data processed_c11 = lnk_process_c11_data(tp, tp_arena, obj_count, debug_s_arr, string_data_base_offset, string_ht, pdb->msf, mod_arr); + LNK_ProcessedCodeViewC13Data processed_c13 = lnk_process_c13_data(tp, tp_arena, obj_count, debug_s_arr, string_data_base_offset, string_ht, pdb->msf, mod_arr); + + ProfEnd(); + + // TODO: actually collect offsets and pass them here + ProfBegin("Build Empty Global Reference Array"); + String8List *globrefs_arr = push_array(tp_arena->v[0], String8List, obj_count); + for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) { + String8List *globrefs = &globrefs_arr[obj_idx]; + str8_serial_begin(tp_arena->v[0], globrefs); + Assert(globrefs->total_size == 0); + str8_serial_push_u32(tp_arena->v[0], globrefs, globrefs->total_size); + } + ProfEnd(); + + // reserve memory for module streams + ProfBegin("Reserve Modules Memory"); + for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) { + // compute number of bytes needed for module data + U64 mod_size = 0; + mod_size += serialized_symbol_data[obj_idx].total_size; + mod_size += processed_c11.data_list_arr[obj_idx].total_size; + mod_size += processed_c13.data_list_arr[obj_idx].total_size; + mod_size += globrefs_arr[obj_idx].total_size; + + U32 mod_size32 = safe_cast_u32(mod_size); + + // allocate stream for module + PDB_DbiModule *mod = mod_arr[obj_idx]; + mod->sn = msf_stream_alloc_ex(pdb->msf, mod_size32); + } + ProfEnd(); + 
+ // copy data to module streams + ProfBegin("Write Modules Data"); + LNK_WriteModuleDataTask write_module_data_task_data; + write_module_data_task_data.msf = pdb->msf; + write_module_data_task_data.mod_arr = mod_arr; + write_module_data_task_data.symbol_data_arr = serialized_symbol_data; + write_module_data_task_data.c11_data_list_arr = processed_c11.data_list_arr; + write_module_data_task_data.c13_data_list_arr = processed_c13.data_list_arr; + write_module_data_task_data.globrefs_arr = globrefs_arr; + tp_for_parallel(tp, 0, obj_count, lnk_write_module_data_task, &write_module_data_task_data); + ProfEnd(); + + // push source files per module info + ProfBegin("Build Source Files List"); + for (U64 obj_idx = 0; obj_idx < obj_count; ++obj_idx) { + PDB_DbiModule *mod = mod_arr[obj_idx]; + String8List source_file_list_scratch = processed_c13.source_file_names_list_arr[obj_idx]; + String8List source_file_list = str8_list_copy(pdb->dbi->arena, &source_file_list_scratch); + str8_list_concat_in_place(&mod->source_file_list, &source_file_list); + } + ProfEnd(); + } + tp_temp_end(temp); + } + ProfEnd(); + + ProfBegin("Make Proc Refs"); + { + LNK_ProcessSymDataTaskData task = {0}; + task.mod_arr = mod_arr; + task.gsi_list_arr = gsi_list_arr; + task.parsed_symbols = parsed_symbols; + tp_for_parallel(tp, tp_arena, obj_count, lnk_make_proc_refs_task, &task); + } + ProfEnd(); + + ProfBegin("Push Global Symbols"); + { + CV_SymbolPtrArray global_symbols = cv_symbol_ptr_array_from_list(scratch.arena, tp, obj_count, gsi_list_arr); + cv_dedup_symbol_ptr_array(tp, &global_symbols); + gsi_push_many_arr(tp, pdb->gsi, global_symbols.count, global_symbols.v); + } + ProfEnd(); + + ProfBegin("Build DBI Section Headers"); + { + LNK_Symbol *coff_sect_array_symbol = lnk_symbol_table_searchf(symtab, LNK_SymbolScopeFlag_Internal, LNK_COFF_SECT_HEADER_ARRAY_SYMBOL_NAME); + LNK_Chunk *coff_sect_chunk = lnk_defined_symbol_get_chunk(&coff_sect_array_symbol->u.defined); + String8 coff_sect_chunk_data 
= lnk_data_from_chunk_ref(sect_id_map, coff_sect_chunk->ref); + U64 coff_sect_count = coff_sect_chunk_data.size / sizeof(COFF_SectionHeader); + COFF_SectionHeader *coff_sect_ptr = (COFF_SectionHeader*)coff_sect_chunk_data.str; + for (COFF_SectionHeader *hdr_ptr = &coff_sect_ptr[0], *opl = hdr_ptr + coff_sect_count; + hdr_ptr < opl; + ++hdr_ptr) { + dbi_push_section(pdb->dbi, hdr_ptr); + } + } + ProfEnd(); + + ProfBegin("Build Section Contrib Map"); + { + LNK_PushDbiSecContribTaskData task = {0}; + task.obj_arr = obj_arr; + task.sect_id_map = sect_id_map; + task.mod_arr = mod_arr; + task.sc_list = push_array(scratch.arena, PDB_DbiSectionContribList, obj_count); + tp_for_parallel(tp, tp_arena, obj_count, lnk_push_dbi_sec_contrib_task, &task); + + dbi_sec_list_concat_arr(&pdb->dbi->sec_contrib_list, obj_count, task.sc_list); + } + ProfEnd(); + + ProfBegin("Build NatVis"); + { + String8Array natvis_file_path_arr = str8_array_from_list(scratch.arena, &natvis_list); + String8Array natvis_file_data_arr = os_data_from_file_path_parallel(tp, scratch.arena, natvis_file_path_arr); + + for (U64 i = 0; i < natvis_file_data_arr.count; ++i) { + String8 natvis_file_path = natvis_file_path_arr.v[i]; + String8 natvis_file_data = natvis_file_data_arr.v[i]; + + // did we read the file? 
+ if (natvis_file_data.size == 0) { + lnk_error(LNK_Warning_FileNotFound, "unable to open natvis file \"%S\"", natvis_file_path); + continue; + } + + // sanity check file extension or VS wont load NatVis + String8 ext = str8_skip_last_dot(natvis_file_path); + if (!str8_match(ext, str8_lit("natvis"), StringMatchFlag_CaseInsensitive)) { + lnk_error(LNK_Warning_Natvis, "Visual Studio expects .natvis extension: \"%S\"", natvis_file_path); + } + + // add natvis to PDB + PDB_SrcError error = pdb_add_src(pdb->info, pdb->msf, natvis_file_path, natvis_file_data, PDB_SrcComp_NULL); + if (error != PDB_SrcError_OK) { + lnk_error(LNK_Error_Natvis, "%S", pdb_string_from_src_error(error)); + } + } + } + ProfEnd(); + + lnk_build_pdb_public_symbols(tp, tp_arena, symtab, sect_id_map, pdb->psi, LNK_SymbolScopeIndex_Defined); + + pdb_build(tp, tp_arena, pdb, string_ht); + + MSF_Error msf_err = msf_build(pdb->msf); + if (msf_err != MSF_Error_OK) { + lnk_error(LNK_Error_UnableToSerializeMsf, "unable to serialize MSF: %s", msf_error_to_string(msf_err)); + } + + ProfBegin("Get Page Nodes"); + String8List page_data_list = msf_get_page_data_nodes(tp_arena->v[0], pdb->msf); + ProfEnd(); + + + // NOTE: linker is about to exit so we can skip memory release + // and let windows free memory since it does this faster +#if 0 + ProfBegin("Context Release"); + pdb_release(&pdb); + ProfEnd(); +#endif + + scratch_end(scratch); + ProfEnd(); + return page_data_list; +} + +//////////////////////////////// +// RAD Debug Info + +internal U64 +lnk_udt_name_hash_table_hash(String8 string) +{ + return XXH3_64bits(string.str, string.size); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_build_udt_name_hash_table_task) +{ + LNK_BuildUDTNameHashTableTask *task = raw_task; + + LNK_UDTNameBucket *new_bucket = 0; + + for (U64 leaf_idx = task->ranges[task_id].min; leaf_idx < task->ranges[task_id].max; ++leaf_idx) { + CV_Leaf leaf = cv_debug_t_get_leaf(task->debug_t, leaf_idx); + if (cv_is_udt(leaf.kind)) { + CV_UDTInfo 
udt_info = cv_get_udt_info(leaf.kind, leaf.data); + if (~udt_info.props & CV_TypeProp_FwdRef) { + if (!cv_is_udt_name_anon(udt_info.name)) { + String8 name = cv_name_from_udt_info(udt_info); + U64 hash = lnk_udt_name_hash_table_hash(name); + U64 best_idx = hash % task->buckets_cap; + U64 bucket_idx = best_idx; + + if (new_bucket == 0) { + new_bucket = push_array(arena, LNK_UDTNameBucket, 1); + new_bucket->name = name; + new_bucket->leaf_idx = leaf_idx; + } + + B32 is_inserted_or_updated = 0; + do { + retry:; + LNK_UDTNameBucket *curr_bucket = task->buckets[bucket_idx]; + + if (curr_bucket == 0) { + LNK_UDTNameBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&task->buckets[bucket_idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was inserted + is_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } else if (str8_match(curr_bucket->name, name, 0)) { + // there is more than one UDT with identical name, pick most recent and ignore others + + if (leaf_idx < curr_bucket->leaf_idx) { + LNK_UDTNameBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&task->buckets[bucket_idx], new_bucket, curr_bucket); + if (compare_bucket == curr_bucket) { + is_inserted_or_updated = 1; + break; + } + } else { + // don't need to update, more recent leaf is in the bucket + break; + } + + // another thread took the bucket... 
+ goto retry; + } + + // advance + bucket_idx = (bucket_idx + 1) % task->buckets_cap; + } while (bucket_idx != best_idx); + + if (is_inserted_or_updated) { + new_bucket = 0; + } + } + } + } + } +} + + +internal LNK_UDTNameBucket ** +lnk_udt_name_hash_table_from_debug_t(TP_Context *tp, + TP_Arena *arena, + CV_DebugT debug_t, + U64 *buckets_cap_out) +{ + Temp scratch = scratch_begin(&arena->v[0], 1); + LNK_BuildUDTNameHashTableTask task = {0}; + task.debug_t = debug_t; + task.buckets_cap = (U64)((F64)debug_t.count * 1.3); + task.buckets = push_array(arena->v[0], LNK_UDTNameBucket *, task.buckets_cap); + task.ranges = tp_divide_work(scratch.arena, debug_t.count, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, lnk_build_udt_name_hash_table_task, &task); + *buckets_cap_out = task.buckets_cap; + scratch_end(scratch); + return task.buckets; +} + +internal LNK_UDTNameBucket * +lnk_udt_name_hash_table_lookup(LNK_UDTNameBucket **buckets, U64 cap, String8 name) +{ + U64 hash = lnk_udt_name_hash_table_hash(name); + U64 best_idx = hash % cap; + U64 bucket_idx = best_idx; + do { + if (buckets[bucket_idx] == 0) { + break; + } + if (str8_match(buckets[bucket_idx]->name, name, 0)) { + return buckets[bucket_idx]; + } + bucket_idx = (bucket_idx + 1) % cap; + } while (bucket_idx != best_idx); + return 0; +} + +internal CV_TypeIndex +lnk_udt_name_hash_table_lookup_itype(LNK_UDTNameBucket **buckets, U64 cap, String8 name) +{ + LNK_UDTNameBucket *bucket = lnk_udt_name_hash_table_lookup(buckets, cap, name); + if (bucket != 0) { + return CV_MinComplexTypeIndex + bucket->leaf_idx; + } + return 0; +} + +internal RDIB_Type * +lnk_push_converted_codeview_type(Arena *arena, RDIB_TypeChunkList *list, RDIB_Type **itype_map, CV_TypeIndex itype) +{ + RDIB_Type *type = rdib_type_chunk_list_push(arena, list, 8196); + type->final_idx = 0; + type->itype = itype; + Assert(itype_map[itype] == 0); + itype_map[itype] = type; + return type; +} + +internal void +lnk_push_basic_itypes(Arena 
*arena, RDIB_DataModel data_model, RDIB_Type **itype_map, RDIB_TypeChunkList *rdib_types_list) +{ + RDI_TypeKind short_type = rdib_short_type_from_data_model(data_model); + RDI_TypeKind ushort_type = rdib_unsigned_short_type_from_data_model(data_model); + RDI_TypeKind int_type = rdib_int_type_from_data_model(data_model); + RDI_TypeKind uint_type = rdib_unsigned_int_type_from_data_model(data_model); + RDI_TypeKind long_type = rdib_long_type_from_data_model(data_model); + RDI_TypeKind ulong_type = rdib_unsigned_long_type_from_data_model(data_model); + RDI_TypeKind long_long_type = rdib_long_long_type_from_data_model(data_model); + RDI_TypeKind ulong_long_type = rdib_unsigned_long_long_type_from_data_model(data_model); + RDI_TypeKind ptr_type = rdib_pointer_size_t_type_from_data_model(data_model); + + struct { + char * name; + RDI_TypeKind kind_rdi; + CV_LeafKind kind_cv; + B32 make_pointer_near; + B32 make_pointer_32; + B32 make_pointer_64; + } table[] = { + { "void" , RDI_TypeKind_Void , CV_BasicType_VOID , 1, 1, 1 }, + { "HRESULT" , RDI_TypeKind_Handle , CV_BasicType_HRESULT , 0, 1, 1 }, + { "signed char" , RDI_TypeKind_Char8 , CV_BasicType_CHAR , 1, 1, 1 }, // TODO: we need Signed Char8 in RDI + { "short" , short_type , CV_BasicType_SHORT , 1, 1, 1 }, + { "long" , long_type , CV_BasicType_LONG , 1, 1, 1 }, + { "long long" , long_long_type , CV_BasicType_QUAD , 1, 1, 1 }, + { "__int128" , RDI_TypeKind_S128 , CV_BasicType_OCT , 1, 1, 1 }, // GCC/Clang type + { "unsigned char" , RDI_TypeKind_UChar8 , CV_BasicType_UCHAR , 1, 1, 1 }, + { "unsigned short" , ushort_type , CV_BasicType_USHORT , 1, 1, 1 }, + { "unsigned long" , ulong_type , CV_BasicType_ULONG , 1, 1, 1 }, + { "unsigned long long" , ulong_long_type , CV_BasicType_UQUAD , 1, 1, 1 }, + { "__uint128" , RDI_TypeKind_U128 , CV_BasicType_UOCT , 1, 1, 1 }, // GCC/Clang type + { "bool" , RDI_TypeKind_S8 , CV_BasicType_BOOL8 , 1, 1, 1 }, // TODO: we need a actual boolean type in RDI so we can format value as 
true/false. + { "__bool16" , RDI_TypeKind_S16 , CV_BasicType_BOOL16 , 1, 1, 1 }, // not real C type + { "__bool32" , RDI_TypeKind_S32 , CV_BasicType_BOOL32 , 1, 1, 1 }, // not real C type + { "float" , RDI_TypeKind_F32 , CV_BasicType_FLOAT32 , 1, 1, 1 }, + { "double" , RDI_TypeKind_F64 , CV_BasicType_FLOAT64 , 1, 1, 1 }, + { "long double" , RDI_TypeKind_F80 , CV_BasicType_FLOAT80 , 1, 1, 1 }, + { "__float128" , RDI_TypeKind_F128 , CV_BasicType_FLOAT128 , 1, 1, 1 }, // GCC/Clang type + { "__float48" , RDI_TypeKind_F48 , CV_BasicType_FLOAT48 , 1, 1, 1 }, // not real C type + { "__float32pp" , RDI_TypeKind_F32PP , CV_BasicType_FLOAT32PP , 1, 1, 1 }, // not real C type + { "_Complex float" , RDI_TypeKind_ComplexF32 , CV_BasicType_COMPLEX32 , 0, 0, 0 }, + { "_Complex double" , RDI_TypeKind_ComplexF64 , CV_BasicType_COMPLEX64 , 0, 0, 0 }, + { "_Complex long double" , RDI_TypeKind_ComplexF80 , CV_BasicType_COMPLEX80 , 0, 0, 0 }, + { "_Complex __float128" , RDI_TypeKind_ComplexF128, CV_BasicType_COMPLEX128 , 0, 0, 0 }, + { "__int8" , RDI_TypeKind_S8 , CV_BasicType_INT8 , 1, 1, 1 }, + { "__uint8" , RDI_TypeKind_U8 , CV_BasicType_UINT8 , 1, 1, 1 }, + { "__int16" , RDI_TypeKind_S16 , CV_BasicType_INT16 , 1, 1, 1 }, + { "__uint16" , RDI_TypeKind_U16 , CV_BasicType_UINT16 , 1, 1, 1 }, + { "int" , int_type , CV_BasicType_INT32 , 1, 1, 1 }, + { "unsigned int" , uint_type , CV_BasicType_UINT32 , 1, 1, 1 }, + { "__int64" , RDI_TypeKind_S64 , CV_BasicType_INT64 , 1, 1, 1 }, + { "__uint64" , RDI_TypeKind_U64 , CV_BasicType_UINT64 , 1, 1, 1 }, + { "__int128" , RDI_TypeKind_S128 , CV_BasicType_INT128 , 1, 1, 1 }, + { "__uint128" , RDI_TypeKind_U128 , CV_BasicType_UINT128 , 1, 1, 1 }, + { "char" , RDI_TypeKind_Char8 , CV_BasicType_RCHAR , 1, 1, 1 }, // always ASCII + { "wchar_t" , RDI_TypeKind_UChar16 , CV_BasicType_WCHAR , 1, 1, 1 }, // on windows always UTF-16 + { "char8_t" , RDI_TypeKind_Char8 , CV_BasicType_CHAR8 , 1, 1, 1 }, // always UTF-8 + { "char16_t" , RDI_TypeKind_Char16 , 
CV_BasicType_CHAR16 , 1, 1, 1 }, // always UTF-16 + { "char32_t" , RDI_TypeKind_Char32 , CV_BasicType_CHAR32 , 1, 1, 1 }, // always UTF-32 + { "__pointer" , ptr_type , CV_BasicType_PTR , 0, 0, 0 } + }; + + for (U64 i = 0; i < ArrayCount(table); ++i) { + U64 builtin_size; + if (table[i].kind_rdi == RDI_TypeKind_Void || table[i].kind_rdi == RDI_TypeKind_Handle) { + builtin_size = rdi_size_from_basic_type_kind(ptr_type); + } else { + builtin_size = rdi_size_from_basic_type_kind(table[i].kind_rdi); + } + + RDIB_Type *builtin = lnk_push_converted_codeview_type(arena, rdib_types_list, itype_map, table[i].kind_cv); + builtin->kind = table[i].kind_rdi; + builtin->builtin.name = str8_cstring(table[i].name); + builtin->builtin.size = builtin_size; + + RDIB_Type **wrapper = push_array(arena, RDIB_Type *, 1); + *wrapper = builtin; + + if (table[i].make_pointer_near) { + RDIB_Type *ptr_near = lnk_push_converted_codeview_type(arena, rdib_types_list, itype_map, table[i].kind_cv | 0x100); + ptr_near->kind = RDI_TypeKind_Ptr; + ptr_near->ptr.size = rdi_size_from_basic_type_kind(ptr_type); + ptr_near->ptr.type_ref = wrapper; + } + if (table[i].make_pointer_32) { + RDIB_Type *ptr_32 = lnk_push_converted_codeview_type(arena, rdib_types_list, itype_map, table[i].kind_cv | 0x400); + ptr_32->kind = RDI_TypeKind_Ptr; + ptr_32->ptr.size = 4; + ptr_32->ptr.type_ref = wrapper; + } + if (table[i].make_pointer_64) { + RDIB_Type *ptr_64 = lnk_push_converted_codeview_type(arena, rdib_types_list, itype_map, table[i].kind_cv | 0x600); + ptr_64->kind = RDI_TypeKind_Ptr; + ptr_64->ptr.size = 8; + ptr_64->ptr.type_ref = wrapper; + } + +#if 0 + RDIB_Type *ptr_far = lnk_push_converted_codeview_type(arena, rdib_types_list, itype_map, table[i].kind_cv | 0x200); + RDIB_Type *ptr_huge = lnk_push_converted_codeview_type(arena, rdib_types_list, itype_map, table[i].kind_cv | 0x300); + RDIB_Type *ptr_16_32 = lnk_push_converted_codeview_type(arena, rdib_types_list, itype_map, table[i].kind_cv | 0x500); + + 
ptr_far->kind = RDI_TypeKind_Ptr; + ptr_far->ptr.size = rdi_size_from_basic_type_kind(ptr_type); + ptr_far->ptr.type_ref = wrapper; + + ptr_huge->kind = RDI_TypeKind_Ptr; + ptr_huge->ptr.size = 4; + ptr_huge->ptr.type_ref = wrapper; + + + ptr_16_32->kind = RDI_TypeKind_Ptr; + ptr_16_32->ptr.size = 6; + ptr_16_32->ptr.type_ref = wrapper; +#endif + } +} + +internal RDIB_TypeRef +lnk_rdib_type_from_itype(LNK_ConvertTypesToRDI *task, CV_TypeIndex itype) +{ + if (itype < CV_MinComplexTypeIndex) { + // Check for supported CodeView pointer formats: + // - near 64 bit + // - 64 bit + // - 32 bit + AssertAlways((itype >> 8) == /* near */ 0x1 || + (itype >> 8) == /* 64 bit */ 0x6 || + (itype >> 8) == /* 32 bit */ 0x4 || + (itype >> 8) == 0); + } + + RDIB_TypeRef ref = &task->tpi_itype_map[0]; + if (itype < task->itype_ranges[CV_TypeIndexSource_TPI].max) { + CV_TypeIndex final_itype = itype; + if (itype > task->itype_ranges[CV_TypeIndexSource_TPI].min) { + CV_Leaf leaf = cv_debug_t_get_leaf(task->types[CV_TypeIndexSource_TPI], itype - task->itype_ranges[CV_TypeIndexSource_TPI].min); + if (cv_is_udt(leaf.kind)) { + CV_UDTInfo udt_info = cv_get_udt_info(leaf.kind, leaf.data); + if (udt_info.props & CV_TypeProp_FwdRef) { + String8 name = cv_name_from_udt_info(udt_info); + final_itype = lnk_udt_name_hash_table_lookup_itype(task->udt_name_buckets, task->udt_name_bucket_cap, name); + } + } + } + + ref = &task->tpi_itype_map[final_itype]; + } + + return ref; +} + +internal RDI_MemberKind +lnk_rdib_method_kind_from_cv_prop(CV_MethodProp prop) +{ + switch (prop) { + case CV_MethodProp_Vanilla: return RDI_MemberKind_Method; + case CV_MethodProp_Virtual: return RDI_MemberKind_VirtualMethod; + case CV_MethodProp_Static: return RDI_MemberKind_StaticMethod; + case CV_MethodProp_Friend: NotImplemented; + case CV_MethodProp_Intro: return RDI_MemberKind_VirtualMethod; + case CV_MethodProp_PureVirtual: return RDI_MemberKind_VirtualMethod; + case CV_MethodProp_PureIntro: return 
RDI_MemberKind_VirtualMethod; + } + return RDI_MemberKind_NULL; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_convert_types_to_rdi_task) +{ + ProfBeginFunction(); + LNK_ConvertTypesToRDI *task = raw_task; + + // upfront push output type array + U64 leaf_count = dim_1u64(task->ranges[task_id]); + rdib_type_chunk_list_reserve(arena, &task->rdib_types_lists[task_id], leaf_count); + + for(U64 leaf_idx = task->ranges[task_id].min; leaf_idx < task->ranges[task_id].max; ++leaf_idx) { + U64 itype = task->itype_ranges[CV_TypeIndexSource_TPI].min + leaf_idx; + CV_Leaf src = cv_debug_t_get_leaf(task->types[CV_TypeIndexSource_TPI], leaf_idx); + + switch (src.kind) { + case CV_LeafKind_MODIFIER: { + CV_LeafModifier *modifier = (CV_LeafModifier *) src.data.str; + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + dst->kind = RDI_TypeKind_Modifier; + dst->modifier.flags = rdi_type_modifier_flags_from_cv_modifier_flags(modifier->flags); + dst->modifier.type_ref = lnk_rdib_type_from_itype(task, modifier->itype); + } break; + case CV_LeafKind_POINTER: { + CV_LeafPointer *ptr = (CV_LeafPointer *) src.data.str; + CV_PointerKind ptr_kind = CV_PointerAttribs_ExtractKind(ptr->attribs); + CV_PointerMode ptr_mode = CV_PointerAttribs_ExtractMode(ptr->attribs); + U32 ptr_size = CV_PointerAttribs_ExtractSize(ptr->attribs); + (void)ptr_kind; + + // parse ahead type chain and squash modifiers + RDI_TypeModifierFlags modifier_flags = rdi_type_modifier_flags_from_cv_pointer_attribs(ptr->attribs); + CV_TypeIndex next_itype; + for (next_itype = ptr->itype; task->itype_ranges[CV_TypeIndexSource_TPI].min <= next_itype && next_itype < task->itype_ranges[CV_TypeIndexSource_TPI].max;) { + U64 next_leaf_idx = next_itype - task->itype_ranges[CV_TypeIndexSource_TPI].min; + CV_Leaf next_leaf = cv_debug_t_get_leaf(task->types[CV_TypeIndexSource_TPI], next_leaf_idx); + if (next_leaf.kind != CV_LeafKind_MODIFIER) { + break; + } + + // parse 
LF_MODIFIER + CV_LeafModifier *sym_modifier = (CV_LeafModifier *) next_leaf.data.str; + RDI_TypeModifierFlags flags = rdi_type_modifier_flags_from_cv_modifier_flags(sym_modifier->flags); + + // accumulate modifier flags + modifier_flags |= flags; + + // advance + next_itype = sym_modifier->itype; + } + + if (modifier_flags == 0) { + // No modifer just generate pointer type. + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + dst->kind = rdi_type_kind_from_pointer(ptr->attribs, ptr_mode); + dst->ptr.size = ptr_size; + dst->ptr.type_ref = lnk_rdib_type_from_itype(task, ptr->itype); + } else { + // CodeView embeds modifier in pointer struct, we don't have an equivalent + // so generate a modifier type in pointer slot and link with pointer type. + + RDIB_Type *ptr_type = rdib_type_chunk_list_push(arena, &task->rdib_types_lists[task_id], task->type_cap); + ptr_type->kind = rdi_type_kind_from_pointer(ptr->attribs, ptr_mode); + ptr_type->ptr.type_ref = lnk_rdib_type_from_itype(task, next_itype); + RDIB_Type **indirect_ptr_type = push_array(arena, RDIB_Type *, 1); + *indirect_ptr_type = ptr_type; + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + dst->kind = RDI_TypeKind_Modifier; + dst->modifier.flags = modifier_flags; + dst->modifier.type_ref = indirect_ptr_type; + } + } break; + case CV_LeafKind_PROCEDURE: { + CV_LeafProcedure *proc = (CV_LeafProcedure *) src.data.str; + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + dst->kind = RDI_TypeKind_Function; + dst->func.return_type = lnk_rdib_type_from_itype(task, proc->ret_itype); + dst->func.params_type = lnk_rdib_type_from_itype(task, proc->arg_itype); + } break; + case CV_LeafKind_MFUNCTION: { + CV_LeafMFunction *mfunc = (CV_LeafMFunction *) src.data.str; + B32 is_static_method = mfunc->this_itype == 0; 
+ RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + + if (is_static_method) { + dst->kind = RDI_TypeKindExt_StaticMethod; + dst->static_method.class_type = lnk_rdib_type_from_itype(task, mfunc->class_itype); + dst->static_method.return_type = lnk_rdib_type_from_itype(task, mfunc->ret_itype); + dst->static_method.params_type = lnk_rdib_type_from_itype(task, mfunc->arg_itype); + } else { + dst->kind = RDI_TypeKind_Method; + dst->method.class_type = lnk_rdib_type_from_itype(task, mfunc->class_itype); + dst->method.this_type = lnk_rdib_type_from_itype(task, mfunc->this_itype); + dst->method.return_type = lnk_rdib_type_from_itype(task, mfunc->ret_itype); + dst->method.params_type = lnk_rdib_type_from_itype(task, mfunc->arg_itype); + } + } break; + case CV_LeafKind_BITFIELD: { + CV_LeafBitField *bitfield = (CV_LeafBitField *) src.data.str; + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + dst->kind = RDI_TypeKind_Bitfield; + dst->bitfield.off = bitfield->pos; + dst->bitfield.count = bitfield->len; + dst->bitfield.value_type = lnk_rdib_type_from_itype(task, bitfield->itype); + } break; + case CV_LeafKind_ARRAY: { + CV_LeafArray *array = (CV_LeafArray *) src.data.str; + CV_NumericParsed size = cv_numeric_from_data_range(src.data.str + sizeof(CV_LeafArray), src.data.str + src.data.size); + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + dst->kind = RDI_TypeKind_Array; + dst->array.entry_type = lnk_rdib_type_from_itype(task, array->entry_itype); + dst->array.size = cv_u64_from_numeric(&size); + } break; + case CV_LeafKind_CLASS: + case CV_LeafKind_STRUCTURE: { + CV_LeafStruct *udt = (CV_LeafStruct *) src.data.str; + CV_NumericParsed size = cv_numeric_from_data_range(src.data.str + sizeof(CV_LeafStruct), src.data.str + src.data.size); + + String8 name; + 
String8 link_name; + if (udt->props & CV_TypeProp_HasUniqueName) { + name = str8_cstring_capped(src.data.str + sizeof(CV_LeafStruct) + size.encoded_size, src.data.str + src.data.size); + link_name = str8_cstring_capped_reverse(name.str + name.size + 1, src.data.str + src.data.size); + } else { + name = str8_cstring_capped_reverse(src.data.str + sizeof(CV_LeafStruct) + size.encoded_size, src.data.str + src.data.size); + link_name = name; + } + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_struct_lists[task_id], task->tpi_itype_map, itype); + dst->udt.name = name; + dst->udt.link_name = link_name; + dst->udt.members = lnk_rdib_type_from_itype(task, udt->field_itype); + dst->udt.struct_type.size = cv_u64_from_numeric(&size); + dst->udt.struct_type.derived = lnk_rdib_type_from_itype(task, udt->derived_itype); + dst->udt.struct_type.vtshape = lnk_rdib_type_from_itype(task, udt->vshape_itype); + + if (udt->props & CV_TypeProp_FwdRef) { + dst->kind = src.kind == CV_LeafKind_CLASS ? RDI_TypeKind_IncompleteClass : RDI_TypeKind_IncompleteStruct; + } else { + dst->kind = src.kind == CV_LeafKind_CLASS ? 
RDI_TypeKind_Class : RDI_TypeKind_Struct; + } + } break; + case CV_LeafKind_CLASS2: + case CV_LeafKind_STRUCT2: { + CV_LeafStruct2 *udt = (CV_LeafStruct2 *) src.data.str; + CV_NumericParsed size = cv_numeric_from_data_range(src.data.str + sizeof(CV_LeafStruct2), src.data.str + src.data.size); + + String8 name; + String8 link_name; + if (udt->props & CV_TypeProp_HasUniqueName) { + name = str8_cstring_capped(src.data.str + sizeof(CV_LeafStruct2) + size.encoded_size, src.data.str + src.data.size); + link_name = str8_cstring_capped_reverse(name.str + name.size + 1, src.data.str + src.data.size); + } else { + name = str8_cstring_capped_reverse(src.data.str + sizeof(CV_LeafStruct2) + size.encoded_size, src.data.str + src.data.size); + link_name = name; + } + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_struct_lists[task_id], task->tpi_itype_map, itype); + dst->udt.name = name; + dst->udt.link_name = link_name; + dst->udt.members = lnk_rdib_type_from_itype(task, udt->field_itype); + dst->udt.struct_type.size = cv_u64_from_numeric(&size); + dst->udt.struct_type.derived = lnk_rdib_type_from_itype(task, udt->derived_itype); + dst->udt.struct_type.vtshape = lnk_rdib_type_from_itype(task, udt->vshape_itype); + + if (udt->props & CV_TypeProp_FwdRef) { + dst->kind = src.kind == CV_LeafKind_CLASS2 ? RDI_TypeKind_IncompleteClass : RDI_TypeKind_IncompleteStruct; + } else { + dst->kind = src.kind == CV_LeafKind_CLASS2 ? 
RDI_TypeKind_Class : RDI_TypeKind_Struct; + } + } break; + case CV_LeafKind_UNION: { + CV_LeafUnion *udt = (CV_LeafUnion *) src.data.str; + CV_NumericParsed size = cv_numeric_from_data_range(src.data.str + sizeof(CV_LeafUnion), src.data.str + src.data.size); + + String8 name; + String8 link_name; + if (udt->props & CV_TypeProp_HasUniqueName) { + name = str8_cstring_capped(src.data.str + sizeof(CV_LeafUnion) + size.encoded_size, src.data.str + src.data.size); + link_name = str8_cstring_capped_reverse(name.str + name.size + 1, src.data.str + src.data.size); + } else { + name = str8_cstring_capped_reverse(src.data.str + sizeof(CV_LeafUnion) + size.encoded_size, src.data.str + src.data.size); + link_name = name; + } + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_union_lists[task_id], task->tpi_itype_map, itype); + dst->udt.name = name; + dst->udt.link_name = link_name; + dst->udt.members = lnk_rdib_type_from_itype(task, udt->field_itype); + dst->udt.union_type.size = cv_u64_from_numeric(&size); + + if (udt->props & CV_TypeProp_FwdRef) { + dst->kind = RDI_TypeKind_IncompleteUnion; + } else { + dst->kind = RDI_TypeKind_Union; + } + } break; + case CV_LeafKind_ENUM: { + CV_LeafEnum *udt = (CV_LeafEnum *) src.data.str; + + String8 name; + String8 link_name; + if (udt->props & CV_TypeProp_HasUniqueName) { + name = str8_cstring_capped(src.data.str + sizeof(*udt), src.data.str + src.data.size); + link_name = str8_cstring_capped_reverse(name.str + name.size + 1, src.data.str + src.data.size); + } else { + name = str8_cstring_capped_reverse(src.data.str + sizeof(*udt), src.data.str + src.data.size); + link_name = name; + } + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_enum_lists[task_id], task->tpi_itype_map, itype); + dst->kind = (RDI_TypeKindExt)RDI_TypeKind_Enum; + dst->udt.name = name; + dst->udt.link_name = link_name; + dst->udt.members = lnk_rdib_type_from_itype(task, udt->field_itype); + 
dst->udt.enum_type.base_type = lnk_rdib_type_from_itype(task, udt->base_itype); + + if (udt->props & CV_TypeProp_FwdRef) { + dst->kind = RDI_TypeKind_IncompleteEnum; + } else { + dst->kind = (RDI_TypeKindExt)RDI_TypeKind_Enum; + } + } break; + case CV_LeafKind_ARGLIST: { + CV_LeafArgList *arglist = (CV_LeafArgList *) src.data.str; + CV_TypeIndex *itypes = (CV_TypeIndex *) (arglist + 1); + + if (arglist->count * sizeof(CV_TypeIndex) + sizeof(CV_LeafArgList) > src.data.size) { + AssertAlways("error: ill-formed LF_ARGLIST"); + break; + } + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_params_lists[task_id], task->tpi_itype_map, itype); + dst->kind = RDI_TypeKindExt_Params; // there is no Params kind in RDI + dst->params.count = arglist->count; + dst->params.types = push_array(arena, RDIB_TypeRef, arglist->count); + for (U64 param_idx = 0; param_idx < arglist->count; ++param_idx) { + // strange way to encode variadic params, when outside LF_ARGLIST LF_NOTYPE actually means null... 
+ if (itypes[param_idx] == CV_LeafKind_NOTYPE) { + dst->params.types[param_idx] = task->variadic_type_ref; + } else { + dst->params.types[param_idx] = lnk_rdib_type_from_itype(task, itypes[param_idx]); + } + } + } break; + case CV_LeafKind_FIELDLIST: { + RDIB_UDTMemberChunkList *rdib_member_list; + RDIB_TypeChunkList *rdib_member_types; + B32 is_enum = sizeof(CV_LeafKind) <= src.data.size && (*(CV_LeafKind *)src.data.str == CV_LeafKind_ENUMERATE); + if (is_enum) { + rdib_member_list = &task->rdib_enum_members_lists[worker_id]; + rdib_member_types = &task->rdib_types_enum_members_lists[worker_id]; + } else { + rdib_member_list = &task->rdib_udt_members_lists[worker_id]; + rdib_member_types = &task->rdib_types_udt_members_lists[worker_id]; + } + + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, rdib_member_types, task->tpi_itype_map, itype); + dst->kind = RDI_TypeKindExt_Members; + + for (U64 cursor = 0; cursor + sizeof(CV_LeafKind) <= src.data.size; ) { + CV_LeafKind field_kind = *(CV_LeafKind *) (src.data.str + cursor); + cursor += sizeof(field_kind); + + // do we have bytes to read? 
+ U64 header_size = cv_header_struct_size_from_leaf_kind(field_kind); + if (cursor + header_size > src.data.size) { + break; + } + + switch (field_kind) { + case CV_LeafKind_INDEX: { + CV_LeafIndex *index = (CV_LeafIndex *) (src.data.str + cursor); + cursor += sizeof(*index); + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB member list pointer + member->kind = RDI_MemberKindExt_MemberListPointer; + member->member_list_pointer = lnk_rdib_type_from_itype(task, index->itype); + } break; + case CV_LeafKind_MEMBER: { + // prase CodeView struct/class/union data member + CV_LeafMember *leaf_member = (CV_LeafMember *) (src.data.str + cursor); + CV_NumericParsed offset = cv_numeric_from_data_range((U8 *)(leaf_member + 1), src.data.str + src.data.size); + String8 name = str8_cstring_capped(src.data.str + cursor + sizeof(CV_LeafMember) + offset.encoded_size, src.data.str + src.data.size); + cursor += sizeof(CV_LeafMember); + cursor += offset.encoded_size; + cursor += name.size + 1; + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB data member + member->kind = RDI_MemberKind_DataField; + member->data_field.name = name; + member->data_field.type_ref = lnk_rdib_type_from_itype(task, leaf_member->itype); + member->data_field.offset = cv_u64_from_numeric(&offset); + } break; + case CV_LeafKind_STMEMBER: { + // parse CodeView static member + CV_LeafStMember *st_member = (CV_LeafStMember *) (src.data.str + cursor); + String8 name = str8_cstring_capped(st_member + 1, src.data.str + src.data.size); + cursor += sizeof(CV_LeafStMember); + cursor += name.size + 1; + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + 
rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB static member + member->kind = RDI_MemberKind_StaticData; + member->static_data.name = name; + member->static_data.type_ref = lnk_rdib_type_from_itype(task, st_member->itype); + } break; + case CV_LeafKind_METHOD: { + // parse CodeView over-loaded method + CV_LeafMethod *method = (CV_LeafMethod *) (src.data.str + cursor); + String8 name = str8_cstring_capped(method + 1, src.data.str + src.data.size); + cursor += sizeof(CV_LeafMethod); + cursor += name.size + 1; + + if (contains_1u64(task->itype_ranges[CV_TypeIndexSource_TPI], method->list_itype)) { + U64 method_list_leaf_idx = method->list_itype - task->itype_ranges[CV_TypeIndexSource_TPI].min; + CV_Leaf method_list_leaf = cv_debug_t_get_leaf(task->types[CV_TypeIndexSource_TPI], method_list_leaf_idx); + if (method_list_leaf.kind == CV_LeafKind_METHODLIST) { + for (U64 cursor = 0; cursor + sizeof(CV_LeafMethodListMember) <= method_list_leaf.data.size; ) { + // parse CodeView method overload info + CV_LeafMethodListMember *list_member = (CV_LeafMethodListMember *) (method_list_leaf.data.str + cursor); + CV_MethodProp prop = CV_FieldAttribs_ExtractMethodProp(list_member->attribs); + cursor += sizeof(CV_LeafMethodListMember); + U32 vftable_offset = 0; + if (prop == CV_MethodProp_Intro || prop == CV_MethodProp_PureIntro) { + str8_deserial_read_struct(src.data, cursor, &vftable_offset); + cursor += sizeof(vftable_offset); + } + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB method + member->kind = RDI_MemberKind_Method; + member->method.kind = lnk_rdib_method_kind_from_cv_prop(prop); + member->method.name = name; + member->method.type_ref = lnk_rdib_type_from_itype(task, list_member->itype); + member->method.vftable_offset = vftable_offset; + } + } else { + Assert(!"error: expected 
LF_METHODLIST"); + } + } + } break; + case CV_LeafKind_ONEMETHOD: { + // parse CodeView method + CV_LeafOneMethod *one_method = (CV_LeafOneMethod *) (src.data.str + cursor); + CV_MethodProp prop = CV_FieldAttribs_ExtractMethodProp(one_method->attribs); + cursor += sizeof(CV_LeafOneMethod); + U32 vftable_offset = 0; + if (prop == CV_MethodProp_Intro || prop == CV_MethodProp_PureIntro) { + str8_deserial_read_struct(src.data, cursor, &vftable_offset); + cursor += sizeof(vftable_offset); + } + String8 name = str8_cstring_capped(src.data.str + cursor, src.data.str + src.data.size); + cursor += name.size + 1; + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB member + member->kind = RDI_MemberKind_Method; + member->method.kind = lnk_rdib_method_kind_from_cv_prop(prop); + member->method.name = name; + member->method.type_ref = lnk_rdib_type_from_itype(task, one_method->itype); + member->method.vftable_offset = vftable_offset; + } break; + case CV_LeafKind_NESTTYPE: { + // parse CodeView nested type + CV_LeafNestType *nest_type = (CV_LeafNestType *) (src.data.str + cursor); + String8 name = str8_cstring_capped(nest_type + 1, src.data.str + src.data.size); + cursor += sizeof(CV_LeafNestType); + cursor += name.size + 1; + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB nested type member + member->kind = RDI_MemberKind_NestedType; + member->nested_type.name = name; + member->nested_type.type_ref = lnk_rdib_type_from_itype(task, nest_type->itype); + } break; + case CV_LeafKind_NESTTYPEEX: { + // parse CodeView nested type extended + CV_LeafNestTypeEx *nest_type_ex = (CV_LeafNestTypeEx *) (src.data.str + cursor); + String8 name = str8_cstring_capped(nest_type_ex + 1, 
src.data.str + src.data.size); + cursor += sizeof(CV_LeafNestTypeEx); + cursor += name.size + 1; + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB nested type member + member->kind = RDI_MemberKind_NestedType; + member->nested_type.name = name; + member->nested_type.type_ref = lnk_rdib_type_from_itype(task, nest_type_ex->itype); + } break; + case CV_LeafKind_BCLASS: { + // parse CodeView base class member + CV_LeafBClass *bclass = (CV_LeafBClass *) (src.data.str + cursor); + CV_NumericParsed offset = cv_numeric_from_data_range((U8 *)(bclass + 1), src.data.str + src.data.size); + cursor += sizeof(CV_LeafBClass); + cursor += offset.encoded_size; + + U64 offset64 = cv_u64_from_numeric(&offset); + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB base class member + member->kind = RDI_MemberKind_Base; + member->base_class.type_ref = lnk_rdib_type_from_itype(task, bclass->itype); + member->base_class.offset = offset64; + } break; + case CV_LeafKind_VBCLASS: + case CV_LeafKind_IVBCLASS: { + // parse CodeView virtual base class + CV_LeafVBClass *vbclass = (CV_LeafVBClass *) (src.data.str + cursor); + CV_NumericParsed vbptr_off = cv_numeric_from_data_range(src.data.str + cursor + sizeof(*vbclass), src.data.str + src.data.size); + CV_NumericParsed vtable_off = cv_numeric_from_data_range(src.data.str + cursor + sizeof(*vbclass) + vbptr_off.encoded_size, src.data.str + src.data.size); + cursor += sizeof(CV_LeafVBClass); + cursor += vbptr_off.encoded_size; + cursor += vtable_off.encoded_size; + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + 
// fill out RDIB virtual base class member + member->kind = RDI_MemberKind_VirtualBase; + member->virtual_base_class.type_ref = lnk_rdib_type_from_itype(task, vbclass->itype); + member->virtual_base_class.vbptr_off = cv_u64_from_numeric(&vbptr_off); + member->virtual_base_class.vtable_off = cv_u64_from_numeric(&vtable_off); + } break; + case CV_LeafKind_VFUNCTAB: { + // parse CodeView virtual function table + CV_LeafVFuncTab *vfunc_tab = (CV_LeafVFuncTab *) (src.data.str + cursor); + cursor += sizeof(*vfunc_tab); + + // TODO: we don't have an equivalent in RDI + } break; + case CV_LeafKind_ENUMERATE: { + // parse CodeView enum member + CV_LeafEnumerate *enumerate = (CV_LeafEnumerate *) (src.data.str + cursor); + CV_NumericParsed value = cv_numeric_from_data_range((U8 *) (enumerate + 1), src.data.str + src.data.size); + String8 name = str8_cstring_capped(src.data.str + cursor + sizeof(CV_LeafEnumerate) + value.encoded_size, src.data.str + src.data.size); + cursor += sizeof(CV_LeafEnumerate); + cursor += value.encoded_size; + cursor += name.size + 1; + + // push new node + RDIB_UDTMember *member = rdib_udt_member_chunk_list_push(arena, rdib_member_list, task->udt_cap); + rdib_udt_member_list_push_node(&dst->members.list, member); + + // fill out RDIB enum member + member->kind = RDI_MemberKind_NULL; + member->enumerate.name = name; + member->enumerate.value = cv_u64_from_numeric(&value); + } break; + default: InvalidPath; + } + + cursor = AlignPow2(cursor, 4); + } + } break; + case CV_LeafKind_METHODLIST: { + // see CV_LeafKind_METHOD + } break; + case CV_LeafKind_LABEL: { + // ??? + } break; + case CV_LeafKind_VTSHAPE: { + RDIB_Type *dst = lnk_push_converted_codeview_type(arena, &task->rdib_types_lists[task_id], task->tpi_itype_map, itype); + dst->kind = RDI_TypeKindExt_VirtualTable; + // ??? + } break; + case CV_LeafKind_VFTABLE: { + // ??? 
+ } break; + default: InvalidPath; break; + } + +#undef push_converted_type + } + ProfEnd(); +} + +internal U64 +lnk_src_file_hash_cv(String8 normal_full_path, CV_C13ChecksumKind checksum_kind, String8 checksum) +{ + XXH3_state_t state; + XXH3_INITSTATE(&state); + XXH3_64bits_reset(&state); + XXH3_64bits_update(&state, normal_full_path.str, normal_full_path.size); + XXH3_64bits_update(&state, &checksum_kind, sizeof(checksum_kind)); + XXH3_64bits_update(&state, checksum.str, checksum.size); + XXH64_hash_t result = XXH3_64bits_digest(&state); + return result; +} + +internal String8 +lnk_normalize_src_file_path(Arena *arena, String8 file_path) +{ + Temp scratch = scratch_begin(&arena, 1); + String8 result = file_path; + result = lower_from_str8(scratch.arena, result); + result = path_convert_slashes(scratch.arena, result, PathStyle_UnixAbsolute); + result = push_str8_copy(arena, result); + scratch_end(scratch); + return result; +} + +internal LNK_SourceFileBucket * +lnk_src_file_hash_table_lookup_slot(LNK_SourceFileBucket **buckets, + U64 cap, + U64 hash, + String8 normal_path, + CV_C13ChecksumKind checksum_kind, + String8 checksum) +{ + U64 best_idx = hash % cap; + U64 bucket_idx = best_idx; + + RDIB_SourceFile temp = {0}; + temp.normal_full_path = normal_path; + temp.checksum_kind = checksum_kind; + temp.checksum = checksum; + + do { + if (buckets[bucket_idx] == 0) { + break; + } + if (rdib_source_file_match(buckets[bucket_idx]->src_file, &temp, operating_system_from_context())) { + return buckets[bucket_idx]; + } + bucket_idx = (bucket_idx + 1) % cap; + } while (bucket_idx != best_idx); + + return 0; +} + + +internal LNK_SourceFileBucket * +lnk_src_file_insert_or_update(LNK_SourceFileBucket **buckets, U64 cap, U64 hash, LNK_SourceFileBucket *new_bucket) +{ + LNK_SourceFileBucket *result = 0; + + U64 best_idx = hash % cap; + U64 idx = best_idx; + do { + retry:; + LNK_SourceFileBucket *curr_bucket = buckets[idx]; + + if (curr_bucket == 0) { + LNK_SourceFileBucket 
*compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was inserted + result = curr_bucket; + break; + } + + // another thread took the bucket... + goto retry; + } else if (rdib_source_file_match(curr_bucket->src_file, new_bucket->src_file, operating_system_from_context())) { + // do we need to update value in the bucket? + int cmp = u64_compar(&curr_bucket->obj_idx, &new_bucket->obj_idx); + if (cmp <= 0) { + // are we inserting bucket that was already inserterd? + Assert(cmp < 0); + + // don't need to update, more recent value is in the bucket + break; + } + + LNK_SourceFileBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was inserted + result = compare_bucket; + break; + } + + // another thread took the bucket... + goto retry; + } + + // advance + idx = (idx + 1); + idx = idx == cap ? 
0 : idx; + } while (idx != best_idx); + + return result; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_count_source_files_task) +{ + U64 unit_idx = task_id; + LNK_ConvertSourceFilesToRDITask *task = raw_task; + CV_DebugS debug_s = task->debug_s_arr[unit_idx]; + String8List raw_chksms_list = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_FileChksms); + + U64 count = 0; + + for (String8Node *raw_chksms_n = raw_chksms_list.first; raw_chksms_n != 0; raw_chksms_n = raw_chksms_n->next) { + for(U64 cursor = 0; cursor + sizeof(CV_C13Checksum) <= raw_chksms_n->string.size; ) { + // parse header + CV_C13Checksum *header = (CV_C13Checksum *) (raw_chksms_n->string.str + cursor); + + // update count + ++count; + + // advance cursor + cursor += sizeof(*header); + cursor += header->len; + cursor = AlignPow2(cursor, CV_FileCheckSumsAlign); + } + } + + // update total count + ins_atomic_u64_add_eval(&task->total_src_file_count, count); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_insert_src_files_task) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + U64 obj_idx = task_id; + LNK_ConvertSourceFilesToRDITask *task = raw_task; + CV_DebugS debug_s = task->debug_s_arr[obj_idx]; + String8List raw_chksms_list = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_FileChksms); + String8List raw_strtab_list = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_StringTable); + + if (raw_strtab_list.node_count > 1) { + lnk_error_obj(LNK_Warning_IllData, &task->obj_arr[obj_idx], "Multiple string table sub-sections, picking first one."); + } + if (raw_chksms_list.node_count > 1) { + lnk_error_obj(LNK_Warning_IllData, &task->obj_arr[obj_idx], "Multiple file checksum sub-sections, picking first one."); + } + + String8 string_table = cv_string_table_from_debug_s(debug_s); + LNK_SourceFileBucket *curr_bucket = 0; + + for (String8Node *raw_chksms_n = raw_chksms_list.first; raw_chksms_n != 0; raw_chksms_n = raw_chksms_n->next) { + for (U64 cursor = 0; cursor + 
sizeof(CV_C13Checksum) <= raw_chksms_n->string.size; ) { + // parse header + CV_C13Checksum *header = (CV_C13Checksum *) (raw_chksms_n->string.str + cursor); + + // grab checksum + String8 checksum = str8_substr(raw_chksms_n->string, rng_1u64(cursor + sizeof(CV_C13Checksum), + cursor + sizeof(CV_C13Checksum) + header->len)); + + // grab file path + Assert(header->name_off < string_table.size); + String8 file_path = str8_cstring_capped(string_table.str + header->name_off, string_table.str + string_table.size); + + // normalize file path + String8 normal_path = lnk_normalize_src_file_path(arena, file_path); + + // push new bucket + if (curr_bucket == 0) { + curr_bucket = push_array(arena, LNK_SourceFileBucket, 1); + curr_bucket->src_file = push_array(arena, RDIB_SourceFile, 1); + } + + // fill out obj idx so we can decide which source file to keep in the hash table + curr_bucket->obj_idx = obj_idx; + + // fill out part with source file info + curr_bucket->src_file->file_path = file_path; + curr_bucket->src_file->normal_full_path = normal_path; + curr_bucket->src_file->checksum_kind = rdi_checksum_from_cv_c13(header->kind); + curr_bucket->src_file->checksum = checksum; + curr_bucket->src_file->line_table_frags = 0; + + // insert bucket + U64 normal_path_hash = lnk_src_file_hash_cv(normal_path, header->kind, checksum); + LNK_SourceFileBucket *insert_result = lnk_src_file_insert_or_update(task->src_file_buckets, task->src_file_buckets_cap, normal_path_hash, curr_bucket); + + if (curr_bucket == insert_result) { + // bucket was inserted into empty slot, reset current bucket + curr_bucket = 0; + } else if (curr_bucket != insert_result) { + // reuse evicted bucket + curr_bucket = insert_result; + } + + // advance cursor + cursor += sizeof(*header); + cursor += header->len; + cursor = AlignPow2(cursor, CV_FileCheckSumsAlign); + } + } + + scratch_end(scratch); + ProfEnd(); +} + +internal RDIB_Type * +lnk_find_container_type(String8 name, Rng1U64 tpi_itype_range, 
LNK_UDTNameBucket **udt_name_buckets, U64 udt_name_buckets_cap, RDIB_Type **tpi_itype_map) +{ + CV_TypeIndex container_itype = 0; + + String8 delim = str8_lit("::"); + U64 delim_pos = str8_find_needle_reverse(name, 0, delim, 0); + if (delim_pos > 0) { + U64 container_name_size = delim_pos - delim.size; + String8 container_name = str8_prefix(name, container_name_size); + container_itype = lnk_udt_name_hash_table_lookup_itype(udt_name_buckets, udt_name_buckets_cap, container_name); + } + + RDIB_Type *container = 0; + if (container_itype > 0) { + Assert(container_itype < tpi_itype_range.max); + container = tpi_itype_map[container_itype]; + } + + return container; +} + +internal RDIB_Type * +lnk_type_from_itype(CV_TypeIndex itype, Rng1U64 tpi_itype_range, RDIB_Type **tpi_itype_map, LNK_Obj *obj, CV_SymKind symbol_kind, U64 symbol_offset) +{ + RDIB_Type *type = 0; + if (itype < tpi_itype_range.max) { + type = tpi_itype_map[itype]; + } else { + lnk_error_obj(LNK_Error_CvIllSymbolData, obj, "Out of bounds type index 0x%x in S_%S @ 0x%llx.", + itype, cv_string_from_sym_kind(symbol_kind), symbol_offset); + } + return type; +} + +internal U64 +lnk_virt_off_from_sect_off(U64 sect_idx, U64 sect_off, LNK_SectionArray image_sects, LNK_Obj *obj, CV_SymKind symbol_kind, U64 symbol_offset) +{ + U64 virt_off = 0; + if (sect_idx < image_sects.count) { + virt_off = image_sects.v[sect_idx].virt_off + sect_off; + } else { + lnk_error_obj(LNK_Error_CvIllSymbolData, obj, "Out of bounds section index 0x%x in S_%S @ 0x%llx.", + sect_idx, cv_string_from_sym_kind(symbol_kind), symbol_offset); + } + return virt_off; +} + +internal Rng1U64 +lnk_virt_range_from_sect_off_size(U64 sect_idx, U64 sect_off, U64 size, LNK_SectionArray image_sects, LNK_Obj *obj, CV_SymKind symbol_kind, U64 symbol_offset) +{ + Rng1U64 virt_range = {0}; + if (sect_idx < image_sects.count) { + U64 virt_off = image_sects.v[sect_idx].virt_off + sect_off; + virt_range = rng_1u64(virt_off, virt_off + size); + } else { + 
lnk_error_obj(LNK_Error_CvIllSymbolData, obj, "Out of bounds section index 0x%x in S_%S @ 0x%llx.", + sect_idx, cv_string_from_sym_kind(symbol_kind), symbol_offset); + } + return virt_range; +} + +internal void +lnk_error_on_invalid_defrange_symbol(LNK_Obj *obj, CV_Symbol symbol) +{ + lnk_error_obj(LNK_Error_CvIllSymbolData, obj, "Unable to parse symbol stream, unexpected S_%S without preceding S_LOCAL @ 0x%llx.", + cv_string_from_sym_kind(symbol.kind), symbol.offset); +} + +internal void +lnk_error_on_missing_cv_frameproc(LNK_Obj *obj, CV_Symbol symbol) +{ + lnk_error_obj(LNK_Error_CvIllSymbolData, obj, "Missing S_FRAMEPROC, unable to parse S_%S @ 0x%llx.", + cv_string_from_sym_kind(symbol.kind), symbol.offset); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_find_obj_compiler_info_task) +{ + ProfBeginFunction(); + + LNK_ConvertUnitToRDITask *task = raw_task; + CV_SymbolListArray parsed_symbols = task->parsed_symbols[task_id]; + LNK_CodeViewCompilerInfo *comp_info = &task->comp_info_arr[task_id]; + + comp_info->arch = (CV_Arch)~0u; + comp_info->language = (CV_Language)~0u; + comp_info->compiler_name = str8_zero(); + + // infer unit compiler data from S_COMPILE* which always follows S_OBJ + for (U64 symbol_list_idx = 0; symbol_list_idx < parsed_symbols.count; ++symbol_list_idx) { + CV_SymbolList symbol_list = parsed_symbols.v[symbol_list_idx]; + for (CV_SymbolNode *symbol_n = symbol_list.first; symbol_n != 0; symbol_n = symbol_n->next) { + CV_Symbol symbol = symbol_n->data; + if (symbol.kind == CV_SymKind_COMPILE) { + AssertAlways(sizeof(CV_SymCompile) <= symbol.data.size); + CV_SymCompile *compile = (CV_SymCompile *)symbol.data.str; + comp_info->arch = compile->machine; + comp_info->language = CV_CompileFlags_ExtractLanguage(compile->flags); + comp_info->compiler_name = str8_cstring_capped(compile + 1, symbol.data.str + symbol.data.size); + goto exit; + } else if (symbol.kind == CV_SymKind_COMPILE2) { + AssertAlways(sizeof(CV_SymCompile2) <= symbol.data.size); + 
CV_SymCompile2 *compile2 = (CV_SymCompile2 *)symbol.data.str; + comp_info->arch = compile2->machine; + comp_info->language = CV_Compile2Flags_ExtractLanguage(compile2->flags); + comp_info->compiler_name = str8_cstring_capped(compile2 + 1, symbol.data.str + symbol.data.size); + goto exit; + } else if (symbol.kind == CV_SymKind_COMPILE3) { + AssertAlways(sizeof(CV_SymCompile3) <= symbol.data.size); + CV_SymCompile3 *compile3 = (CV_SymCompile3 *)symbol.data.str; + comp_info->arch = compile3->machine; + comp_info->language = CV_Compile3Flags_ExtractLanguage(compile3->flags); + comp_info->compiler_name = str8_cstring_capped(compile3 + 1, symbol.data.str + symbol.data.size); + goto exit; + } + } + } + exit:; + + LNK_Obj *obj = &task->obj_arr[task_id]; + + // fill out unit info + U64 unit_chunk_idx = task_id / task->unit_chunk_cap; + U64 local_unit_idx = task_id - unit_chunk_idx * task->unit_chunk_cap; + + RDIB_Unit *dst = &task->units[unit_chunk_idx].v[local_unit_idx]; + dst->arch = rdi_arch_from_cv_arch(comp_info->arch); + dst->unit_name = str8_skip_last_slash(obj->path); + dst->compiler_name = comp_info->compiler_name; + dst->source_file = str8_zero(); + dst->object_file = push_str8_copy(arena, obj->path); + dst->archive_file = push_str8_copy(arena, obj->lib_path); + dst->build_path = str8_zero(); + dst->language = rdi_language_from_cv_language(comp_info->language); + + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_convert_line_tables_to_rdi_task) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + U64 unit_idx = task_id; + LNK_ConvertUnitToRDITask *task = raw_task; + LNK_Obj *obj = &task->obj_arr[unit_idx]; + CV_DebugS debug_s = task->debug_s_arr[unit_idx]; + + U64 unit_chunk_idx = unit_idx / task->unit_chunk_cap; + U64 local_unit_idx = unit_idx - unit_chunk_idx * task->unit_chunk_cap; + RDIB_Unit *dst = &task->units[unit_chunk_idx].v[local_unit_idx]; + + // find sub sections + String8 raw_string_table = 
cv_string_table_from_debug_s(debug_s); + String8 raw_file_chksms = cv_file_chksms_from_debug_s(debug_s); + String8List raw_lines_list = cv_sub_section_from_debug_s(debug_s, CV_C13SubSectionKind_Lines); + + // emit line table fragments for each source file from C13 line info + dst->line_table = rdib_line_table_chunk_list_push(arena, &task->line_tables[worker_id], task->line_table_cap); + + for (String8Node *raw_lines_node = raw_lines_list.first; raw_lines_node != 0; raw_lines_node = raw_lines_node->next) { + String8 raw_lines = raw_lines_node->string; + CV_C13LinesHeaderList parsed_list = cv_c13_lines_from_sub_sections(scratch.arena, raw_lines, rng_1u64(0, raw_lines.size)); + for (CV_C13LinesHeaderNode *lines_node = parsed_list.first; lines_node != 0; lines_node = lines_node->next) { + CV_C13LinesHeader parsed_lines = lines_node->v; + + // parse checksum header + if (parsed_lines.file_off + sizeof(CV_C13Checksum) > raw_file_chksms.size) { + lnk_error_obj(LNK_Warning_IllData, obj, "Out of bounds $$FILE_CHECKSUM offset (0x%llx) in line table header.", parsed_lines.file_off); + continue; + } + CV_C13Checksum *checksum_header = (CV_C13Checksum *) (raw_file_chksms.str + parsed_lines.file_off); + if (parsed_lines.file_off + sizeof(CV_C13Checksum) + checksum_header->len > raw_file_chksms.size) { + lnk_error_obj(LNK_Warning_IllData, obj, "Not enough bytes to read file checksum @ 0x%llx.", parsed_lines.file_off); + continue; + } + String8 file_path = str8_cstring_capped(raw_string_table.str + checksum_header->name_off, raw_string_table.str + raw_string_table.size); + String8 checksum_bytes = str8((U8 *) (checksum_header + 1), checksum_header->len); + + // read out lines + if (0 == parsed_lines.sec_idx || parsed_lines.sec_idx > task->image_sects.count) { + lnk_error_obj(LNK_Warning_IllData, obj, "Out of bounds section index (%u) in $$LINES; skip line info for \"%S\".", parsed_lines.sec_idx, file_path); + continue; + } + LNK_Section *sect = 
&task->image_sects.v[parsed_lines.sec_idx]; + CV_LineArray lines = cv_c13_line_array_from_data(arena, raw_lines, sect->virt_off, parsed_lines); + + // find source file for this line table + String8 normal_path = lnk_normalize_src_file_path(scratch.arena, file_path); + U64 src_file_hash = lnk_src_file_hash_cv(normal_path, checksum_header->kind, checksum_bytes); + LNK_SourceFileBucket *src_file_bucket = lnk_src_file_hash_table_lookup_slot(task->src_file_buckets, task->src_file_buckets_cap, src_file_hash, normal_path, checksum_header->kind, checksum_bytes); + if (src_file_bucket == 0) { + lnk_error_obj(LNK_Error_UnexpectedCodePath, obj, "Unable to find source file in the hash table: \"%S\".", file_path); + continue; + } + RDIB_SourceFile *src_file = src_file_bucket->src_file; + + // fill out line table fragment and atomically insert + RDIB_LineTableFragment *frag = rdib_line_table_push(arena, dst->line_table); + frag->src_file = src_file; + frag->voffs = lines.voffs; + frag->line_nums = lines.line_nums; + frag->col_nums = lines.col_nums; + frag->line_count = lines.line_count; + frag->col_count = lines.col_count; + + // build list of line table fragments per file + frag->next_src_file = ins_atomic_ptr_eval_assign(&src_file->line_table_frags, frag); + } + } + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_convert_symbols_to_rdi_task) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + LNK_ConvertUnitToRDITask *task = raw_task; + LNK_CodeViewSymbolsInput symbols_input = task->symbol_inputs[task_id]; + LNK_Obj *obj = &task->obj_arr[symbols_input.obj_idx]; + LNK_CodeViewCompilerInfo comp_info = task->comp_info_arr[symbols_input.obj_idx]; + CV_InlineeLinesAccel *inlinee_lines_accel = task->inlinee_lines_accel_arr[symbols_input.obj_idx]; + + RDI_Arch arch_rdi = rdi_arch_from_cv_arch(comp_info.arch); + + struct ScopeFrame { + struct ScopeFrame *prev; + RDIB_Scope *scope; + RDIB_Procedure *proc; + CV_ProcFlags proc_flags; + 
CV_SymFrameproc *frameproc; + U64 param_count; + U64 regrel32_idx; + RDIB_Variable *defrange_target; + }; +#define push_scope_frame() do { \ + struct ScopeFrame *frame; \ + if (free_scope_stack != 0) { \ + frame = free_scope_stack; \ + SLLStackPop_N(free_scope_stack, prev); \ + } else { \ + frame = push_array(scratch.arena, struct ScopeFrame, 1); \ + } \ + SLLStackPush_N(scope_stack, frame, prev); \ +} while (0) + + struct ScopeFrame *scope_stack = 0; + struct ScopeFrame *free_scope_stack = 0; + + // root frame + push_scope_frame(); + + for (CV_SymbolNode *symbol_n = symbols_input.symbol_list->first; symbol_n != 0; symbol_n = symbol_n->next) { + CV_Symbol symbol = symbol_n->data; + + switch (symbol.kind) { + case CV_SymKind_COMPILE: + case CV_SymKind_COMPILE2: + case CV_SymKind_COMPILE3: { + // handled above + } break; + case CV_SymKind_INLINESITE_END: + case CV_SymKind_END: { + if (scope_stack != 0) { + // move top frame to free stack + struct ScopeFrame *free_frame = scope_stack; + SLLStackPop_N(scope_stack, prev); + SLLStackPush_N(free_scope_stack, free_frame, prev); + } else { + lnk_error_obj(LNK_Error_CvIllSymbolData, obj, "Encountered unbalanced blocks. 
Unable to finish symbol parse."); + goto exit; + } + } break; + case CV_SymKind_BLOCK32: { + CV_SymBlock32 *block32 = (CV_SymBlock32 *) symbol.data.str; + Rng1U64 virt_range = lnk_virt_range_from_sect_off_size(block32->sec, block32->off, block32->len, task->image_sects, obj, symbol.kind, symbol.offset); + + // push new scope node + RDIB_Scope *scope = rdib_scope_chunk_list_push(arena, &task->scopes[worker_id], task->symbol_chunk_cap); + + // fill out scope + scope->container_proc = scope_stack->proc; + scope->parent = scope_stack->scope; + SLLQueuePush_N(scope_stack->scope->first_child, scope_stack->scope->last_child, scope, next_sibling); + rng_1u64_list_push(arena, &scope->ranges, virt_range); + +#if 0 + if (scope->parent) { + Assert(virt_range.min >= scope->parent->ranges.first->v.min); + Assert(virt_range.max <= scope->parent->ranges.first->v.max); + } +#endif + + // push new scope stack frame + push_scope_frame(); + scope_stack->scope = scope; + scope_stack->proc = scope->container_proc; + scope_stack->proc_flags = scope_stack->proc_flags; + scope_stack->frameproc = scope_stack->prev->frameproc; + } break; + case CV_SymKind_GDATA32: + case CV_SymKind_LDATA32: { + CV_SymData32 *data32 = (CV_SymData32 *) symbol.data.str; + String8 name = str8_cstring_capped(data32 + 1, symbol.data.str + symbol.data.size); + RDIB_Type *type = lnk_type_from_itype(data32->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + RDIB_Type *container_type = lnk_find_container_type(name, task->tpi_itype_range, task->udt_name_buckets, task->udt_name_buckets_cap, task->tpi_itype_map); + U64 data_voff = lnk_virt_off_from_sect_off(data32->sec, data32->off, task->image_sects, obj, symbol.kind, symbol.offset); + + B32 is_comp_gen = symbol.kind == CV_SymKind_LDATA32 && name.size == 0 && type == 0; + if (!is_comp_gen) { + + // get link name through virtual offset look up + String8 link_name = {0}; + if (symbol.kind == CV_SymKind_GDATA32) { + KeyValuePair *pair = 
hash_table_search_u64(task->extern_symbol_voff_ht, data_voff); + if (pair != 0) { + LNK_Symbol *link_symbol = pair->value_raw; + link_name = link_symbol->name; + } + } + + // make module relative location + RDIB_LocationList locations = {0}; + { + RDIB_EvalBytecode bytecode = {0}; + rdib_bytecode_push_op(arena, &bytecode, RDI_EvalOp_ModuleOff, data_voff); + + U64 data_size = rdib_size_from_type(type); + if (data_size == 0) { + data_size = max_U64; + } + + Rng1U64List ranges = {0}; + rng_1u64_list_push(arena, &ranges, rng_1u64(data_voff, data_voff + data_size)); + + RDIB_Location location = rdib_make_location_addr_byte_stream(ranges, bytecode); + rdib_location_list_push(arena, &locations, location); + } + + RDIB_VariableChunkList *var_chunk_list = symbol.kind == CV_SymKind_GDATA32 ? + &task->extern_gvars[worker_id] : &task->static_gvars[worker_id]; + + // push new node + RDIB_Variable *gvar = rdib_variable_chunk_list_push(arena, var_chunk_list, task->symbol_chunk_cap); + gvar->link_flags = symbol.kind == CV_SymKind_GDATA32 ? 
RDI_LinkFlag_External : 0; + gvar->name = name; + gvar->link_name = link_name; + gvar->type = type; + gvar->container_type = container_type; + gvar->container_proc = scope_stack->proc; + gvar->locations = locations; + } + } break; + case CV_SymKind_LTHREAD32: + case CV_SymKind_GTHREAD32: { + CV_SymThread32 *thread32 = (CV_SymThread32 *) symbol.data.str; + String8 name = str8_cstring_capped(thread32 + 1, symbol.data.str + symbol.data.size); + RDIB_Type *type = lnk_type_from_itype(thread32->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + RDIB_Type *container_type = lnk_find_container_type(name, task->tpi_itype_range, task->udt_name_buckets, task->udt_name_buckets_cap, task->tpi_itype_map); + + // make TLS offset location + RDIB_LocationList locations = {0}; + { + RDIB_EvalBytecode bytecode = {0}; + rdib_bytecode_push_op(arena, &bytecode, RDI_EvalOp_TLSOff, thread32->tls_off); + + Rng1U64List ranges = {0}; + rng_1u64_list_push(arena, &ranges, rng_1u64(0, max_U64)); + + RDIB_Location location = rdib_make_location_addr_byte_stream(ranges, bytecode); + rdib_location_list_push(arena, &locations, location); + } + + // push new node + RDIB_VariableChunkList *tvar_list = symbol.kind == CV_SymKind_GTHREAD32 ? &task->extern_tvars[worker_id] : &task->static_tvars[worker_id]; + RDIB_Variable *tvar = rdib_variable_chunk_list_push(arena, tvar_list, task->symbol_chunk_cap); + + // fill out thread variable + tvar->link_flags = symbol.kind == CV_SymKind_GTHREAD32 ? 
RDI_LinkFlag_External : 0; + tvar->name = name; + tvar->link_name = str8(0,0); + tvar->type = type; + tvar->container_type = container_type; + tvar->container_proc = scope_stack->proc; + tvar->locations = locations; + } break; + case CV_SymKind_LPROC32_ID: + case CV_SymKind_GPROC32_ID: { + AssertAlways(!"linker converts *_ID symbols in post-process step, if we ever get to this case then we have a bug in post-process step"); + } break; + case CV_SymKind_LPROC32: + case CV_SymKind_GPROC32: { + CV_SymProc32 *proc32 = (CV_SymProc32 *) symbol.data.str; + String8 name = str8_cstring_capped(proc32 + 1, symbol.data.str + symbol.data.size); + RDIB_Type *type = lnk_type_from_itype(proc32->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + Rng1U64 virt_range = lnk_virt_range_from_sect_off_size(proc32->sec, proc32->off, proc32->len, task->image_sects, obj, symbol.kind, symbol.offset); + + // infer container type for method + RDIB_Type *container_type = 0; + if (type != 0) { + if (type->kind == RDI_TypeKind_Method) { + container_type = (RDIB_Type *) type->method.class_type; + } else if (type->kind == RDI_TypeKindExt_StaticMethod) { + container_type = (RDIB_Type *) type->static_method.class_type; + } + } + + // get link name through virtual offset look up + String8 link_name = str8(0,0); + if (symbol.kind == CV_SymKind_GPROC32) { + KeyValuePair *pair = hash_table_search_u64(task->extern_symbol_voff_ht, virt_range.min); + if (pair != 0) { + LNK_Symbol *link_symbol = pair->value_raw; + link_name = link_symbol->name; + } + } + + // scan ahead for context S_FRAMEPROC (must be defined in scope of PROC symbol) + CV_SymFrameproc *frameproc = 0; + { + U64 depth = 1; + for (CV_SymbolNode *lookahead = symbol_n->next; lookahead != 0; lookahead = lookahead->next) { + if (lookahead->data.kind == CV_SymKind_FRAMEPROC) { + frameproc = (CV_SymFrameproc *) lookahead->data.data.str; + break; + } + if (cv_is_scope_symbol(lookahead->data.kind)) { + ++depth; + } 
else if (cv_is_end_symbol(lookahead->data.kind)) { + --depth; + if (depth == 0) { + break; + } + } + } + } + + // push new procedure node + RDIB_ProcedureChunkList *proc_list = symbol.kind == CV_SymKind_GPROC32 ? &task->extern_procs[worker_id] : &task->static_procs[worker_id]; + RDIB_Procedure *proc = rdib_procedure_chunk_list_push(arena, proc_list, task->symbol_chunk_cap); + + // push new scope node + RDIB_Scope *root_scope = rdib_scope_chunk_list_push(arena, &task->scopes[worker_id], task->symbol_chunk_cap); + root_scope->container_proc = proc; + root_scope->parent = scope_stack->scope; + if (scope_stack->scope != 0) { + SLLQueuePush_N(scope_stack->scope->first_child, scope_stack->scope->last_child, root_scope, next_sibling); + } + rng_1u64_list_push(arena, &root_scope->ranges, virt_range); + + // fill out procedure + proc->link_flags = symbol.kind == CV_SymKind_GPROC32 ? RDI_LinkFlag_External : 0; + proc->name = name; + proc->link_name = link_name; + proc->type = type; + proc->container_type = container_type; + proc->container_proc = scope_stack->proc; + proc->scope = root_scope; + + // push scope frame + push_scope_frame(); + scope_stack->scope = root_scope; + scope_stack->proc = proc; + scope_stack->proc_flags = proc32->flags; + scope_stack->frameproc = frameproc; + + // set number of params for procedure on scope so we can figure out which S_REGREL32 is param + { + B32 is_proc_scope = (scope_stack->proc->scope == scope_stack->scope); + if (is_proc_scope) { + RDIB_Type *params = 0; + if (scope_stack->proc != 0) { + RDIB_Type *proc_type = scope_stack->proc->type; + if (proc_type != 0) { + if (proc_type->kind == RDI_TypeKind_NULL) { + // compiler generates procedures with no type for __try/__except, lambdas, and etc. 
+ } else if (proc_type->kind == RDI_TypeKind_Function) { + params = (RDIB_Type *)proc_type->func.params_type; + } else if (proc_type->kind == RDI_TypeKind_Method) { + params = (RDIB_Type *)proc_type->method.params_type; + } else if (proc_type->kind == RDI_TypeKindExt_StaticMethod) { + params = (RDIB_Type *)proc_type->static_method.params_type; + } else { + InvalidPath; + } + } + } + if (params != 0) { + AssertAlways(params->kind == RDI_TypeKindExt_Params); + scope_stack->param_count = params->params.count; + scope_stack->regrel32_idx = 0; + } + } + } + } break; + case CV_SymKind_THUNK32: { + CV_SymThunk32 *thunk32 = (CV_SymThunk32 *) symbol.data.str; + String8 name = str8_cstring_capped(thunk32 + 1, symbol.data.str + symbol.data.size); + Rng1U64 virt_range = lnk_virt_range_from_sect_off_size(thunk32->sec, thunk32->off, thunk32->len, task->image_sects, obj, symbol.kind, symbol.offset); + + // scan ahead for context S_FRAMEPROC (must be defined in scope of PROC symbol) + CV_SymFrameproc *frameproc = 0; + { + U64 depth = 1; + for (CV_SymbolNode *lookahead = symbol_n->next; lookahead != 0; lookahead = lookahead->next) { + if (lookahead->data.kind == CV_SymKind_FRAMEPROC) { + frameproc = (CV_SymFrameproc *) lookahead->data.data.str; + break; + } + if (cv_is_scope_symbol(lookahead->data.kind)) { + ++depth; + } else if (cv_is_end_symbol(lookahead->data.kind)) { + --depth; + if (depth == 0) { + break; + } + } + } + } + + // push new procedure node + RDIB_ProcedureChunkList *proc_list = &task->static_procs[worker_id]; + RDIB_Procedure *thunk = rdib_procedure_chunk_list_push(arena, proc_list, task->symbol_chunk_cap); + + // push new scope node + RDIB_Scope *root_scope = rdib_scope_chunk_list_push(arena, &task->scopes[worker_id], task->symbol_chunk_cap); + root_scope->container_proc = thunk; + root_scope->parent = scope_stack->scope; + if (scope_stack->scope != 0) { + SLLQueuePush_N(scope_stack->scope->first_child, scope_stack->scope->last_child, root_scope, next_sibling); + 
} + rng_1u64_list_push(arena, &root_scope->ranges, virt_range); + + // fill out procedure + thunk->name = name; + thunk->type = 0; + thunk->scope = root_scope; + + // push scope frame + push_scope_frame(); + scope_stack->scope = root_scope; + scope_stack->proc = thunk; + scope_stack->proc_flags = 0; + scope_stack->frameproc = frameproc; + } break; + case CV_SymKind_REGREL32: { + if (~scope_stack->proc_flags & CV_ProcFlag_OptDbgInfo) { + CV_SymRegrel32 *regrel32 = (CV_SymRegrel32 *) symbol.data.str; + String8 name = str8_cstring_capped(regrel32 + 1, symbol.data.str + symbol.data.size); + RDIB_Type *type = lnk_type_from_itype(regrel32->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + + RDI_LocalKind local_kind = RDI_LocalKind_Variable; + B32 is_ref = 0; + if (scope_stack->regrel32_idx < scope_stack->param_count) { + local_kind = RDI_LocalKind_Parameter; + if (type != 0) { + U64 byte_size = rdib_size_from_type(type); + switch (comp_info.arch) { + case CV_Arch_8086: is_ref = byte_size > 4 || !IsPow2OrZero(byte_size); break; + case CV_Arch_X64: is_ref = byte_size > 8 || !IsPow2OrZero(byte_size); break; + default: NotImplemented; + } + } + } + + // push node + RDIB_Variable *local = rdib_variable_chunk_list_push(arena, &task->locals[worker_id], task->symbol_chunk_cap); + SLLQueuePush(scope_stack->scope->local_first, scope_stack->scope->local_last, local); + ++scope_stack->scope->local_count; + + // fill out local + local->link_flags = 0; + local->name = name; + local->kind = local_kind; + local->type = type; + + // encode location + RDI_RegCode reg_code = rdi_reg_code_from_cv(comp_info.arch, regrel32->reg); + U32 value_size = 8; + U32 value_pos = 0; + rdib_push_location_addr_reg_off(arena, &local->locations, arch_rdi, reg_code, value_size, value_pos, (S64)regrel32->reg_off, is_ref, scope_stack->scope->ranges); + + // advance reg rel index + ++scope_stack->regrel32_idx; + } + } break; + case CV_SymKind_LOCAL: { + CV_SymLocal 
*sym_local = (CV_SymLocal *) symbol.data.str; + String8 name = str8_cstring_capped(sym_local + 1, symbol.data.str + symbol.data.size); + RDIB_Type *type = lnk_type_from_itype(sym_local->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + + // reset defrange target + scope_stack->defrange_target = 0; + + if (sym_local->flags & CV_LocalFlag_Global) { + // TODO: apply global modifications + } else if (sym_local->flags & CV_LocalFlag_Static) { + // TODO: apply local modifications + } + + // push New node + RDIB_Variable *local = rdib_variable_chunk_list_push(arena, &task->locals[worker_id], task->symbol_chunk_cap); + SLLQueuePush(scope_stack->scope->local_first, scope_stack->scope->local_last, local); + ++scope_stack->scope->local_count; + + // fill out local + local->link_flags = 0; + local->kind = sym_local->flags & CV_LocalFlag_Param ? RDI_LocalKind_Parameter : RDI_LocalKind_Variable; + local->name = name; + local->type = type; + + scope_stack->defrange_target = local; + } break; + case CV_SymKind_FILESTATIC: { + CV_SymFileStatic *file_static = (CV_SymFileStatic *) symbol.data.str; + String8 name = str8_cstring_capped(file_static + 1, symbol.data.str + symbol.data.size); + RDIB_Type *type = lnk_type_from_itype(file_static->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + + // push New node + RDIB_Variable *local = rdib_variable_chunk_list_push(arena, &task->locals[worker_id], task->symbol_chunk_cap); + SLLQueuePush(scope_stack->scope->local_first, scope_stack->scope->local_last, local); + ++scope_stack->scope->local_count; + + // fill out local + local->link_flags = 0; + local->kind = RDI_LocalKind_Variable; + local->name = name; + local->type = type; + + // set target for following defrange modifications + scope_stack->defrange_target = local; + } break; + case CV_SymKind_DEFRANGE_REGISTER: { + if (scope_stack->defrange_target == 0) { + lnk_error_on_invalid_defrange_symbol(obj, symbol); + 
break; + } + + CV_SymDefrangeRegister *defrange_reg = (CV_SymDefrangeRegister *) symbol.data.str; + RDI_RegCode reg_code = rdi_reg_code_from_cv(comp_info.arch, defrange_reg->reg); + CV_LvarAddrGap *gaps = (CV_LvarAddrGap *) (defrange_reg + 1); + U64 gap_count = (symbol.data.size - sizeof(*defrange_reg)) / sizeof(*gaps); + + Rng1U64 defrange = lnk_virt_range_from_sect_off_size(defrange_reg->range.sec, defrange_reg->range.off, defrange_reg->range.len, task->image_sects, obj, symbol.kind, symbol.offset); + Rng1U64List ranges = cv_make_defined_range_list_from_gaps(arena, defrange, gaps, gap_count); + RDIB_Location location = rdib_make_location_val_reg(ranges, reg_code); + + rdib_location_list_push(arena, &scope_stack->defrange_target->locations, location); + } break; + case CV_SymKind_DEFRANGE_FRAMEPOINTER_REL: { + if (scope_stack->defrange_target == 0) { + lnk_error_on_invalid_defrange_symbol(obj, symbol); + break; + } + if (scope_stack->frameproc == 0) { + lnk_error_on_missing_cv_frameproc(obj, symbol); + break; + } + + CV_SymDefrangeFramepointerRel *defrange_fprel = (CV_SymDefrangeFramepointerRel *)symbol.data.str; + CV_LvarAddrGap *gaps = (CV_LvarAddrGap *) (defrange_fprel + 1); + U64 gap_count = (symbol.data.size - sizeof(*defrange_fprel)) / sizeof(gaps[0]); + + B32 is_local_param = scope_stack->defrange_target->kind == RDI_LocalKind_Parameter; + CV_EncodedFramePtrReg encoded_fp_reg = cv_pick_fp_encoding(scope_stack->frameproc, is_local_param); + CV_Reg fp_reg = cv_decode_fp_reg(comp_info.arch, encoded_fp_reg); + RDI_RegCode fp_reg_rdi = rdi_reg_code_from_cv(comp_info.arch, fp_reg); + Rng1U64 defrange = lnk_virt_range_from_sect_off_size(defrange_fprel->range.sec, defrange_fprel->range.off, defrange_fprel->range.len, task->image_sects, obj, symbol.kind, symbol.offset); + Rng1U64List ranges = cv_make_defined_range_list_from_gaps(arena, defrange, gaps, gap_count); + U32 value_pos = 0; + U32 value_size = rdi_addr_size_from_arch(arch_rdi); + + 
rdib_push_location_addr_reg_off(arena, &scope_stack->defrange_target->locations, arch_rdi, fp_reg_rdi, value_size, value_pos, (S64)defrange_fprel->off, 0, ranges); + } break; + case CV_SymKind_DEFRANGE_SUBFIELD_REGISTER: { + if (scope_stack->defrange_target == 0) { + lnk_error_on_invalid_defrange_symbol(obj, symbol); + break; + } + + CV_SymDefrangeSubfieldRegister *defrange_subfield_register = (CV_SymDefrangeSubfieldRegister *) symbol.data.str; + CV_LvarAddrGap *gaps = (CV_LvarAddrGap *) (defrange_subfield_register + 1); + U64 gap_count = (symbol.data.size - sizeof(*defrange_subfield_register)) / sizeof(gaps[0]); + RDI_RegCode reg_rdi = rdi_reg_code_from_cv(comp_info.arch, defrange_subfield_register->reg); + U32 value_pos = CV_DefrangeSubfieldRegister_ExtractParentOffset(defrange_subfield_register->field_offset); + U32 value_size = cv_size_from_reg(comp_info.arch, defrange_subfield_register->reg) - value_pos; + Rng1U64 defrange = lnk_virt_range_from_sect_off_size(defrange_subfield_register->range.sec, defrange_subfield_register->range.off, defrange_subfield_register->range.len, task->image_sects, obj, symbol.kind, symbol.offset); + Rng1U64List ranges = cv_make_defined_range_list_from_gaps(arena, defrange, gaps, gap_count); + + rdib_push_location_addr_reg_off(arena, &scope_stack->defrange_target->locations, arch_rdi, reg_rdi, value_size, value_pos, 0, 0, ranges); + } break; + case CV_SymKind_DEFRANGE_FRAMEPOINTER_REL_FULL_SCOPE: { + if (scope_stack->defrange_target == 0) { + lnk_error_on_invalid_defrange_symbol(obj, symbol); + break; + } + if (scope_stack->frameproc == 0) { + lnk_error_on_missing_cv_frameproc(obj, symbol); + break; + } + + CV_SymDefrangeFramepointerRelFullScope *defrange_fprelfs = (CV_SymDefrangeFramepointerRelFullScope *) symbol.data.str; + B32 is_local_param = scope_stack->defrange_target->kind == RDI_LocalKind_Parameter; + CV_EncodedFramePtrReg encoded_fp_reg = cv_pick_fp_encoding(scope_stack->frameproc, is_local_param); + CV_Reg fp_reg = 
cv_decode_fp_reg(comp_info.arch, encoded_fp_reg); + RDI_RegCode fp_reg_rdi = rdi_reg_code_from_cv(comp_info.arch, fp_reg); + U32 value_size = cv_size_from_reg(comp_info.arch, fp_reg); + U32 value_pos = 0; + Rng1U64List ranges = scope_stack->scope->ranges; // variable is available everywhere in the scope + + rdib_push_location_addr_reg_off(arena, &scope_stack->defrange_target->locations, arch_rdi, fp_reg_rdi, value_size, value_pos, (S64)defrange_fprelfs->off, 0, ranges); + } break; + case CV_SymKind_DEFRANGE_REGISTER_REL: { + if (scope_stack->defrange_target == 0) { + lnk_error_on_invalid_defrange_symbol(obj, symbol); + break; + } + + CV_SymDefrangeRegisterRel *defrange_register_rel = (CV_SymDefrangeRegisterRel *) symbol.data.str; + CV_LvarAddrGap *gaps = (CV_LvarAddrGap *) (defrange_register_rel + 1); + U64 gap_count = (symbol.data.size - sizeof(*defrange_register_rel)) / sizeof(gaps[0]); + RDI_RegCode reg_rdi = rdi_reg_code_from_cv(comp_info.arch, defrange_register_rel->reg); + U64 value_size = cv_size_from_reg(comp_info.arch, defrange_register_rel->reg); + U64 value_pos = 0; + Rng1U64 defrange = lnk_virt_range_from_sect_off_size(defrange_register_rel->range.sec, defrange_register_rel->range.off, defrange_register_rel->range.len, task->image_sects, obj, symbol.kind, symbol.offset); + Rng1U64List ranges = cv_make_defined_range_list_from_gaps(arena, defrange, gaps, gap_count); + + rdib_push_location_addr_reg_off(arena, &scope_stack->defrange_target->locations, arch_rdi, reg_rdi, value_size, value_pos, (S64)defrange_register_rel->reg_off, 0, ranges); + } break; + case CV_SymKind_INLINESITE: { + CV_SymInlineSite *sym_inline_site = (CV_SymInlineSite *) symbol.data.str; + String8 binary_annots = str8((U8 *) (sym_inline_site + 1), symbol.data.size - sizeof(*sym_inline_site)); + + U64 parent_voff = 0; + if (scope_stack != 0) { + RDIB_Scope *proc_scope = scope_stack->proc->scope; + Assert(proc_scope->ranges.count == 1); + Rng1U64 scope_vrange = proc_scope->ranges.first->v; 
+ parent_voff = scope_vrange.min; + } else { + Assert(!"S_INLINESITE doesn't have a parent procedure symbol"); + } + + // parse binary annots + CV_C13InlineeLinesParsed *inlinee_parsed = cv_c13_inlinee_lines_accel_find(inlinee_lines_accel, sym_inline_site->inlinee); + CV_InlineBinaryAnnotsParsed binary_annots_parsed = cv_c13_parse_inline_binary_annots(arena, parent_voff, inlinee_parsed, binary_annots); + + String8 name = str8_zero(); + RDIB_Type *type = 0; + RDIB_Type *owner = 0; + if (task->ipi_itype_range.min <= sym_inline_site->inlinee && sym_inline_site->inlinee < task->ipi_itype_range.max) { + U64 leaf_idx = sym_inline_site->inlinee - task->tpi_itype_range.min; + CV_Leaf leaf = cv_debug_t_get_leaf(task->ipi, leaf_idx); + if (leaf.kind == CV_LeafKind_MFUNC_ID) { + if (sizeof(CV_LeafMFuncId) <= leaf.data.size) { + CV_LeafMFuncId *mfunc_id = (CV_LeafMFuncId *) leaf.data.str; + name = str8_cstring_capped_reverse(mfunc_id + 1, leaf.data.str + leaf.data.size); + type = lnk_type_from_itype(mfunc_id->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + owner = lnk_type_from_itype(mfunc_id->owner_itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + } else { + Assert(!"invalid leaf size"); + } + } else if (leaf.kind == CV_LeafKind_FUNC_ID) { + if (sizeof(CV_LeafFuncId) <= leaf.data.size) { + CV_LeafFuncId *func_id = (CV_LeafFuncId *) leaf.data.str; + name = str8_cstring_capped_reverse(func_id + 1, leaf.data.str + leaf.data.size); + type = lnk_type_from_itype(func_id->itype, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + owner = lnk_type_from_itype(func_id->scope_string_id, task->tpi_itype_range, task->tpi_itype_map, obj, symbol.kind, symbol.offset); + } else { + Assert(!"invalid leaf size"); + } + } else { + Assert(!"inlinee must pointer to LF_FUNC_ID or LF_MFUNC_ID"); + } + } else { + Assert(!"out of bounds inlinee"); + } + + // fill out inline site + RDIB_InlineSite 
*inline_site = rdib_inline_site_chunk_list_push(arena, &task->inline_sites[worker_id], task->inline_site_cap); + inline_site->name = name; + inline_site->type = type; + inline_site->owner = owner; + + inline_site->convert_ref.ud0 = binary_annots_parsed.lines; + inline_site->convert_ref.ud1 = binary_annots_parsed.lines_count; + inline_site->convert_ref.ud2 = task_id; + + // fill out scope + RDIB_Scope *scope = rdib_scope_chunk_list_push(arena, &task->scopes[worker_id], task->symbol_chunk_cap); + scope->container_proc = scope_stack->proc; + scope->parent = scope_stack->scope; + scope->inline_site = inline_site; + scope->ranges = binary_annots_parsed.code_ranges; + + // push new scope stack frame + push_scope_frame(); + scope_stack->scope = scope; + scope_stack->proc = scope->container_proc; + scope_stack->proc_flags = scope_stack->proc_flags; + scope_stack->frameproc = scope_stack->prev->frameproc; + } break; + default: break; + } + } + exit:; + +#undef push_scope_frame + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_convert_inline_site_line_tables_task) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + LNK_ConvertUnitToRDITask *task = raw_task; + RDIB_InlineSiteChunk *chunk = task->inline_site_chunks[task_id]; + + RDIB_LineTableFragmentChunkList frag_chunk_list = {0}; + + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_InlineSite *inline_site = &chunk->v[i]; + + CV_LineArray *lines_arr = inline_site->convert_ref.ud0; + U64 lines_count = inline_site->convert_ref.ud1; + U64 obj_idx = inline_site->convert_ref.ud2; + + CV_DebugS debug_s = task->debug_s_arr[obj_idx]; + String8 raw_string_table = cv_string_table_from_debug_s(debug_s); + String8 raw_file_chksms = cv_file_chksms_from_debug_s(debug_s); + + if (lines_count > 0) { + inline_site->line_table = rdib_line_table_chunk_list_push(arena, &task->line_tables[worker_id], task->line_table_cap); + } else { + inline_site->line_table = task->null_line_table; + } + + // 
emit line tables for each file (yes, it is possbile to split inline site among two or more files via #include) + for (U64 file_idx = 0; file_idx < lines_count; ++file_idx) { + CV_LineArray lines = lines_arr[file_idx]; + + // prase checksum header + CV_C13Checksum *checksum_header = (CV_C13Checksum *) (raw_file_chksms.str + lines.file_off); + if (lines.file_off + sizeof(CV_C13Checksum) + checksum_header->len > raw_file_chksms.size) { + LNK_Obj *obj = task->obj_arr + obj_idx; + lnk_error_obj(LNK_Warning_IllData, obj, "Not enough bytes to read file checksum @ 0x%llx.", lines.file_off); + continue; + } + String8 file_path = str8_cstring_capped(raw_string_table.str + checksum_header->name_off, raw_string_table.str + raw_string_table.size); + String8 checksum_bytes = str8((U8 *) (checksum_header + 1), checksum_header->len); + + // find source file for this line table + String8 normal_path = lnk_normalize_src_file_path(scratch.arena, file_path); + U64 src_file_hash = lnk_src_file_hash_cv(normal_path, checksum_header->kind, checksum_bytes); + LNK_SourceFileBucket *src_file_bucket = lnk_src_file_hash_table_lookup_slot(task->src_file_buckets, task->src_file_buckets_cap, src_file_hash, normal_path, checksum_header->kind, checksum_bytes); + if (src_file_bucket == 0) { + LNK_Obj *obj = task->obj_arr + obj_idx; + lnk_error_obj(LNK_Error_UnexpectedCodePath, obj, "Unable to find source file in the hash table: \"%S\".", file_path); + continue; + } + RDIB_SourceFile *src_file = src_file_bucket->src_file; + + // fill out line table fragment + RDIB_LineTableFragment *frag = rdib_line_table_fragment_chunk_list_push(arena, &frag_chunk_list, chunk->count); + frag->src_file = src_file; + frag->voffs = lines.voffs; + frag->line_nums = lines.line_nums; + frag->col_nums = lines.col_nums; + frag->line_count = lines.line_count; + frag->col_count = lines.col_count; + + // build list of fragments per line table + rdib_line_table_push_fragment_node(inline_site->line_table, frag); + + // build 
list of line table fragments per file + frag->next_src_file = ins_atomic_ptr_eval_assign(&src_file->line_table_frags, frag); + } + } + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_collect_obj_virtual_ranges_task) +{ + ProfBeginFunction(); + + LNK_ConvertUnitToRDITask *task = raw_task; + + U64 unit_idx = task_id; + LNK_Obj *obj = &task->obj_arr[unit_idx]; + + U64 unit_chunk_idx = unit_idx / task->unit_chunk_cap; + U64 local_unit_idx = unit_idx - unit_chunk_idx * task->unit_chunk_cap; + + RDIB_Unit *dst = &task->units[unit_chunk_idx].v[local_unit_idx]; + dst->virt_range_count = 0; + dst->virt_ranges = push_array_no_zero(arena, Rng1U64, obj->sect_count); + + for (U64 chunk_idx = 0; chunk_idx < obj->sect_count; ++chunk_idx) { + LNK_Chunk *chunk = &obj->chunk_arr[chunk_idx]; + if (!chunk || lnk_chunk_is_discarded(chunk)) { + continue; + } + + LNK_Section *sect = lnk_sect_from_chunk_ref(task->sect_id_map, chunk->ref); + if (!sect->has_layout) { + continue; + } + + U64 chunk_voff = lnk_virt_off_from_chunk_ref(task->sect_id_map, chunk->ref); + U64 chunk_size = lnk_virt_size_from_chunk_ref(task->sect_id_map, chunk->ref); + + if (chunk_size == 0) { + continue; + } + + dst->virt_ranges[dst->virt_range_count] = rng_1u64(chunk_voff, chunk_voff + chunk_size); + ++dst->virt_range_count; + } + + // free unused memory + arena_pop(arena, sizeof(dst->virt_ranges[0]) * (obj->sect_count - dst->virt_range_count)); + + ProfEnd(); +} + +internal String8List +lnk_build_rad_debug_info(TP_Context *tp, + TP_Arena *tp_arena, + OperatingSystem os, + RDI_Arch arch, + String8 image_name, + String8 image_data, + LNK_SectionArray image_sects, + LNK_Section **sect_id_map, + U64 obj_count, + LNK_Obj *obj_arr, + CV_DebugS *debug_s_arr, + U64 total_symbol_input_count, + LNK_CodeViewSymbolsInput *symbol_inputs, + CV_SymbolListArray *parsed_symbols, + CV_DebugT types[CV_TypeIndexSource_COUNT]) +{ + ProfBegin("RDI"); + Temp scratch = scratch_begin(0,0); + + RDIB_Input 
// Converts CodeView debug info gathered by the linker into RAD Debug Info and
// returns the data produced by rdib_finish().
//
// Steps, in order:
//   1. top level info (arch, image name, image hash, max virtual offset, producer string)
//   2. binary sections copied from 'image_sects'
//   3. type index ranges per type source, then CodeView -> RDIB type conversion
//   4. hash table of source files, then a copy of the source files into 'input.src_files'
//   5. units: compiler info, line tables, symbols, inline site line tables, virtual ranges
//   6. per-worker output lists concatenated into 'input' and handed to rdib_finish()
//
// All intermediate state is pushed on a scratch arena that is released before return.
internal String8List
lnk_build_rad_debug_info(TP_Context               *tp,
                         TP_Arena                 *tp_arena,
                         OperatingSystem           os,
                         RDI_Arch                  arch,
                         String8                   image_name,
                         String8                   image_data,
                         LNK_SectionArray          image_sects,
                         LNK_Section             **sect_id_map,
                         U64                       obj_count,
                         LNK_Obj                  *obj_arr,
                         CV_DebugS                *debug_s_arr,
                         U64                       total_symbol_input_count,
                         LNK_CodeViewSymbolsInput *symbol_inputs,
                         CV_SymbolListArray       *parsed_symbols,
                         CV_DebugT                 types[CV_TypeIndexSource_COUNT])
{
  ProfBegin("RDI");
  Temp scratch = scratch_begin(0,0);

  RDIB_Input input = rdib_init_input(scratch.arena);

  ProfBegin("Top Level Info");
  {
    // image virtual size is the largest end offset over all sections
    U64 image_vsize = 0;
    for (U64 sect_idx = 0; sect_idx < image_sects.count; sect_idx++) {
      LNK_Section *sect = &image_sects.v[sect_idx];
      U64 sect_virt_size = lnk_virt_size_from_chunk_ref(sect_id_map, sect->root->ref);
      U64 sect_voff_max = sect->virt_off + sect_virt_size;
      image_vsize = Max(image_vsize, sect_voff_max);
    }

    input.top_level_info.arch            = arch;
    input.top_level_info.exe_name        = image_name;
    input.top_level_info.exe_hash        = rdi_hash(image_data.str, image_data.size);
    input.top_level_info.voff_max        = image_vsize;
    input.top_level_info.producer_string = push_str8f(scratch.arena, "%s [Debug Info: CodeView]", BUILD_VERSION_STRING);
  }
  ProfEnd();

  ProfBegin("Sections");
  {
    input.sect_count = image_sects.count;
    input.sections   = push_array(scratch.arena, RDIB_BinarySection, image_sects.count);
    for (U64 sect_idx = 0; sect_idx < image_sects.count; ++sect_idx) {
      LNK_Section        *src = &image_sects.v[sect_idx];
      RDIB_BinarySection *dst = &input.sections[sect_idx];

      U64 sect_virt_size = lnk_virt_size_from_chunk_ref(sect_id_map, src->root->ref);
      U64 sect_file_size = lnk_file_size_from_chunk_ref(sect_id_map, src->root->ref);

      dst->name       = push_str8_copy(scratch.arena, src->name);
      dst->flags      = rdi_binary_section_flags_from_coff_section_flags(src->flags);
      dst->voff_first = src->virt_off;
      dst->voff_opl   = src->virt_off + sect_virt_size;
      dst->foff_first = src->file_off;
      dst->foff_opl   = src->file_off + sect_file_size;
    }
  }
  ProfEnd();

  // assign low and high type indices per source
  Rng1U64 itype_ranges[CV_TypeIndexSource_COUNT];
  for (U64 i = 0; i < ArrayCount(itype_ranges); ++i) {
    itype_ranges[i] = rng_1u64(CV_MinComplexTypeIndex, CV_MinComplexTypeIndex + types[i].count);
  }

  ProfBegin("Convert Types");
  U64                 udt_name_buckets_cap;
  LNK_UDTNameBucket **udt_name_buckets;
  RDIB_Type         **tpi_itype_map;
  {
    ProfBegin("Push TPI itype -> RDIB Type map");
    tpi_itype_map = push_array(scratch.arena, RDIB_Type *, itype_ranges[CV_TypeIndexSource_TPI].max);
    ProfEnd();

    ProfBegin("Push Built-in Types");
    RDIB_DataModel data_model = rdib_infer_data_model(os, arch);
    lnk_push_basic_itypes(scratch.arena, data_model, tpi_itype_map, &input.types);
    ProfEnd();

    // slot zero must still be empty after built-in types are pushed; it maps to the null type
    Assert(tpi_itype_map[0] == 0);
    tpi_itype_map[0] = input.null_type;

    ProfBegin("Build UDT Name Hash Table");
    // TODO: fix memory life-time
    udt_name_buckets_cap = 0;
    udt_name_buckets     = lnk_udt_name_hash_table_from_debug_t(tp, tp_arena, types[CV_TypeIndexSource_TPI], &udt_name_buckets_cap);
    ProfEnd();

    ProfBegin("Convert CodeView types to RDIB Types");
    // every worker gets its own output lists; TPI leaves are divided among workers
    LNK_ConvertTypesToRDI task = {0};
    task.types                         = types;
    task.type_cap                      = input.type_cap;
    task.udt_cap                       = input.udt_cap;
    task.variadic_type_ref             = rdib_make_type_ref(scratch.arena, input.variadic_type);
    task.itype_ranges                  = itype_ranges;
    task.tpi_itype_map                 = tpi_itype_map;
    task.udt_name_bucket_cap           = udt_name_buckets_cap;
    task.udt_name_buckets              = udt_name_buckets;
    task.rdib_types_lists              = push_array(scratch.arena, RDIB_TypeChunkList,      tp->worker_count);
    task.rdib_types_struct_lists       = push_array(scratch.arena, RDIB_TypeChunkList,      tp->worker_count);
    task.rdib_types_union_lists        = push_array(scratch.arena, RDIB_TypeChunkList,      tp->worker_count);
    task.rdib_types_enum_lists         = push_array(scratch.arena, RDIB_TypeChunkList,      tp->worker_count);
    task.rdib_types_udt_members_lists  = push_array(scratch.arena, RDIB_TypeChunkList,      tp->worker_count);
    task.rdib_types_enum_members_lists = push_array(scratch.arena, RDIB_TypeChunkList,      tp->worker_count);
    task.rdib_types_params_lists       = push_array(scratch.arena, RDIB_TypeChunkList,      tp->worker_count);
    task.rdib_udt_members_lists        = push_array(scratch.arena, RDIB_UDTMemberChunkList, tp->worker_count);
    task.rdib_enum_members_lists       = push_array(scratch.arena, RDIB_UDTMemberChunkList, tp->worker_count);
    task.ranges                        = tp_divide_work(scratch.arena, types[CV_TypeIndexSource_TPI].count, tp->worker_count);
    tp_for_parallel(tp, tp_arena, tp->worker_count, lnk_convert_types_to_rdi_task, &task);
    ProfEnd();

    ProfBegin("Concat converted types");
    rdib_type_chunk_list_concat_in_place_many      (&input.types,        task.rdib_types_lists,              tp->worker_count);
    rdib_type_chunk_list_concat_in_place_many      (&input.struct_list,  task.rdib_types_struct_lists,       tp->worker_count);
    rdib_type_chunk_list_concat_in_place_many      (&input.union_list,   task.rdib_types_union_lists,        tp->worker_count);
    rdib_type_chunk_list_concat_in_place_many      (&input.enum_list,    task.rdib_types_enum_lists,         tp->worker_count);
    rdib_type_chunk_list_concat_in_place_many      (&input.param_types,  task.rdib_types_params_lists,       tp->worker_count);
    rdib_type_chunk_list_concat_in_place_many      (&input.member_types, task.rdib_types_udt_members_lists,  tp->worker_count);
    rdib_type_chunk_list_concat_in_place_many      (&input.enum_types,   task.rdib_types_enum_members_lists, tp->worker_count);
    rdib_udt_member_chunk_list_concat_in_place_many(&input.udt_members,  task.rdib_udt_members_lists,        tp->worker_count);
    rdib_udt_member_chunk_list_concat_in_place_many(&input.enum_members, task.rdib_enum_members_lists,       tp->worker_count);
    ProfEnd();

    // types are converted and we can remove indirection and release 'itype_map'
    ProfBegin("Deref Type Refs");
    rdib_deref_type_refs(tp, &input.types);
    rdib_deref_type_refs(tp, &input.struct_list);
    rdib_deref_type_refs(tp, &input.union_list);
    rdib_deref_type_refs(tp, &input.enum_list);
    rdib_deref_type_refs(tp, &input.param_types);
    rdib_deref_type_refs(tp, &input.member_types);
    rdib_deref_type_refs(tp, &input.enum_types);
    ProfEnd();
  }
  ProfEnd();

  // Loop over source files in objs and build a hash table
  // for path -> source file maps. During symbol conversion
  // we use the hash table to lookup source files and append
  // inline site line tables.
  U64                    src_file_buckets_cap;
  LNK_SourceFileBucket **src_file_buckets;
  {
    ProfBegin("Build Source File Hash Table");

    LNK_ConvertSourceFilesToRDITask task = {0};
    task.obj_arr     = obj_arr;
    task.debug_s_arr = debug_s_arr;

    ProfBegin("Count Source Files");
    tp_for_parallel(tp, 0, obj_count, lnk_count_source_files_task, &task);
    ProfEnd();

    ProfBeginDynamic("Insert Source Files [Count %llu]", task.total_src_file_count);
    // bucket array is sized to 1.3x the counted source files
    // NOTE(review): the capacity is 0 when no source files were counted - confirm
    // that insertion and lookup tolerate an empty table
    task.src_file_buckets_cap = (U64)(task.total_src_file_count * 1.3);
    task.src_file_buckets     = push_array(tp_arena->v[0], LNK_SourceFileBucket*, task.src_file_buckets_cap);
    tp_for_parallel(tp, tp_arena, obj_count, lnk_insert_src_files_task, &task);
    ProfEnd();

    src_file_buckets_cap = task.src_file_buckets_cap;
    src_file_buckets     = task.src_file_buckets;

    ProfEnd();
  }

  // Copy source files to a contiguous array and update source file pointers
  // in buckets so we can do lookup and compute source file index in output array
  // with a pointer subtraction.
  ProfBegin("Source Files");
  for (U64 bucket_idx = 0; bucket_idx < src_file_buckets_cap; ++bucket_idx) {
    LNK_SourceFileBucket *bucket = src_file_buckets[bucket_idx];
    if (bucket != 0) {
      RDIB_SourceFile *new_src_file = rdib_source_file_chunk_list_push(scratch.arena, &input.src_files, input.src_file_chunk_cap);

      // restore chunk pointer after copy
      RDIB_SourceFileChunk *new_src_file_chunk = new_src_file->chunk;
      *new_src_file = *bucket->src_file;
      new_src_file->chunk = new_src_file_chunk;

      bucket->src_file = new_src_file;
    }
  }
  ProfEnd();

  ProfBegin("Units");
  {
    // shared task context; output lists are allocated once per worker
    LNK_ConvertUnitToRDITask task = {0};
    task.image_sects           = image_sects;
    task.sect_id_map           = sect_id_map;
    task.obj_arr               = obj_arr;
    task.debug_s_arr           = debug_s_arr;
    task.ipi                   = types[CV_TypeIndexSource_IPI];
    task.symbol_inputs         = symbol_inputs;
    task.parsed_symbols        = parsed_symbols;
    task.ipi_itype_range       = itype_ranges[CV_TypeIndexSource_IPI];
    task.tpi_itype_range       = itype_ranges[CV_TypeIndexSource_TPI];
    task.tpi_itype_map         = tpi_itype_map;
    task.src_file_buckets_cap  = src_file_buckets_cap;
    task.src_file_buckets      = src_file_buckets;
    task.udt_name_buckets      = udt_name_buckets;
    task.udt_name_buckets_cap  = udt_name_buckets_cap;
    task.src_file_chunk_cap    = input.src_file_chunk_cap;
    task.line_table_cap        = input.line_table_cap;
    task.symbol_chunk_cap      = input.symbol_chunk_cap;
    task.unit_chunk_cap        = input.unit_chunk_cap;
    task.inline_site_cap       = input.inline_site_cap;
    task.null_line_table       = input.null_line_table;
    task.extern_symbol_voff_ht = hash_table_init(scratch.arena, 256);
    task.units                 = rdib_unit_chunk_list_reserve_ex(scratch.arena, &input.units, input.unit_chunk_cap, obj_count);
    task.scopes                = push_array(scratch.arena, RDIB_ScopeChunkList,      tp->worker_count);
    task.locals                = push_array(scratch.arena, RDIB_VariableChunkList,   tp->worker_count);
    task.extern_gvars          = push_array(scratch.arena, RDIB_VariableChunkList,   tp->worker_count);
    task.static_gvars          = push_array(scratch.arena, RDIB_VariableChunkList,   tp->worker_count);
    task.extern_tvars          = push_array(scratch.arena, RDIB_VariableChunkList,   tp->worker_count);
    task.static_tvars          = push_array(scratch.arena, RDIB_VariableChunkList,   tp->worker_count);
    task.extern_procs          = push_array(scratch.arena, RDIB_ProcedureChunkList,  tp->worker_count);
    task.static_procs          = push_array(scratch.arena, RDIB_ProcedureChunkList,  tp->worker_count);
    task.inline_sites          = push_array(scratch.arena, RDIB_InlineSiteChunkList, tp->worker_count);
    task.line_tables           = push_array(scratch.arena, RDIB_LineTableChunkList,  tp->worker_count);

    ProfBegin("Gather Compiler Info");
    task.comp_info_arr = push_array(scratch.arena, LNK_CodeViewCompilerInfo, obj_count);
    tp_for_parallel(tp, tp_arena, obj_count, lnk_find_obj_compiler_info_task, &task);
    ProfEnd();

    ProfBegin("Convert Line Tables");
    tp_for_parallel(tp, tp_arena, obj_count, lnk_convert_line_tables_to_rdi_task, &task);
    ProfEnd();

    ProfBegin("Convert Symbols");
    tp_for_parallel(tp, tp_arena, total_symbol_input_count, lnk_convert_symbols_to_rdi_task, &task);
    ProfEnd();

    ProfBegin("Convert Inline Sites Line Tables");
    // inline sites from all workers are merged first, then one task runs per entry of
    // the chunk array built from the merged list
    rdib_inline_site_chunk_list_concat_in_place_many(&input.inline_sites, task.inline_sites, tp->worker_count);
    task.inline_site_chunks = rdib_array_from_inline_site_chunk_list(scratch.arena, input.inline_sites);
    tp_for_parallel(tp, tp_arena, input.inline_sites.count, lnk_convert_inline_site_line_tables_task, &task);
    ProfEnd();

    ProfBegin("Collect Units Virtual Ranges");
    tp_for_parallel(tp, tp_arena, obj_count, lnk_collect_obj_virtual_ranges_task, &task);
    ProfEnd();

    // merge the remaining per-worker outputs into the builder input
    rdib_line_table_chunk_list_concat_in_place_many(&input.line_tables, task.line_tables, tp->worker_count);
    rdib_scope_chunk_list_concat_in_place_many(&input.scopes, task.scopes, tp->worker_count);
    rdib_variable_chunk_list_concat_in_place_many(&input.locals, task.locals, tp->worker_count);
    rdib_variable_chunk_list_concat_in_place_many(&input.extern_gvars, task.extern_gvars, tp->worker_count);
    rdib_variable_chunk_list_concat_in_place_many(&input.static_gvars, task.static_gvars, tp->worker_count);
    rdib_variable_chunk_list_concat_in_place_many(&input.extern_tvars, task.extern_tvars, tp->worker_count);
    rdib_variable_chunk_list_concat_in_place_many(&input.static_tvars, task.static_tvars, tp->worker_count);
    rdib_procedure_chunk_list_concat_in_place_many(&input.extern_procs, task.extern_procs, tp->worker_count);
    rdib_procedure_chunk_list_concat_in_place_many(&input.static_procs, task.static_procs, tp->worker_count);
  }
  ProfEnd();

  String8List rdi_data = rdib_finish(tp, tp_arena, &input);

  scratch_end(scratch);
  ProfEnd();
  return rdi_data;
}
// Bit 31 flags for leaf references (presumably applied to the U32 fields
// 'enc_loc_idx' and 'enc_leaf_idx' of LNK_LeafRef - confirm at the use sites).
// The literal is unsigned: shifting a signed 1 into bit 31 is not representable
// in 'int' (undefined behavior) and would sign-extend when widened to 64 bits.
#define LNK_LeafRefFlag_LocIdxExternal (1u << 31)
#define LNK_LeafRefFlag_LeafIdxIPI     (1u << 31)
**internal_hashes; + U128Array **external_hashes; + }; + U128Array **v[CV_TypeIndexSource_COUNT]; +} LNK_LeafHashes; + +//////////////////////////////// + +typedef struct +{ + LNK_Obj **obj_arr; + LNK_ChunkList *sect_list_arr; + CV_DebugS *debug_s_arr; +} LNK_ParseDebugSTaskData; + +typedef struct +{ + LNK_Obj **obj_arr; + String8Array *data_arr_arr; +} LNK_CheckDebugTSigTaskData; + +typedef struct +{ + LNK_Obj **obj_arr; + String8Array *data_arr_arr; + CV_DebugT *debug_t_arr; +} LNK_ParseDebugTTaskData; + +typedef struct +{ + String8 *path_arr; + String8 *msf_data_arr; + Rng1U64 **external_ti_ranges; + CV_DebugT **external_leaves; + B8 *is_corrupted; +} LNK_GetExternalLeavesTask; + +//////////////////////////////// + +typedef struct +{ + LNK_LeafRangeList *leaf_ranges_per_task; + U64 **count_arr_arr; +} LNK_CountPerSourceLeafTask; + +typedef struct +{ + LNK_CodeViewInput *input; + LNK_LeafHashes *hashes; + Arena **fixed_arenas; + CV_DebugT *debug_t_arr; +} LNK_LeafHasherTask; + +typedef struct +{ + LNK_CodeViewInput *input; + LNK_LeafHashes *hashes; + LNK_LeafHashTable *leaf_ht_arr; + CV_DebugT *debug_t_arr; +} LNK_LeafDedupInternal; + +typedef struct +{ + LNK_CodeViewInput *input; + LNK_LeafHashes *hashes; + LNK_LeafHashTable *leaf_ht_arr; + CV_TypeIndexSource dedup_ti_source; +} LNK_LeafDedupExternal; + +typedef struct +{ + LNK_LeafHashTable *ht; + U64 *count_arr; + Rng1U64 *range_arr; + U64 *offset_arr; + LNK_LeafBucketArray result; +} LNK_GetPresentBucketsTask; + +typedef struct +{ + U64 loc_idx_bit_count_0; + U64 loc_idx_bit_count_1; + U64 loc_idx_bit_count_2; + U64 counts_max; + U32 **counts_arr; + Rng1U64 *ranges; + LNK_LeafBucket **dst; + LNK_LeafBucket **src; + U64 loc_idx_max; + U64 pass_idx; +} LNK_LeafRadixSortTask; + +typedef struct +{ + U32 *counts; + U32 *offsets; + LNK_LeafBucket **dst; + LNK_LeafBucket **src; + Rng1U64 *ranges; +} LNK_LeafLocRadixSortTask; + +typedef struct +{ + Rng1U64 *range_arr; + CV_TypeIndex min_type_index; + 
LNK_LeafBucketArray bucket_arr; +} LNK_AssignTypeIndicesTask; + +typedef struct +{ + LNK_CodeViewInput *input; + LNK_LeafBucket **bucket_arr; + U8 **raw_leaf_arr; + Rng1U64 *range_arr; +} LNK_UnbucketRawLeavesTask; + +typedef struct +{ + LNK_CodeViewInput *input; + LNK_LeafHashes *hashes; + LNK_LeafHashTable *leaf_ht_arr; + CV_SymbolList *symbol_list_arr; + Arena **arena_arr; +} LNK_PatchSymbolTypesTask; + +typedef struct +{ + LNK_CodeViewInput *input; + LNK_LeafHashes *hashes; + LNK_LeafHashTable *leaf_ht_arr; + CV_DebugS *debug_s_arr; +} LNK_PatchInlinesTask; + +typedef struct +{ + LNK_CodeViewInput *input; + LNK_LeafHashes *hashes; + LNK_LeafHashTable *leaf_ht_arr; + LNK_LeafBucket **bucket_arr; + Rng1U64 *range_arr; + Arena **fixed_arena_arr; +} LNK_PatchLeavesTask; + +//////////////////////////////// + +typedef struct +{ + String8List *data_list_arr; +} LNK_ProcessedCodeViewC11Data; + +typedef struct +{ + String8List *data_list_arr; + String8List *source_file_names_list_arr; +} LNK_ProcessedCodeViewC13Data; + +typedef struct +{ + LNK_CodeViewSymbolsInput *inputs; +} LNK_ParseCVSymbolsTaskData; + +typedef struct +{ + U64 total_symbol_input_count; + LNK_CodeViewSymbolsInput *symbol_inputs; + CV_SymbolListArray *parsed_symbols; + PDB_DbiModule **mod_arr; + String8List *symbol_data_arr; + CV_SymbolList *gsi_list_arr; +} LNK_ProcessSymDataTaskData; + +typedef struct +{ + CV_DebugS *debug_s_arr; + MSF_Context *msf; + PDB_DbiModule **dbi_mod_arr; + String8List *c13_data_arr; + String8List *source_file_names_list_arr; + U64 string_data_base_offset; + CV_StringHashTable string_ht; +} LNK_ProcessC13DataTask; + +typedef struct +{ + MSF_Context *msf; + PDB_DbiModule **mod_arr; + String8List *symbol_data_arr; + String8List *c11_data_list_arr; + String8List *c13_data_list_arr; + String8List *globrefs_arr; +} LNK_WriteModuleDataTask; + +typedef struct +{ + LNK_Obj *obj_arr; + LNK_Section **sect_id_map; + PDB_DbiModule **mod_arr; + PDB_DbiSectionContribList *sc_list; +} 
LNK_PushDbiSecContribTaskData; + +typedef struct +{ + U32Array *hash_arr_arr; + CV_SymbolList *list_arr; +} LNK_HashCVSymbolListTask; + +typedef struct +{ + U64 *hash_arr; + CV_SymbolNode **arr; + Rng1U64 *range_arr; +} LNK_CvSymbolPtrArrayHasher; + +typedef struct +{ + LNK_Section **sect_id_map; + LNK_SymbolScopeIndex scope_idx; + LNK_SymbolList **bucket_arr; + CV_SymbolList *pub_list_arr; + + Rng1U64 *symbol_ranges; + PDB_GsiContext *gsi; + CV_SymbolPtrArray symbols; + U32 *hashes; +} LNK_BuildPublicSymbolsTaskData; + +typedef struct +{ + CV_TypeIndex ipi_min_type_index; + CV_DebugT ipi_types; + LNK_CodeViewSymbolsInput *symbol_inputs; + CV_SymbolListArray *parsed_symbols; +} LNK_PostProcessCvSymbolsTask; + +typedef struct +{ + Rng1U64 *range_arr; + CV_SymbolPtrNode **bucket_arr; + CV_SymbolPtrNode **out_arr; + U64 *out_count_arr; +} LNK_GsiDeduper; + +typedef struct +{ + Rng1U64 *range_arr; + CV_SymbolPtrNode **bucket_arr; + U64 *symbol_base_arr; + CV_SymbolNode **symbol_arr; +} LNK_GsiUnbucket; + +//////////////////////////////// +// RAD Debug Info + +typedef struct +{ + String8 name; + U64 leaf_idx; +} LNK_UDTNameBucket; + +typedef struct +{ + CV_DebugT debug_t; + Rng1U64 *ranges; + U64 buckets_cap; + LNK_UDTNameBucket **buckets; +} LNK_BuildUDTNameHashTableTask; + +typedef struct +{ + CV_DebugT debug_t; + CV_TypeIndex ti_lo; + Rng1U64 *ranges; + U64 udt_name_buckets_cap; + LNK_UDTNameBucket **udt_name_buckets; + CV_TypeIndex *fwdmap; +} LNK_BuildUDTFwdMapTask; + +//////////////////////////////// + +typedef struct +{ + CV_DebugT *types; + U64 type_cap; + U64 udt_cap; + RDIB_TypeRef variadic_type_ref; + Rng1U64 *itype_ranges; + U64 udt_name_bucket_cap; + LNK_UDTNameBucket **udt_name_buckets; + RDIB_Type **tpi_itype_map; + RDIB_TypeChunkList *rdib_types_lists; + RDIB_TypeChunkList *rdib_types_struct_lists; + RDIB_TypeChunkList *rdib_types_union_lists; + RDIB_TypeChunkList *rdib_types_enum_lists; + RDIB_TypeChunkList *rdib_types_params_lists; + RDIB_TypeChunkList 
// Shared context for the unit conversion tasks that lnk_build_rad_debug_info runs
// on the thread pool (compiler info, line tables, symbols, inline site line tables,
// virtual ranges). Input fields are filled once before the tasks start; the output
// lists are allocated with one entry per worker and merged after the tasks finish.
typedef struct
{
  LNK_SectionArray           image_sects;
  LNK_Section              **sect_id_map;
  LNK_Obj                   *obj_arr;                 // indexed by object index
  CV_DebugS                 *debug_s_arr;             // indexed by object index
  CV_DebugT                  ipi;                     // types[CV_TypeIndexSource_IPI]
  LNK_CodeViewSymbolsInput  *symbol_inputs;
  CV_SymbolListArray        *parsed_symbols;
  Rng1U64                    ipi_itype_range;
  Rng1U64                    tpi_itype_range;
  RDIB_Type                **tpi_itype_map;           // TPI type index -> converted RDIB type
  U64                        src_file_buckets_cap;
  LNK_SourceFileBucket     **src_file_buckets;        // [src_file_buckets_cap]
  LNK_UDTNameBucket        **udt_name_buckets;        // [udt_name_buckets_cap]
  U64                        line_table_cap;
  U64                        udt_name_buckets_cap;
  U64                        src_file_chunk_cap;
  U64                        symbol_chunk_cap;
  U64                        unit_chunk_cap;          // number of units per entry of 'units'
  U64                        inline_site_cap;
  RDIB_LineTable            *null_line_table;         // assigned to inline sites that have no lines
  HashTable                 *extern_symbol_voff_ht;
  LNK_CodeViewCompilerInfo  *comp_info_arr;           // [obj_count]
  CV_InlineeLinesAccel     **inlinee_lines_accel_arr; // NOTE(review): not assigned by lnk_build_rad_debug_info - confirm who fills it

  // array built from the merged inline site chunk list; indexed by task id in
  // lnk_convert_inline_site_line_tables_task
  RDIB_InlineSiteChunk **inline_site_chunks;

  // output
  RDIB_UnitChunk           *units;        // reserved up front for obj_count units
  RDIB_VariableChunkList   *locals;       // [worker_count]
  RDIB_ScopeChunkList      *scopes;       // [worker_count]
  RDIB_VariableChunkList   *extern_gvars; // [worker_count]
  RDIB_VariableChunkList   *static_gvars; // [worker_count]
  RDIB_VariableChunkList   *extern_tvars; // [worker_count]
  RDIB_VariableChunkList   *static_tvars; // [worker_count]
  RDIB_ProcedureChunkList  *extern_procs; // [worker_count]
  RDIB_ProcedureChunkList  *static_procs; // [worker_count]
  RDIB_InlineSiteChunkList *inline_sites; // [worker_count]
  RDIB_LineTableChunkList  *line_tables;  // [worker_count]
} LNK_ConvertUnitToRDITask;
*arena, U64 obj_count, LNK_Obj **obj_arr, LNK_ChunkList *sect_list_arr); +internal CV_DebugT * lnk_parse_debug_t_sections(TP_Context *tp, TP_Arena *arena, U64 obj_count, LNK_Obj **obj_arr, LNK_ChunkList *debug_t_list_arr); +internal CV_SymbolList * lnk_cv_symbol_list_arr_from_debug_s_arr(TP_Context *tp, TP_Arena *arena, U64 obj_count, CV_DebugS *debug_s_arr); +internal LNK_PchInfo * lnk_setup_pch(Arena *arena, U64 obj_count, LNK_Obj *obj_arr, CV_DebugT *debug_t_arr, CV_DebugT *debug_p_arr, CV_SymbolListArray *parsed_symbols); + +internal LNK_CodeViewInput lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, String8List lib_dir_list, LNK_ObjList obj_list); + +internal LNK_LeafRef lnk_leaf_ref(U32 idx, U32 leaf_idx); +internal LNK_LeafRef lnk_obj_leaf_ref(U32 obj_idx, U32 leaf_idx); +internal LNK_LeafRef lnk_ts_leaf_ref(CV_TypeIndexSource ti_source, U32 ts_idx, U32 leaf_idx); +internal int lnk_leaf_ref_compare(LNK_LeafRef a, LNK_LeafRef b); +internal LNK_LeafLocType lnk_loc_type_from_leaf_ref(LNK_LeafRef leaf_ref); +internal LNK_LeafLocType lnk_loc_type_from_obj_idx(LNK_CodeViewInput *input, U64 obj_idx); +internal U64 lnk_loc_idx_from_obj_idx(LNK_CodeViewInput *input, U64 obj_idx); +internal CV_TypeIndex lnk_ti_lo_from_loc(LNK_CodeViewInput *input, LNK_LeafLocType loc_type, U64 loc_idx, CV_TypeIndexSource ti_source); +internal CV_TypeIndex lnk_ti_lo_from_leaf_ref(LNK_CodeViewInput *input, LNK_LeafRef leaf_ref); +internal String8 lnk_data_from_leaf_ref(LNK_CodeViewInput *input, LNK_LeafRef leaf_ref); +internal CV_Leaf lnk_cv_leaf_from_leaf_ref(LNK_CodeViewInput *input, LNK_LeafRef leaf_ref); +internal U128 lnk_hash_from_leaf_ref(LNK_LeafHashes *hashes, LNK_LeafRef leaf_ref); +internal LNK_LeafRef lnk_leaf_ref_from_loc_idx_and_ti(LNK_CodeViewInput *input, LNK_LeafLocType loc_type, CV_TypeIndexSource ti_source, U64 loc_idx, CV_TypeIndex obj_ti); +internal B32 lnk_match_leaf_ref(LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafRef a, LNK_LeafRef b); 
+internal B32 lnk_match_leaf_ref_deep(Arena *arena, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafRef a, LNK_LeafRef b); +internal U128 lnk_hash_cv_leaf(Arena *arena, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafLocType loc_type, U32 loc_idx, Rng1U64 *ti_ranges, CV_TypeIndex curr_ti, CV_Leaf leaf, CV_TypeIndexInfoList ti_info_list); +internal void lnk_hash_cv_leaf_deep(Arena *arena, LNK_CodeViewInput *input, Rng1U64 *ti_ranges, CV_DebugT *leaves, LNK_LeafHashes *hashes, LNK_LeafLocType loc_type, U32 loc_idx, CV_TypeIndexInfoList ti_info_list, String8 data); +internal LNK_LeafBucket * lnk_leaf_hash_table_insert_or_update(LNK_LeafHashTable *leaf_ht, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, U128 hash, LNK_LeafBucket *new_bucket); +internal LNK_LeafBucket * lnk_leaf_hash_table_search(LNK_LeafHashTable *ht, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafRef leaf_ref); + +internal void lnk_cv_debug_t_count_leaves_per_source(TP_Context *tp, U64 count, CV_DebugT *debug_t_arr, U64 *per_source_count_arr); +internal void lnk_hash_debug_t_arr(TP_Context *tp, Arena *arena, U64 obj_count, CV_DebugT *debug_t_arr, U128Array *hash_arr_arr); +internal LNK_LeafBucketArray lnk_present_bucket_array_from_leaf_hash_table(TP_Context *tp, Arena *arena, LNK_LeafHashTable *ht); +internal void lnk_leaf_bucket_array_sort_radix_subset_parallel(TP_Context *tp, U64 bucket_count, U64 loc_idx_max, LNK_LeafBucket **dst, LNK_LeafBucket **src); +internal void lnk_leaf_bucket_array_sort_radix_parallel(TP_Context *tp, LNK_LeafBucketArray arr, U64 obj_count, U64 type_server_count); +internal void lnk_assign_type_indices(TP_Context *tp, LNK_LeafBucketArray bucket_arr, CV_TypeIndex min_type_index); +internal void lnk_patch_symbols(TP_Context *tp, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafHashTable *leaf_ht_arr); +internal void lnk_patch_inlines(TP_Context *tp, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafHashTable *leaf_ht_arr, U64 
obj_count, CV_DebugS *debug_s_arr); +internal void lnk_patch_leaves(TP_Context *tp, LNK_CodeViewInput *input, LNK_LeafHashes *hashes, LNK_LeafHashTable *leaf_ht_arr, LNK_LeafBucketArray bucket_arr); +internal String8Node * lnk_copy_raw_leaf_arr_to_type_server(TP_Context *tp, CV_DebugT types, PDB_TypeServer *type_server); +internal CV_DebugT * lnk_import_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input); + +//////////////////////////////// +// RAD Debug info + +internal U64 lnk_udt_name_hash_table_hash(String8 string); +internal LNK_UDTNameBucket ** lnk_udt_name_hash_table_from_debug_t(TP_Context *tp, TP_Arena *arena, CV_DebugT debug_t, U64 *buckets_cap_out); +internal LNK_UDTNameBucket * lnk_udt_name_hash_table_lookup(LNK_UDTNameBucket **buckets, U64 cap, String8 name); +internal CV_TypeIndex * lnk_build_udt_fwdmap(TP_Context *tp, Arena *arena, CV_DebugT debug_t, CV_TypeIndex ti_lo, LNK_UDTNameBucket **udt_name_buckets, U64 udt_name_buckets_cap); + +internal RDIB_TypeRef lnk_rdib_type_from_itype(LNK_ConvertTypesToRDI *task, CV_TypeIndex itype); +internal RDI_MemberKind lnk_rdib_method_kind_from_cv_prop(CV_MethodProp prop); +internal LNK_SourceFileBucket * lnk_src_file_hash_table_hash(String8 file_path, CV_C13ChecksumKind checksum_kind, String8 checksum_bytes); +internal LNK_SourceFileBucket * lnk_src_file_hash_table_lookup_slot(LNK_SourceFileBucket **src_file_buckets, U64 src_file_buckets_cap, U64 hash, String8 file_path, CV_C13ChecksumKind checksum_kind, String8 checksum_bytes); + +internal String8List lnk_build_rad_debug_info(TP_Context *tp, + TP_Arena *tp_arena, + OperatingSystem os, + RDI_Arch arch, + String8 image_name, + String8 image_data, + LNK_SectionArray image_sects, + LNK_Section **sect_id_map, + U64 obj_count, + LNK_Obj *obj_arr, + CV_DebugS *debug_s_arr, + U64 total_symbol_input_count, + LNK_CodeViewSymbolsInput *symbol_inputs, + CV_SymbolListArray *parsed_symbols, + CV_DebugT types[CV_TypeIndexSource_COUNT]); + 
+//////////////////////////////// +// PDB + +internal LNK_ProcessedCodeViewC11Data lnk_process_c11_data(TP_Context *tp, TP_Arena *arena, U64 obj_count, CV_DebugS *debug_s_arr, U64 string_data_base_offset, CV_StringHashTable string_ht, MSF_Context *msf, PDB_DbiModule **mod_arr); +internal LNK_ProcessedCodeViewC13Data lnk_process_c13_data(TP_Context *tp, TP_Arena *arena, U64 obj_count, CV_DebugS *debug_s_arr, U64 string_data_base_offset, CV_StringHashTable string_ht, MSF_Context *msf, PDB_DbiModule **mod_arr); +internal U64 * lnk_hash_cv_symbol_ptr_arr(TP_Context *tp, Arena *arena, CV_SymbolPtrArray arr); +internal CV_SymbolPtrArray lnk_dedup_gsi_symbols(TP_Context *tp, Arena *arena, PDB_GsiContext *gsi, U64 obj_count, CV_SymbolList *symbol_list_arr); + +internal void lnk_build_pdb_public_symbols(TP_Context *tp, + TP_Arena *arena, + LNK_SymbolTable *symtab, + LNK_Section **sect_id_map, + PDB_PsiContext *psi, + LNK_SymbolScopeIndex scope_idx); + +internal String8List lnk_build_pdb(TP_Context *tp, + TP_Arena *tp_arena, + OS_Guid guid, + COFF_MachineType machine, + COFF_TimeStamp time_stamp, + U32 age, + U64 page_size, + String8 pdb_name, + String8List lib_dir_list, + String8List natvis_list, + LNK_SymbolTable *symtab, + LNK_Section **sect_id_map, + U64 obj_count, + LNK_Obj *obj_arr, + CV_DebugS *debug_s_arr, + U64 total_symbol_input_count, + LNK_CodeViewSymbolsInput *symbol_inputs, + CV_SymbolListArray *parsed_symbols, + CV_DebugT types[CV_TypeIndexSource_COUNT]); + +//////////////////////////////// +// RAD Debug Info + +internal U64 lnk_udt_name_hash_table_hash(String8 string); +internal LNK_UDTNameBucket ** lnk_udt_name_hash_table_from_debug_t(TP_Context *tp, TP_Arena *arena, CV_DebugT debug_t, U64 *buckets_cap_out); +internal LNK_UDTNameBucket * lnk_udt_name_hash_table_lookup(LNK_UDTNameBucket **buckets, U64 cap, String8 name); + +internal CV_TypeIndex * lnk_build_udt_fwdmap(TP_Context *tp, + Arena *arena, + CV_DebugT debug_t, + CV_TypeIndex ti_lo, + 
LNK_UDTNameBucket **udt_name_buckets,
+                                                 U64 udt_name_buckets_cap);
+
+internal void lnk_init_rdib_itype_map(Arena *arena, RDI_Arch arch, RDIB_Type **itype_map, RDIB_TypeChunkList *rdib_types_list);
+internal RDIB_TypeRef lnk_rdib_type_from_itype(LNK_ConvertTypesToRDI *task, CV_TypeIndex itype);
+internal RDI_MemberKind lnk_rdib_method_kind_from_cv_prop(CV_MethodProp prop);
+
diff --git a/src/linker/lnk_directive.c b/src/linker/lnk_directive.c
new file mode 100644
index 00000000..8857cc3a
--- /dev/null
+++ b/src/linker/lnk_directive.c
@@ -0,0 +1,203 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+// Appends both string lists of `to_concat` to the matching lists of `list`.
+internal void
+lnk_alt_name_list_concat_in_place(LNK_AltNameList *list, LNK_AltNameList *to_concat)
+{
+  str8_list_concat_in_place(&list->from_list, &to_concat->from_list);
+  str8_list_concat_in_place(&list->to_list, &to_concat->to_list);
+}
+
+// Allocates a node for `data` on `arena`, appends it to `list`, and returns it.
+internal LNK_MergeDirectiveNode *
+lnk_merge_directive_list_push(Arena *arena, LNK_MergeDirectiveList *list, LNK_MergeDirective data)
+{
+  LNK_MergeDirectiveNode *node = push_array_no_zero(arena, LNK_MergeDirectiveNode, 1);
+  node->data = data;
+  node->next = 0;
+
+  SLLQueuePush(list->first, list->last, node);
+  ++list->count;
+
+  return node;
+}
+
+////////////////////////////////
+
+// Parses the directive text in `buffer` with the Windows command-line rules and
+// appends one LNK_Directive per option (id and values copied onto `arena`) to
+// the list in `directive_info` selected by the directive's kind. A name that is
+// not in the table is reported as an unknown directive for `obj_path` and is
+// still recorded, under LNK_Directive_Null.
+internal void
+lnk_parse_directives(Arena *arena, LNK_DirectiveInfo *directive_info, String8 buffer, String8 obj_path)
+{
+  Temp scratch = scratch_begin(&arena, 1);
+
+  String8 unparsed_directives = buffer;
+  {
+    // Both signature checks compare three bytes, so they are skipped for shorter buffers.
+    static const U8 BOM_SIG[] = { 0xEF, 0xBB, 0xBF };
+    B32 is_bom = buffer.size >= sizeof(BOM_SIG) && MemoryMatch(buffer.str, &BOM_SIG[0], sizeof(BOM_SIG));
+    if (is_bom) {
+      unparsed_directives = str8_zero();
+      lnk_not_implemented("TODO: support for BOM encoding");
+    }
+    static const U8 ASCII_SIG[] = { 0x20, 0x20, 0x20 };
+    B32 is_ascii = buffer.size >= sizeof(ASCII_SIG) && MemoryMatch(buffer.str, &ASCII_SIG[0], sizeof(ASCII_SIG));
+    if (is_ascii) {
+      unparsed_directives = str8_skip(buffer, sizeof(ASCII_SIG));
+    }
+  }
+
+  String8List arg_list = lnk_arg_list_parse_windows_rules(scratch.arena, unparsed_directives);
+  LNK_CmdLine cmd_line = lnk_cmd_line_parse_windows_rules(scratch.arena, arg_list);
+
+  for (LNK_CmdOption *opt = cmd_line.first_option; opt != 0; opt = opt->next) {
+    static struct {
+      LNK_DirectiveKind kind;
+      String8 name;
+    } directive_table[LNK_Directive_Count] = {
+      { LNK_Directive_Null, str8_lit_comp("") },
+      { LNK_Directive_DefaultLib, str8_lit_comp("defaultlib") },
+      { LNK_Directive_Export, str8_lit_comp("export" ) },
+      { LNK_Directive_Include, str8_lit_comp("include") },
+      { LNK_Directive_ManifestDependency, str8_lit_comp("manifestdependency") },
+      { LNK_Directive_Merge, str8_lit_comp("merge") },
+      { LNK_Directive_Section, str8_lit_comp("section") },
+      { LNK_Directive_AlternateName, str8_lit_comp("alternatename") },
+      { LNK_Directive_GuardSym, str8_lit_comp("guardsym") },
+      { LNK_Directive_DisallowLib, str8_lit_comp("disallowlib") },
+      { LNK_Directive_FailIfMismatch, str8_lit_comp("failifmismatch") },
+      { LNK_Directive_EditAndContinue, str8_lit_comp("editandcontinue") },
+      { LNK_Directive_ThrowingNew, str8_lit_comp("throwingnew") },
+    };
+
+    LNK_DirectiveKind kind = LNK_Directive_Null;
+    for (U64 i = 0; i < ArrayCount(directive_table); ++i) {
+      if (str8_match(directive_table[i].name, opt->string, StringMatchFlag_CaseInsensitive)) {
+        kind = directive_table[i].kind;
+        break;
+      }
+    }
+    if (kind == LNK_Directive_Null) {
+      lnk_error(LNK_Warning_UnknownDirective, "%S: unknown directive \"%S\"", obj_path, opt->string);
+    }
+
+    LNK_Directive *directive = push_array_no_zero(arena, LNK_Directive, 1);
+    directive->next = 0;
+    directive->id = push_str8_copy(arena, opt->string);
+    directive->value_list = str8_list_copy(arena, &opt->value_strings);
+
+    LNK_DirectiveList *directive_list = &directive_info->v[kind];
+    SLLQueuePush(directive_list->first, directive_list->last, directive);
+    ++directive_list->count;
+  }
+
+  scratch_end(scratch);
+}
+
+// Collects every value of the directives in `dir_list` into a list of library
+// names allocated on `arena`. ".lib" is appended when str8_skip_last_dot hands
+// back the whole string, which this code treats as "no extension present".
+internal String8List
+lnk_parse_default_lib_directive(Arena *arena, LNK_DirectiveList *dir_list)
+{
+  ProfBeginFunction();
+  String8List default_libs = {0};
+
+  for (LNK_Directive *dir = dir_list->first; dir != 0; dir = dir->next) {
+    for (String8Node *i = dir->value_list.first; i != 0; i = i->next) {
+      String8 lib_path = i->string;
+
+      // is there lib extension?
+      String8 ext = str8_skip_last_dot(lib_path);
+      if (ext.size == lib_path.size) { // TODO: fix string_extension_from_path, if there is no extension it should return zero
+        lib_path = push_str8f(arena, "%S.lib", lib_path);
+      } else {
+        lib_path = push_str8_copy(arena, lib_path);
+      }
+
+
+      str8_list_push(arena, &default_libs, lib_path);
+    }
+  }
+
+  ProfEnd();
+  return default_libs;
+}
+
+// Parses one export directive. The first value is split on '=': its last piece
+// becomes `name` and, when there are exactly two pieces, the first becomes
+// `alias`. A second value, if present, replaces the default type string (the one
+// for COFF_ImportHeaderType_CODE). On success a node is appended to `list` and
+// returned; with an empty name the error is reported via lnk_error_obj and 0 is returned.
+internal LNK_ExportParse *
+lnk_parse_export_direcive(Arena *arena, LNK_ExportParseList *list, String8List value_list, LNK_Obj *obj)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena, 1);
+  LNK_ExportParse *parse = 0;
+
+  // parse directive
+  String8 name = str8(0,0);
+  String8 alias = str8(0,0);
+  String8 type = coff_string_from_import_header_type(COFF_ImportHeaderType_CODE);
+  if (value_list.node_count > 0) {
+    String8List dir_split = str8_split_by_string_chars(scratch.arena, value_list.first->string, str8_lit("="), 0);
+    B32 is_export_valid = value_list.node_count <= 2 && value_list.node_count > 0;
+    if (is_export_valid) {
+      if (dir_split.node_count > 0) {
+        name = dir_split.last->string;
+      }
+      if (dir_split.node_count == 2) {
+        alias = dir_split.first->string;
+      }
+      if (value_list.node_count == 2) {
+        type = value_list.last->string;
+      }
+    }
+  }
+
+  // parse error check
+  if (name.size == 0) {
+    String8 dir = str8_list_join(scratch.arena, &value_list, 0);
+    lnk_error_obj(LNK_Error_IllData, obj, "invalid export directive \"%S\"", dir);
+    goto exit;
+  }
+
+  parse = push_array_no_zero(arena, LNK_ExportParse, 1);
+  parse->next = 0;
+  parse->name = name;
+  parse->alias = alias;
+  parse->type = type;
+
+  SLLQueuePush(list->first, list->last, parse);
+  ++list->count;
+
+exit:;
+  scratch_end(scratch);
+  ProfEnd();
+  return parse;
+}
+
+// Splits `string` on '=' and, when exactly two pieces result, stores them in
+// `out->src` and `out->dst`. Returns 1 on success and 0 otherwise.
+internal B32
+lnk_parse_merge_directive(String8 string, LNK_MergeDirective *out)
+{
+  Temp scratch = scratch_begin(0, 0);
+  B32 is_parse_ok = 0;
+
+  String8List list = str8_split_by_string_chars(scratch.arena, string, str8_lit("="), 0);
+  if (list.node_count == 2) {
+    out->src = list.first->string;
+    out->dst = list.last->string;
+    is_parse_ok = 1;
+  }
+
+  scratch_end(scratch);
+  return is_parse_ok;
+}
+
diff --git a/src/linker/lnk_directive.h b/src/linker/lnk_directive.h
new file mode 100644
index 00000000..c6228738
--- /dev/null
+++ b/src/linker/lnk_directive.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+typedef struct LNK_Directive
+{
+  struct LNK_Directive *next;
+  String8 id;
+  String8List value_list;
+} LNK_Directive;
+
+typedef struct LNK_DirectiveList
+{
+  U64 count;
+  LNK_Directive *first;
+  LNK_Directive *last;
+} LNK_DirectiveList;
+
+typedef struct LNK_ExportParse
+{
+  struct LNK_ExportParse *next;
+  String8 name;
+  String8 alias;
+  String8 type;
+} LNK_ExportParse;
+
+typedef struct LNK_ExportParseList
+{
+  U64 count;
+  LNK_ExportParse *first;
+  LNK_ExportParse *last;
+} LNK_ExportParseList;
+
+typedef struct LNK_MergeDirective
+{
+  String8 src;
+  String8 dst;
+} LNK_MergeDirective;
+
+typedef struct LNK_MergeDirectiveNode
+{
+  struct LNK_MergeDirectiveNode *next;
+  LNK_MergeDirective data;
+} LNK_MergeDirectiveNode;
+
+typedef struct LNK_MergeDirectiveList
+{
+  U64 count;
+  LNK_MergeDirectiveNode *first;
+  LNK_MergeDirectiveNode *last;
+} LNK_MergeDirectiveList;
+
+typedef enum
+{
+  LNK_Directive_Null,
+  LNK_Directive_DefaultLib,
+  LNK_Directive_Export,
+  LNK_Directive_Include,
+  LNK_Directive_ManifestDependency,
+  LNK_Directive_Merge,
+  LNK_Directive_Section,
+  LNK_Directive_AlternateName,
+  LNK_Directive_GuardSym,
+  LNK_Directive_DisallowLib,
+  LNK_Directive_FailIfMismatch,
+  LNK_Directive_EditAndContinue,
+  LNK_Directive_ThrowingNew,
+  LNK_Directive_Count
+} LNK_DirectiveKind;
+
+typedef struct LNK_DirectiveInfo
+{
+  LNK_DirectiveList v[LNK_Directive_Count];
+} LNK_DirectiveInfo;
+
+////////////////////////////////
+
+internal void lnk_alt_name_list_concat_in_place(LNK_AltNameList *list, LNK_AltNameList *to_concat);
+
+internal LNK_MergeDirectiveNode * lnk_merge_directive_list_push(Arena *arena, LNK_MergeDirectiveList *list, LNK_MergeDirective data);
+
+////////////////////////////////
+
+internal void lnk_parse_directives(Arena *arena, LNK_DirectiveInfo *directive_info, String8 buffer, String8 obj_path);
+internal String8List lnk_parse_default_lib_directive(Arena *arena, LNK_DirectiveList *dir_list);
+internal B32 lnk_parse_merge_directive(String8 directive, LNK_MergeDirective *out);
+
+
diff --git a/src/linker/lnk_error.c b/src/linker/lnk_error.c
new file mode 100644
index 00000000..1aeb04f0
--- /dev/null
+++ b/src/linker/lnk_error.c
@@ -0,0 +1,135 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+static LNK_ErrorMode g_error_mode_arr[LNK_Error_Count];
+static LNK_ErrorCodeStatus g_error_code_status_arr[LNK_Error_Count];
+static B32 g_log_status[LNK_Log_Count];
+
+// Terminates the process with `code` as the exit status.
+internal void
+lnk_exit(int code)
+{
+  exit(code);
+}
+
+// Assigns the default mode per code range, each range starting at its *First marker and
+// stopping before its *Last marker: Stop for StopFirst, Continue for First, Warn for Warning_First.
+internal void
+lnk_init_error_handler(void)
+{
+  for (int i = LNK_Error_StopFirst; i < LNK_Error_StopLast; ++i) {
+    g_error_mode_arr[i] = LNK_ErrorMode_Stop;
+  }
+  for (int i = LNK_Error_First; i < LNK_Error_Last; ++i) {
+    g_error_mode_arr[i] = LNK_ErrorMode_Continue;
+  }
+  for (int i = LNK_Warning_First; i < LNK_Warning_Last; ++i) {
+    g_error_mode_arr[i] = LNK_ErrorMode_Warn;
+  }
+}
+
+// Returns the label printed in front of a message reported with `mode`; empty for unknown modes.
+internal String8
+lnk_string_from_error_mode(LNK_ErrorMode mode)
+{
+  switch (mode) {
+  case LNK_ErrorMode_Ignore: return str8_lit("Ignore");
+  case LNK_ErrorMode_Continue: return str8_lit("Error");
+  case LNK_ErrorMode_Stop: return str8_lit("Error");
+  case LNK_ErrorMode_Warn: return str8_lit("Warning");
+  }
+  return str8_zero();
+}
+
+// Formats the message for `code` and prints it to stderr, unless the code's mode
+// is Ignore or the code has been suppressed. When the mode is Stop the process
+// exits with `code` as the status after printing.
+internal void
+lnk_errorfv(LNK_ErrorCode code, char *fmt, va_list args)
+{
+  if (g_error_mode_arr[code] == LNK_ErrorMode_Ignore) {
+    return;
+  }
+  if (lnk_is_error_code_ignored(code)) {
+    return;
+  }
+
+  Temp scratch = scratch_begin(0,0);
+  String8 message = push_str8fv(scratch.arena, fmt, args);
+  String8 string = push_str8f(scratch.arena, "%S(%03d): %S\n", lnk_string_from_error_mode(g_error_mode_arr[code]), code, message);
+  fprintf(stderr, "%.*s", str8_varg(string));
+  scratch_end(scratch);
+
+  if (g_error_mode_arr[code] == LNK_ErrorMode_Stop) {
+    lnk_exit(code);
+  }
+}
+
+// Variadic front end for lnk_errorfv.
+internal void
+lnk_error(LNK_ErrorCode code, char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  lnk_errorfv(code, fmt, args);
+  va_end(args);
+}
+
+// Prints the formatted text to stderr as a single tab-indented line.
+internal void
+lnk_supplement_error(char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+
+  Temp scratch = scratch_begin(0,0);
+  String8 string = push_str8fv(scratch.arena, fmt, args);
+
+  fprintf(stderr, "\t");
+  fprintf(stderr, "%.*s", str8_varg(string));
+  fprintf(stderr, "\n");
+
+  va_end(args);
+  scratch_end(scratch);
+}
+
+// Prints every string of `list` through lnk_supplement_error, one line each.
+internal void
+lnk_supplement_error_list(String8List list)
+{
+  for (String8Node *node = list.first; node != 0; node = node->next) {
+    lnk_supplement_error("%.*s", str8_varg(node->string));
+  }
+}
+
+// Marks `code` as ignored, which makes lnk_errorfv drop later reports for it.
+internal void
+lnk_suppress_error(LNK_ErrorCode code)
+{
+  g_error_code_status_arr[code] = LNK_ErrorCodeStatus_Ignore;
+}
+
+// Returns the suppression status recorded for `code`.
+internal LNK_ErrorCodeStatus
+lnk_get_error_code_status(LNK_ErrorCode code)
+{
+  return g_error_code_status_arr[code];
+}
+
+// Prints an internal-error report (code, source location, formatted message) to
+// stderr and then returns to the caller; it does not terminate the process.
+internal void
+lnk_internal_error(LNK_InternalError code, char *file, int line, char *fmt, ...)
+{
+  Temp scratch = scratch_begin(0,0);
+  va_list args;
+  va_start(args, fmt);
+
+  String8 issue = push_str8fv(scratch.arena, fmt, args);
+  fprintf(stderr, "internal error #%03d in %s:%d\n", code, file, line);
+  fprintf(stderr, "\t%.*s\n", str8_varg(issue));
+
+  va_end(args);
+  scratch_end(scratch);
+}
+
diff --git a/src/linker/lnk_error.h b/src/linker/lnk_error.h
new file mode 100644
index 00000000..1bf30beb
--- /dev/null
+++ b/src/linker/lnk_error.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+typedef enum
+{
+  LNK_Error_Null,
+
+  LNK_Error_StopFirst,
+  LNK_Error_Cmdl,
+  LNK_Error_EndprecompNotFound,
+  LNK_Error_EntryPoint,
+  LNK_Error_ExternalTypeServerConflict,
+  LNK_Error_FileNotFound,
+  LNK_Error_IllData,
+  LNK_Error_IllExport,
+  LNK_Error_IncomatibleCmdOptions,
+  LNK_Error_IncompatibleObj,
+  LNK_Error_InvalidPrecompLeafCount,
+  LNK_Error_InvalidStartIndex,
+  LNK_Error_NoAccess,
+  LNK_Error_NoSubsystem,
+  LNK_Error_OutOfExportOrdinals,
+  LNK_Error_PrecompObjNotFound,
+  LNK_Error_PrecompSigMismatch,
+  LNK_Error_Telemetry,
+  LNK_Error_UnsupportedMachine,
+  LNK_Error_Mt,
+  LNK_Error_UnableToSerializeMsf,
+  LNK_Error_StopLast,
+
+  LNK_Error_First,
+  LNK_Error_InvalidPath,
+  LNK_Error_AlreadyDefinedSymbol,
+  LNK_Error_AlternateNameConflict,
+  LNK_Error_CvPrecomp,
+  LNK_Error_MultiplyDefinedSymbol,
+  LNK_Error_Natvis,
+  LNK_Error_TooManyFiles,
+  LNK_Error_UndefinedSymbol,
+  LNK_Error_UnresolvedSymbol,
+  LNK_Error_UnableToOpenTypeServer,
+  LNK_Error_UnexpectedCodePath,
+  LNK_Error_CvIllSymbolData,
+  LNK_Error_IllegalAlternateNameRedifine,
+  LNK_Error_InvalidTypeIndex,
+  LNK_Error_Last,
+
+  LNK_Warning_First,
+  LNK_Warning_AmbiguousMerge,
+  LNK_Warning_AtypicalStartIndex,
+  LNK_Warning_Cmdl,
+  LNK_Warning_Directive,
+  LNK_Warning_DuplicateObjPath,
+  LNK_Warning_ExternalTypeServerAgeMismatch,
+  LNK_Warning_FileNotFound,
+  LNK_Warning_IllData,
+  LNK_Warning_IllExport,
+  LNK_Warning_InvalidNatvisFileExt,
+  LNK_Warning_LargePages,
+  LNK_Warning_LargePagesNotEnabled,
+  LNK_Warning_MissingExternalTypeServer,
+  LNK_Warning_MultipleDebugTAndDebugP,
+  LNK_Warning_MultipleExternalTypeServers,
+  LNK_Warning_MultipleLibMatch,
+  LNK_Warning_MultiplyDefinedImport,
+  LNK_Warning_Natvis,
+  LNK_Warning_PrecompObjSymbolsNotFound,
+  LNK_Warning_SectionFlagsConflict,
+  LNK_Warning_Subsystem,
+  LNK_Warning_UnknownDirective,
+  LNK_Warning_UnresolvedComdat,
+  LNK_Warning_UnusedDelayLoadDll,
+  LNK_Warning_LongSectionName,
+  LNK_Warning_UnknownSwitch,
+  LNK_Warning_Last,
+
+  LNK_Error_Count
+} LNK_ErrorCode;
+
+typedef enum
+{
+  LNK_ErrorMode_Ignore,
+  LNK_ErrorMode_Stop,
+  LNK_ErrorMode_Continue,
+  LNK_ErrorMode_Warn,
+} LNK_ErrorMode;
+
+typedef enum
+{
+  LNK_InternalError_Null,
+  LNK_InternalError_NotImplemented,
+  LNK_InternalError_InvalidPath,
+  LNK_InternalError_IncompleteSwitch,
+  LNK_InternalError_OutOfMemory
+} LNK_InternalError;
+
+typedef enum
+{
+  LNK_ErrorCodeStatus_Active,
+  LNK_ErrorCodeStatus_Ignore,
+} LNK_ErrorCodeStatus;
+
+internal void lnk_init_error_handler(void);
+internal void lnk_errorfv(LNK_ErrorCode code, char *fmt, va_list args);
+internal void lnk_error(LNK_ErrorCode code, char *fmt, ...);
+internal void lnk_supplement_error(char *fmt, ...);
+internal void lnk_supplement_error_list(String8List list);
+internal void lnk_suppress_error(LNK_ErrorCode code);
+
+#define lnk_is_error_code_active(code) (lnk_get_error_code_status(code) == LNK_ErrorCodeStatus_Active)
+#define lnk_is_error_code_ignored(code) (lnk_get_error_code_status(code) == LNK_ErrorCodeStatus_Ignore)
+internal LNK_ErrorCodeStatus lnk_get_error_code_status(LNK_ErrorCode code);
+
+internal void lnk_internal_error(LNK_InternalError code, char *file, int line, char *fmt, ...);
+// The format string travels as the first variadic argument so that a call with
+// only a format (no extra arguments) does not expand to a trailing comma.
+#define lnk_invalid_path(...) lnk_internal_error(LNK_InternalError_InvalidPath, __FILE__, __LINE__, __VA_ARGS__)
+#define lnk_not_implemented(...) lnk_internal_error(LNK_InternalError_NotImplemented, __FILE__, __LINE__, __VA_ARGS__)
+#define lnk_incomplete_switch(...) lnk_internal_error(LNK_InternalError_IncompleteSwitch, __FILE__, __LINE__, __VA_ARGS__)
+
diff --git a/src/linker/lnk_export_table.c b/src/linker/lnk_export_table.c
new file mode 100644
index 00000000..341dc805
--- /dev/null
+++ b/src/linker/lnk_export_table.c
@@ -0,0 +1,302 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+int
+lnk_export_name_compar(const void *a_, const void *b_)
+{
+  const LNK_Export *a = (const LNK_Export *)a_;
+  const LNK_Export *b = (const LNK_Export *)b_;
+  return str8_compar_case_sensetive(&a->name, &b->name);
+}
+
+int
+lnk_export_ordinal_compar(const void *a_, const void *b_)
+{
+  const LNK_Export *a = (const LNK_Export *)a_;
+  const LNK_Export *b = (const LNK_Export *)b_;
+  int cmp = u16_compar(&a->ordinal, &b->ordinal);
+  return cmp;
+}
+
+internal LNK_ExportTable *
+lnk_export_table_alloc(void)
+{
+  ProfBeginFunction();
+  Arena *arena = arena_alloc();
+  LNK_ExportTable *exptab = push_array(arena, LNK_ExportTable, 1);
+  exptab->arena = arena;
+  exptab->voff_size = sizeof(U32);
+  exptab->max_ordinal = max_U16;
+  exptab->is_ordinal_used = push_array(arena, B8, exptab->max_ordinal);
+  ProfEnd();
+  return exptab;
+}
+
+internal void
+lnk_export_table_release(LNK_ExportTable **exptab_ptr)
+{
+  ProfBeginFunction();
+  arena_release((*exptab_ptr)->arena);
+  *exptab_ptr = NULL;
+  ProfEnd();
+}
+
+internal LNK_Export *
+lnk_export_table_search(LNK_ExportTable *exptab, String8 name)
+{
+  for (LNK_Export *exp = exptab->name_export_list.first; exp != NULL; exp = exp->next) {
+    if (str8_match(exp->name, name, 0)) {
+      return exp;
+    }
+  }
+  return NULL;
+}
+
+internal LNK_Export *
+lnk_export_table_push_export(LNK_ExportTable *exptab, LNK_SymbolTable *symtab, LNK_ExportParse *exp_parse)
+{
+  LNK_Export *exp = 0;
+
+  // get export symbol
+  LNK_Symbol 
*symbol = lnk_symbol_table_search(symtab, LNK_SymbolScopeFlag_Main, exp_parse->name);
+  if (symbol == 0) {
+    lnk_error(LNK_Warning_IllExport, "symbol \"%S\" for export doesn't exist", exp_parse->name);
+    goto exit;
+  }
+  symbol = lnk_resolve_symbol(symtab, symbol);
+  if (!LNK_Symbol_IsDefined(symbol->type)) {
+    lnk_error(LNK_Warning_IllExport, "unable to resolve symbol \"%S\" for export", exp_parse->name);
+    goto exit;
+  }
+  LNK_DefinedSymbol *def = &symbol->u.defined;
+
+  // NOTE: It is possible to export a global variable as CODE
+  // with following snippet:
+  // int global_bar = 0;
+  // #pragma comment(linker, "/export:global_bar")
+  // for some reason MSVC and LLD don't check symbol type and default
+  // to CODE instead of DATA. But if you try export global variable with:
+  // #pragma comment(linker, "/export:global_bar,CODE")
+  // MSVC and LLD issue an error. For compatibility sake we do the same thing too.
+  COFF_ImportHeaderType type = coff_import_header_type_from_string(exp_parse->type);
+  switch (type) {
+  case COFF_ImportHeaderType_CODE: {
+    B32 is_export_data = !(def->flags & (LNK_DefinedSymbolFlag_IsFunc|LNK_DefinedSymbolFlag_IsThunk));
+    if (is_export_data) {
+      lnk_error(LNK_Error_IllExport, "export \"%S\" is DATA but has specifier CODE", exp_parse->name);
+    }
+  } break;
+  case COFF_ImportHeaderType_DATA: {
+    B32 is_export_code = !!(def->flags & (LNK_DefinedSymbolFlag_IsFunc|LNK_DefinedSymbolFlag_IsThunk));
+    if (is_export_code) {
+      lnk_error(LNK_Error_IllExport, "export \"%S\" is CODE but has specifier DATA", exp_parse->name);
+    }
+  } break;
+  case COFF_ImportHeaderType_CONST: {
+    lnk_not_implemented("TODO: COFF_ImportHeaderType_CONST");
+  } break;
+  default: {
+    if (exp_parse->type.size) {
+      lnk_error(LNK_Error_IllExport, "invalid type \"%S\" for export \"%S\"", exp_parse->type, exp_parse->name);
+    }
+  } break;
+  }
+
+
+  // error check multiple def (an existing export is returned as-is; a warning is issued only when the types differ)
+  exp = lnk_export_table_search(exptab, exp_parse->alias);
+  if (exp) {
+    if (exp->type != type) {
+      lnk_error(LNK_Warning_IllExport, "trying to rexport symbol \"%S\"", exp_parse->alias);
+    }
+    goto exit;
+  }
+  exp = lnk_export_table_search(exptab, exp_parse->name);
+  if (exp) {
+    if (exp->type != type) {
+      lnk_error(LNK_Warning_IllExport, "multiple export definition for \"%S\"", exp_parse->name);
+    }
+    goto exit;
+  }
+
+
+  // find free ordinal (first slot not yet marked in is_ordinal_used)
+  U16 ordinal;
+  for (ordinal = 0; ordinal < exptab->max_ordinal; ++ordinal) {
+    if (!exptab->is_ordinal_used[ordinal]) {
+      exptab->is_ordinal_used[ordinal] = 1;
+      break;
+    }
+  }
+
+  // ordinal alloc error check (max_ordinal is narrowed to 32 bits to match the %u conversion)
+  if (ordinal >= exptab->max_ordinal) {
+    lnk_error(LNK_Error_OutOfExportOrdinals, "reached export limit of %u, discarding export %S", safe_cast_u32(exptab->max_ordinal), exp_parse->name);
+    goto exit;
+  }
+
+
+  // fill out export (the alias, when present, is used as the exported name)
+  exp = push_array_no_zero(exptab->arena, LNK_Export, 1);
+  exp->next = 0;
+  exp->name = push_str8_copy(exptab->arena, exp_parse->alias.size > 0 ? exp_parse->alias : exp_parse->name);
+  exp->symbol = symbol;
+  exp->id = exptab->name_export_list.count;
+  exp->ordinal = ordinal;
+  exp->type = type;
+  exp->is_private = 0; // exports through directives are public
+
+  // push node
+  SLLQueuePush(exptab->name_export_list.first, exptab->name_export_list.last, exp);
+  exptab->name_export_list.count += 1;
+
+ exit:;
+  return exp;
+}
+
+internal LNK_ExportArray
+lnk_export_array_from_list(Arena *arena, LNK_ExportList list)
+{
+  ProfBeginFunction();
+  LNK_ExportArray arr;
+  arr.count = 0;
+  arr.v = push_array_no_zero(arena, LNK_Export, list.count);
+  for (LNK_Export *exp = list.first; exp != NULL; exp = exp->next) {
+    arr.v[arr.count++] = *exp;
+  }
+  ProfEnd();
+  return arr;
+}
+
+internal void
+lnk_build_edata(LNK_ExportTable *exptab, LNK_SectionTable *st, LNK_SymbolTable *symtab, String8 image_name, COFF_MachineType machine)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  // is export table empty? (nothing is emitted into .edata in that case)
+  if (exptab->name_export_list.count == 0 && exptab->noname_export_list.count == 0) {
+    goto exit;
+  }
+
+  // compute ordinal bounds (lowest and highest ordinal marked as used)
+  U64 ordinal_low;
+  for (ordinal_low = 0; ordinal_low < exptab->max_ordinal; ++ordinal_low) {
+    if (exptab->is_ordinal_used[ordinal_low]) {
+      break;
+    }
+  }
+  U64 ordinal_high;
+  for (ordinal_high = exptab->max_ordinal - 1; ordinal_high > 0; --ordinal_high) {
+    if (exptab->is_ordinal_used[ordinal_high]) {
+      break;
+    }
+  }
+
+  LNK_Section *edata = lnk_section_table_search(st, str8_lit(".edata"));
+
+  // push header (ordinals are stored relative to ordinal_low, the base is written as ordinal_low + 1)
+  PE_ExportTable *header = push_array(edata->arena, PE_ExportTable, 1);
+  header->ordinal_base = safe_cast_u16(ordinal_low + 1);
+  header->export_address_table_count = safe_cast_u32(exptab->name_export_list.count + exptab->noname_export_list.count);
+  header->name_pointer_table_count = safe_cast_u32(exptab->name_export_list.count);
+
+  String8 header_data = str8((U8*)header, sizeof(*header));
+  String8 image_name_cstr = push_cstr(edata->arena, str8_skip_last_slash(image_name));
+
+  // push edata chunks (sort keys "a".."e" presumably fix the order of the tables inside the section -- TODO confirm)
+  LNK_Chunk *header_chunk = lnk_section_push_chunk_data(edata, edata->root, header_data, str8_lit("a"));
+  LNK_Chunk *voff_table_chunk = lnk_section_push_chunk_list(edata, edata->root, str8_lit("b"));
+  LNK_Chunk *name_voff_table_chunk = lnk_section_push_chunk_list(edata, edata->root, str8_lit("c"));
+  LNK_Chunk *ordinal_table_chunk = lnk_section_push_chunk_list(edata, edata->root, str8_lit("d"));
+  LNK_Chunk *string_buffer_chunk = lnk_section_push_chunk_list(edata, edata->root, str8_lit("e"));
+  LNK_Chunk *image_name_chunk = lnk_section_push_chunk_data(edata, string_buffer_chunk, image_name_cstr, str8(0,0));
+  lnk_chunk_set_debugf(edata->arena, header_chunk, "EXPORT_HEADER");
+  lnk_chunk_set_debugf(edata->arena, voff_table_chunk, "EXPORT_ADDRESS_TABLE");
+  lnk_chunk_set_debugf(edata->arena, name_voff_table_chunk, "EXPORT_NAME_VOFF_TABLE");
+  lnk_chunk_set_debugf(edata->arena, ordinal_table_chunk, "EXPORT_ORDINAL_TABLE");
+  lnk_chunk_set_debugf(edata->arena, string_buffer_chunk, "EXPORT_STRING_BUFFER");
+  lnk_chunk_set_debugf(edata->arena, image_name_chunk, "EXPORT_IMAGE_NAME");
+
+  LNK_Symbol *image_name_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit("export_table.name_voff"), LNK_DefinedSymbolVisibility_Internal, 0, image_name_chunk, 0, 0, 0);
+  LNK_Symbol *address_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit("export_table.export_address_table_voff"), LNK_DefinedSymbolVisibility_Internal, 0, voff_table_chunk, 0, 0, 0);
+  LNK_Symbol *name_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit("export_table.name_pointer_table_voff"), LNK_DefinedSymbolVisibility_Internal, 0, name_voff_table_chunk, 0, 0, 0);
+  LNK_Symbol *ordinal_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_lit("export_table.ordinal_table_voff"), LNK_DefinedSymbolVisibility_Internal, 0, ordinal_table_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, image_name_symbol);
+  lnk_symbol_table_push(symtab, address_table_symbol);
+  lnk_symbol_table_push(symtab, name_table_symbol);
+  lnk_symbol_table_push(symtab, ordinal_table_symbol);
+
+  // patch header fields (each header offset gets a 32-bit virtual-offset reloc against the matching table symbol)
+  lnk_section_push_reloc(edata, header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_ExportTable, name_voff), image_name_symbol);
+  lnk_section_push_reloc(edata, header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_ExportTable, export_address_table_voff), address_table_symbol);
+  lnk_section_push_reloc(edata, header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_ExportTable, name_pointer_table_voff), name_table_symbol);
+  lnk_section_push_reloc(edata, header_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_ExportTable, ordinal_table_voff), ordinal_table_symbol);
+
+  // reserve virtual offset chunks (one slot per ordinal in [ordinal_low, ordinal_high])
+  LNK_Chunk **ordinal_voff_map = push_array(scratch.arena, LNK_Chunk *, exptab->max_ordinal);
+  for (U32 i = ordinal_low; i <= ordinal_high; i += 1) {
+    String8 sort_index = str8_from_bits_u32(edata->arena, i);
+    LNK_Chunk *voff_chunk = lnk_section_push_chunk_bss(edata, voff_table_chunk, exptab->voff_size, sort_index);
+    ordinal_voff_map[i] = voff_chunk;
+  }
+
+  B8 *is_ordinal_bound = push_array(scratch.arena, B8, exptab->max_ordinal);
+  LNK_ExportList *exp_list_arr[] = { &exptab->name_export_list, &exptab->noname_export_list };
+  for (LNK_ExportList **list_ptr = &exp_list_arr[0], **list_opl = list_ptr + ArrayCount(exp_list_arr); // walk the pointer array itself
+       list_ptr < list_opl;
+       list_ptr += 1) {
+    for (LNK_Export *exp = (*list_ptr)->first; exp != 0; exp = exp->next) {
+      String8 name_cstr = push_cstr(edata->arena, exp->name);
+
+      // push name string
+      LNK_Chunk *name_chunk = lnk_section_push_chunk_data(edata, string_buffer_chunk, name_cstr, str8(0,0));
+      lnk_chunk_set_debugf(edata->arena, name_chunk, "export: %S", name_cstr);
+
+      // push name symbol
+      String8 name_export_name = push_str8f(symtab->arena, "export.%S", name_cstr);
+      LNK_Symbol *name_symbol = lnk_make_defined_symbol_chunk(symtab->arena, name_export_name, LNK_DefinedSymbolVisibility_Internal, 0, name_chunk, 0, 0, 0);
+      lnk_symbol_table_push(symtab, name_symbol);
+
+      // name voff
+      LNK_Chunk *voff_chunk = lnk_section_push_chunk_bss(edata, name_voff_table_chunk, exptab->voff_size, /* export table must be sorted lexically: */ name_cstr);
+      lnk_chunk_set_debugf(edata->arena, voff_chunk, "voff for export name %S", name_cstr);
+
+      // link reloc with name symbol
+      lnk_section_push_reloc(edata, voff_chunk, LNK_Reloc_VIRT_OFF_32, 0, name_symbol);
+
+      // make ordinal relative
+      U16 *ordinal_ptr = push_array(edata->arena, U16, 1);
+      *ordinal_ptr = (exp->ordinal - ordinal_low);
+
+      // ordinal
+      LNK_Chunk *ordinal_chunk = lnk_section_push_chunk_raw(edata, ordinal_table_chunk, ordinal_ptr, sizeof(*ordinal_ptr), /* ordinal table is parallel to the name table: */ name_cstr);
+      lnk_chunk_set_debugf(edata->arena, ordinal_chunk, "ordinal %u for %S", exp->ordinal, exp->name);
+
+      // (ordinal - ordinal_low) -> export virtual offset (bound once per ordinal)
+      if ( ! is_ordinal_bound[exp->ordinal]) {
+        is_ordinal_bound[exp->ordinal] = 1;
+        LNK_Chunk *export_func_voff_chunk = ordinal_voff_map[exp->ordinal];
+        lnk_section_push_reloc(edata, export_func_voff_chunk, LNK_Reloc_VIRT_OFF_32, 0, exp->symbol);
+      }
+    }
+  }
+
+exit:;
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+internal void
+lnk_collect_exports_from_obj_directives(LNK_ExportTable *exptab, LNK_ObjList obj_list, LNK_SymbolTable *symtab)
+{
+  ProfBeginFunction();
+  for (LNK_ObjNode *obj_node = obj_list.first; obj_node != 0; obj_node = obj_node->next) {
+    for (LNK_ExportParse *exp_parse = obj_node->data.export_parse.first; exp_parse != 0; exp_parse = exp_parse->next) {
+      lnk_export_table_push_export(exptab, symtab, exp_parse);
+    }
+  }
+  ProfEnd();
+}
+
+
diff --git a/src/linker/lnk_export_table.h b/src/linker/lnk_export_table.h
new file mode 100644
index 00000000..2ee77a21
--- /dev/null
+++ b/src/linker/lnk_export_table.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+typedef struct LNK_Export
+{
+  struct LNK_Export *next;
+  String8 name;
+  LNK_Symbol *symbol;
+  U32 id;
+  U16 ordinal;
+  COFF_ImportHeaderType type;
+  B32 is_private;
+} LNK_Export;
+
+typedef struct LNK_ExportList
+{
+  U64 count;
+  LNK_Export *first;
+  LNK_Export *last;
+} LNK_ExportList;
+
+typedef struct LNK_ExportArray
+{
+  U64 count;
+  LNK_Export *v;
+} LNK_ExportArray;
+
+typedef struct LNK_ExportTable
+{
+  Arena *arena;
+  LNK_ExportList name_export_list;
+  LNK_ExportList noname_export_list;
+  U64 voff_size;
+  U64 max_ordinal;
+  B8 *is_ordinal_used;
+} LNK_ExportTable;
+
+internal LNK_ExportTable * lnk_export_table_alloc(void);
+internal void lnk_export_table_release(LNK_ExportTable **exptab_ptr);
+internal LNK_Export * lnk_export_table_search(LNK_ExportTable *exptab, String8 name);
+internal void lnk_collect_exports_from_def_files(LNK_ExportTable *exptab, String8List path_list);
+internal void 
lnk_build_edata(LNK_ExportTable *exptab, LNK_SectionTable *st, LNK_SymbolTable *symtab, String8 image_name, COFF_MachineType machine);
+internal void lnk_collect_exports_from_obj_directives(LNK_ExportTable *exptab, LNK_ObjList obj_list, LNK_SymbolTable *symtab);
+
diff --git a/src/linker/lnk_import_table.c b/src/linker/lnk_import_table.c
new file mode 100644
index 00000000..61535579
--- /dev/null
+++ b/src/linker/lnk_import_table.c
@@ -0,0 +1,802 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+// Creates the import table for regular imports.
+//
+// Pushes the ".idata" section, looks up ".text", and creates one list chunk per
+// table (DLL directory, INT, IAT, ILT in ".idata"; thunk code in ".text"). A
+// zero-filled PE_ImportEntry-sized chunk with sort index "zzzzz" is pushed into the
+// DLL directory list as its terminator. Each list chunk gets an internal symbol that
+// is pushed to `symtab`.
+//
+// The returned table lives in its own arena; free it with lnk_import_table_release.
+// NOTE(review): the result of the ".text" lookup is dereferenced unchecked, so this
+// presumably requires ".text" to exist already - confirm against callers.
+internal LNK_ImportTable *
+lnk_import_table_alloc_regular(LNK_SectionTable *st, LNK_SymbolTable *symtab, COFF_MachineType machine)
+{
+  ProfBeginFunction();
+
+  LNK_Section *data_sect = lnk_section_table_push(st, str8_lit(".idata"), LNK_IDATA_SECTION_FLAGS);
+  LNK_Section *code_sect = lnk_section_table_search(st, str8_lit(".text"));
+
+  LNK_Chunk *dll_table_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *int_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *iat_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *ilt_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *code_chunk = lnk_section_push_chunk_list(code_sect, code_sect->root, str8_zero());
+
+  LNK_Chunk *null_dll_import = lnk_section_push_chunk_data(data_sect, dll_table_chunk, str8(0, sizeof(PE_ImportEntry)), str8_lit("zzzzz"));
+  lnk_chunk_set_debugf(data_sect->arena, null_dll_import, "DLL_DIRECTORY_TERMINATOR");
+
+  LNK_Symbol *dll_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_IMPORT_DLL_TABLE_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, dll_table_chunk, 0, 0, 0);
+  LNK_Symbol *int_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_IMPORT_NAME_TABLE_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, int_chunk, 0, 0, 0);
+  LNK_Symbol *iat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_IMPORT_IAT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, iat_chunk, 0, 0, 0);
+  LNK_Symbol *ilt_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_IMPORT_ILT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, ilt_chunk, 0, 0, 0);
+  LNK_Symbol *code_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_IMPORT_JMP_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, code_chunk, 0, 0, 0);
+
+  lnk_symbol_table_push(symtab, dll_table_symbol);
+  lnk_symbol_table_push(symtab, int_symbol);
+  lnk_symbol_table_push(symtab, iat_symbol);
+  lnk_symbol_table_push(symtab, ilt_symbol);
+  lnk_symbol_table_push(symtab, code_symbol);
+
+  // the table struct itself is allocated from the arena it owns
+  Arena *arena = arena_alloc();
+  LNK_ImportTable *imptab = push_array(arena, LNK_ImportTable, 1);
+  imptab->machine = machine;
+  imptab->arena = arena;
+  imptab->data_sect = data_sect;
+  imptab->code_sect = code_sect;
+  imptab->dll_table_chunk = dll_table_chunk;
+  imptab->int_chunk = int_chunk;
+  imptab->iat_chunk = iat_chunk;
+  imptab->ilt_chunk = ilt_chunk;
+  imptab->code_chunk = code_chunk;
+  imptab->dll_ht = hash_table_init(arena, LNK_IMPORT_DLL_HASH_TABLE_BUCKET_COUNT);
+
+  ProfEnd();
+  return imptab;
+}
+
+// Creates the import table for delay-loaded imports.
+//
+// Same layout as the regular table but in ".didat", with three extra list chunks:
+// the module handle table, the bound IAT (BIAT) and the unload IAT (UIAT). When
+// `is_unloadable` / `is_bindable` is set, a word-sized bss terminator is pushed into
+// the UIAT / BIAT list and the matching LNK_ImportTableFlag_Emit* flag is recorded.
+//
+// The returned table lives in its own arena; free it with lnk_import_table_release.
+// NOTE(review): ".didat" is created with LNK_DEBUG_DIR_SECTION_FLAGS, which looks
+// like a copy/paste from the debug directory code - confirm these are the intended flags.
+internal LNK_ImportTable *
+lnk_import_table_alloc_delayed(LNK_SectionTable *st, LNK_SymbolTable *symtab, COFF_MachineType machine, B32 is_unloadable, B32 is_bindable)
+{
+  ProfBeginFunction();
+
+  LNK_Section *data_sect = lnk_section_table_push(st, str8_lit(".didat"), LNK_DEBUG_DIR_SECTION_FLAGS);
+  LNK_Section *code_sect = lnk_section_table_search(st, str8_lit(".text"));
+
+  LNK_Chunk *dll_table_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *int_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *handle_table_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *iat_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *ilt_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *biat_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *uiat_chunk = lnk_section_push_chunk_list(data_sect, data_sect->root, str8_zero());
+  LNK_Chunk *code_chunk = lnk_section_push_chunk_list(code_sect, code_sect->root, str8_zero());
+
+  LNK_Chunk *null_dll_import = lnk_section_push_chunk_data(data_sect, dll_table_chunk, str8(0, sizeof(PE_DelayedImportEntry)), str8_lit("~0"));
+  lnk_chunk_set_debugf(data_sect->arena, null_dll_import, "DLL_DIRECTORY_TERMINATOR");
+
+  if (is_unloadable) {
+    U64 import_size = coff_word_size_from_machine(machine);
+    LNK_Chunk *null_uiat_chunk = lnk_section_push_chunk_bss(data_sect, uiat_chunk, import_size, str8_lit("~1"));
+    lnk_chunk_set_debugf(data_sect->arena, null_uiat_chunk, "UIAT_TERMINATOR");
+  }
+
+  if (is_bindable) {
+    U64 import_size = coff_word_size_from_machine(machine);
+    LNK_Chunk *null_biat_chunk = lnk_section_push_chunk_bss(data_sect, biat_chunk, import_size, str8_lit("~2"));
+    lnk_chunk_set_debugf(data_sect->arena, null_biat_chunk, "BIAT_TERMINATOR");
+  }
+
+  LNK_Symbol *dll_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_DLL_TABLE_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, dll_table_chunk, 0, 0, 0);
+  LNK_Symbol *int_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_INT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, int_chunk, 0, 0, 0);
+  LNK_Symbol *handle_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_HANDLE_TABLE_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, handle_table_chunk, 0, 0, 0);
+  LNK_Symbol *iat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_IAT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, iat_chunk, 0, 0, 0);
+  LNK_Symbol *ilt_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_ILT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, ilt_chunk, 0, 0, 0);
+  LNK_Symbol *biat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_BIAT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, biat_chunk, 0, 0, 0);
+  LNK_Symbol *uiat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_UIAT_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, uiat_chunk, 0, 0, 0);
+  LNK_Symbol *code_symbol = lnk_make_defined_symbol_chunk(symtab->arena, str8_cstring(LNK_DELAYED_IMPORT_CODE_SYMBOL_NAME), LNK_DefinedSymbolVisibility_Internal, 0, code_chunk, 0, 0, 0);
+
+  lnk_symbol_table_push(symtab, dll_table_symbol);
+  lnk_symbol_table_push(symtab, int_symbol);
+  lnk_symbol_table_push(symtab, handle_table_symbol);
+  lnk_symbol_table_push(symtab, iat_symbol);
+  lnk_symbol_table_push(symtab, ilt_symbol);
+  lnk_symbol_table_push(symtab, biat_symbol);
+  lnk_symbol_table_push(symtab, uiat_symbol);
+  lnk_symbol_table_push(symtab, code_symbol);
+
+  LNK_ImportTableFlags flags = 0;
+  if (is_unloadable) {
+    flags |= LNK_ImportTableFlag_EmitUiat;
+  }
+  if (is_bindable) {
+    flags |= LNK_ImportTableFlag_EmitBiat;
+  }
+
+  // the table struct itself is allocated from the arena it owns
+  Arena *arena = arena_alloc();
+  LNK_ImportTable *imptab = push_array(arena, LNK_ImportTable, 1);
+  imptab->arena = arena;
+  imptab->machine = machine;
+  imptab->data_sect = data_sect;
+  imptab->code_sect = code_sect;
+  imptab->dll_table_chunk = dll_table_chunk;
+  imptab->int_chunk = int_chunk;
+  imptab->handle_table_chunk = handle_table_chunk;
+  imptab->iat_chunk = iat_chunk;
+  imptab->ilt_chunk = ilt_chunk;
+  imptab->biat_chunk = biat_chunk;
+  imptab->uiat_chunk = uiat_chunk;
+  imptab->code_chunk = code_chunk;
+  imptab->flags = flags;
+  // dll_ht maps DLL names to LNK_ImportDLL, so size it with the DLL bucket count,
+  // same as the regular table (per-DLL function tables use the FUNC count)
+  imptab->dll_ht = hash_table_init(arena, LNK_IMPORT_DLL_HASH_TABLE_BUCKET_COUNT);
+
+  ProfEnd();
+  return imptab;
+}
+
+// Frees the arena owned by the import table (the table struct lives in that arena
+// too) and nulls the caller's pointer.
+internal void
+lnk_import_table_release(LNK_ImportTable **imptab_ptr)
+{
+  ProfBeginFunction();
+  arena_release((*imptab_ptr)->arena);
+  *imptab_ptr = 0;
+  ProfEnd();
+}
+
+// Appends `dll` to the table's DLL list and registers it in the name -> DLL hash table.
+internal BucketNode *
+lnk_import_table_push_dll_node(LNK_ImportTable *imptab, LNK_ImportDLL *dll)
+{
+  // update list
+  SLLQueuePush(imptab->first_dll, imptab->last_dll, dll);
+
+  // update name -> dll hash table
+  return hash_table_push_path_raw(imptab->arena, imptab->dll_ht, dll->name, dll);
+}
+
+// Appends `func` to the DLL's function list and registers it in the DLL's
+// name -> function hash table.
+internal BucketNode *
+lnk_import_table_push_func_node(LNK_ImportTable *imptab, LNK_ImportDLL *dll, LNK_ImportFunc *func)
+{
+  // update list
+  SLLQueuePush(dll->first_func, dll->last_func, func);
+
+  // update name -> func hash table
+  return hash_table_push_string_raw(imptab->arena, dll->func_ht, func->name, func);
+}
+
+// Looks up a DLL by name (path lookup); returns 0 when it is not in the table.
+internal LNK_ImportDLL *
+lnk_import_table_search_dll(LNK_ImportTable *imptab, String8 name)
+{
+  KeyValuePair *kv = hash_table_search_path(imptab->dll_ht, name);
+  if (kv) {
+    Assert(kv->value_raw);
+    return kv->value_raw;
+  }
+  return 0;
+}
+
+// Looks up an imported function by name inside `dll`; returns 0 when not found.
+internal LNK_ImportFunc *
+lnk_import_table_search_func(LNK_ImportDLL *dll, String8 name)
+{
+  KeyValuePair *kv = hash_table_search_string(dll->func_ht, name);
+  if (kv) {
+    Assert(kv->value_raw);
+    return kv->value_raw;
+  }
+  return 0;
+}
+
+// Adds a DLL to a regular import table.
+//
+// Creates the per-DLL INT/ILT/IAT list chunks and the thunk code list chunk, the DLL
+// name string chunk, and the PE_ImportEntry directory chunk whose lookup_table_voff,
+// name_voff and import_addr_table_voff fields are filled in through relocations.
+// Zero-filled terminators (sort index "zzzzzz") are pushed into the ILT and IAT lists.
+// Returns the new DLL record, already registered in `imptab`.
+internal LNK_ImportDLL *
+lnk_import_table_push_dll_regular(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, String8 dll_name, COFF_MachineType machine)
+{
+  ProfBeginFunction();
+
+  // TODO: error handle
+  Assert(imptab->machine == machine);
+
+  LNK_Section *data_sect = imptab->data_sect;
+  LNK_Section *code_sect = imptab->code_sect;
+
+  LNK_Chunk *int_table_chunk = lnk_section_push_chunk_list(data_sect, imptab->int_chunk, str8_zero());
+  LNK_Chunk *ilt_table_chunk = lnk_section_push_chunk_list(data_sect, imptab->ilt_chunk, str8_zero());
+  LNK_Chunk *iat_table_chunk = lnk_section_push_chunk_list(data_sect, imptab->iat_chunk, str8_zero());
+  LNK_Chunk *code_table_chunk = lnk_section_push_chunk_list(code_sect, imptab->code_chunk, str8_zero());
+
+  String8 ilt_symbol_name = push_str8f(symtab->arena, "%S.lookup_table_voff", dll_name);
+  LNK_Symbol *ilt_symbol = lnk_make_defined_symbol_chunk(symtab->arena, ilt_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, ilt_table_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, ilt_symbol);
+
+  String8 iat_symbol_name = push_str8f(symtab->arena, "%S.import_addr_table_voff", dll_name);
+  LNK_Symbol *iat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, iat_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, iat_table_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, iat_symbol);
+
+  String8 dll_name_cstr = push_cstr(data_sect->arena, dll_name);
+  LNK_Chunk *dll_name_chunk = lnk_section_push_chunk_data(data_sect, int_table_chunk, dll_name_cstr, str8_zero());
+  lnk_chunk_set_debugf(data_sect->arena, dll_name_chunk, "DLL name chunk (%S)", dll_name);
+
+  String8 dll_name_voff_name = push_str8f(symtab->arena, "%S.name_voff", dll_name);
+  LNK_Symbol *dll_name_voff_symbol = lnk_make_defined_symbol_chunk(symtab->arena, dll_name_voff_name, LNK_DefinedSymbolVisibility_Internal, 0, dll_name_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, dll_name_voff_symbol);
+
+  // chunk for dll directory entry
+  // the payload is chunk data, so it comes from the section arena like every other
+  // chunk payload here; imptab->arena is freed by lnk_import_table_release
+  PE_ImportEntry *dir = push_array(data_sect->arena, PE_ImportEntry, 1);
+  LNK_Chunk *dll_chunk = lnk_section_push_chunk_data(data_sect, imptab->dll_table_chunk, str8_struct(dir), str8_zero());
+  lnk_chunk_set_debugf(data_sect->arena, dll_chunk, "DLL Directory for %S", dll_name);
+
+  // patch dll import fields
+  lnk_section_push_reloc(data_sect, dll_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_ImportEntry, lookup_table_voff), ilt_symbol);
+  lnk_section_push_reloc(data_sect, dll_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_ImportEntry, name_voff), dll_name_voff_symbol);
+  lnk_section_push_reloc(data_sect, dll_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_ImportEntry, import_addr_table_voff), iat_symbol);
+
+  U64 import_size = coff_word_size_from_machine(machine);
+
+  // null entry to terminate import lookup table array
+  LNK_Chunk *null_ilt_chunk = lnk_section_push_chunk_data(data_sect, ilt_table_chunk, str8(0, import_size), str8_lit("zzzzzz"));
+  lnk_chunk_set_debugf(data_sect->arena, null_ilt_chunk, "%S: ILT terminator", dll_name);
+
+  // null entry to terminate import address table array
+  LNK_Chunk *null_iat_chunk = lnk_section_push_chunk_data(data_sect, iat_table_chunk, str8(0, import_size), str8_lit("zzzzzz"));
+  lnk_chunk_set_debugf(data_sect->arena, null_iat_chunk, "%S: IAT terminator", dll_name);
+
+  // push to list
+  LNK_ImportDLL *dll = push_array(imptab->arena, LNK_ImportDLL, 1);
+  dll->name = push_str8_copy(imptab->arena, dll_name);
+  dll->dll_chunk = dll_chunk;
+  dll->int_table_chunk = int_table_chunk;
+  dll->ilt_table_chunk = ilt_table_chunk;
+  dll->iat_table_chunk = iat_table_chunk;
+  dll->code_table_chunk = code_table_chunk;
+  dll->machine = machine;
+  dll->func_ht = hash_table_init(imptab->arena, LNK_IMPORT_FUNC_HASH_TABLE_BUCKET_COUNT);
+
+  lnk_import_table_push_dll_node(imptab, dll);
+
+  ProfEnd();
+  return dll;
+}
+
+// Adds a DLL to a delay-load import table.
+//
+// Emits the PE_DelayedImportEntry descriptor (exported as
+// "__DELAY_IMPORT_DESCRIPTOR_<dll>"), the DLL name string, the module handle slot,
+// per-DLL IAT/ILT lists with terminators, optional BIAT/UIAT lists (depending on the
+// table flags), the per-DLL code list and the tail-merge thunk. Descriptor fields
+// are filled in through relocations against the symbols created here.
+// Only COFF_MachineType_X64 has a tail-merge implementation; other machines go
+// through lnk_not_implemented.
+internal LNK_ImportDLL *
+lnk_import_table_push_dll_delayed(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, String8 dll_name, COFF_MachineType machine)
+{
+  ProfBeginFunction();
+
+  Assert(imptab->machine == machine);
+
+  U64 handle_size = coff_word_size_from_machine(machine);
+  U64 import_size = coff_word_size_from_machine(machine);
+
+  // shortcuts
+  LNK_Section *data_sect = imptab->data_sect;
+  LNK_Section *code_sect = imptab->code_sect;
+
+  // init DLL entry
+  PE_DelayedImportEntry *imp_desc = push_array(data_sect->arena, PE_DelayedImportEntry, 1);
+  imp_desc->attributes = 1;
+  imp_desc->name_voff = 0; // relocated
+  imp_desc->module_handle_voff = 0; // relocated
+  imp_desc->iat_voff = 0; // relocated
+  imp_desc->name_table_voff = 0; // relocated
+  imp_desc->bound_table_voff = 0; // relocated
+  imp_desc->unload_table_voff = 0; // relocated
+  imp_desc->time_stamp = 0;
+
+  // emit entry chunk
+  String8 imp_desc_data = str8_struct(imp_desc);
+  LNK_Chunk *imp_desc_chunk = lnk_section_push_chunk_data(data_sect, imptab->dll_table_chunk, imp_desc_data, str8_zero());
+
+  // emit entry symbol
+  String8 imp_desc_name = push_str8f(symtab->arena, "__DELAY_IMPORT_DESCRIPTOR_%S", dll_name);
+  LNK_Symbol *imp_desc_symbol = lnk_make_defined_symbol_chunk(symtab->arena, imp_desc_name, LNK_DefinedSymbolVisibility_Extern, 0, imp_desc_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, imp_desc_symbol);
+
+  // emit string table chunk
+  String8 int_table_chunk_debug = push_str8f(data_sect->arena, "delayed.%S.int", dll_name);
+  LNK_Chunk *int_table_chunk = lnk_section_push_chunk_list(data_sect, imptab->int_chunk, int_table_chunk_debug);
+
+  String8 int_table_symbol_name = push_str8f(symtab->arena, "delayed.%S.int", dll_name);
+  LNK_Symbol *int_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, int_table_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, int_table_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, int_table_symbol);
+
+  LNK_Chunk *null_string_chunk = lnk_section_push_chunk_list(data_sect, int_table_chunk, str8_lit("zzzzz"));
+  lnk_chunk_set_debugf(data_sect->arena, null_string_chunk, "string table null");
+
+  // emit DLL name chunk
+  String8 name_chunk_data = push_cstr(data_sect->arena, dll_name);
+  LNK_Chunk *name_chunk = lnk_section_push_chunk_data(data_sect, int_table_chunk, name_chunk_data, str8_zero());
+
+  String8 name_symbol_name = push_str8f(symtab->arena, "delayed.%S.name", dll_name);
+  LNK_Symbol *name_symbol = lnk_make_defined_symbol_chunk(symtab->arena, name_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, name_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, name_symbol);
+
+  // patch DLL name voff
+  lnk_section_push_reloc(data_sect, imp_desc_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_DelayedImportEntry, name_voff), name_symbol);
+
+  // emit DLL handle chunk
+  LNK_Chunk *handle_chunk = lnk_section_push_chunk_bss(data_sect, imptab->handle_table_chunk, handle_size, str8_zero());
+
+  String8 handle_name = push_str8f(symtab->arena, "delayed.%S.handle", dll_name);
+  LNK_Symbol *handle_symbol = lnk_make_defined_symbol_chunk(symtab->arena, handle_name, LNK_DefinedSymbolVisibility_Internal, 0, handle_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, handle_symbol);
+
+  // patch DLL handle voff
+  lnk_section_push_reloc(data_sect, imp_desc_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_DelayedImportEntry, module_handle_voff), handle_symbol);
+
+  // emit IAT chunk
+  LNK_Chunk *iat_table_chunk = lnk_section_push_chunk_list(data_sect, imptab->iat_chunk, str8_zero());
+
+  String8 iat_table_name = push_str8f(symtab->arena, "delayed.%S.iat", dll_name);
+  LNK_Symbol *iat_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, iat_table_name, LNK_DefinedSymbolVisibility_Internal, 0, iat_table_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, iat_table_symbol);
+
+  LNK_Chunk *null_iat_chunk = lnk_section_push_chunk_bss(data_sect, iat_table_chunk, import_size, str8_lit("zzzzzz"));
+  lnk_chunk_set_debugf(data_sect->arena, null_iat_chunk, "%S: IAT terminator", dll_name);
+
+  // emit ILT chunk
+  LNK_Chunk *ilt_table_chunk = lnk_section_push_chunk_list(data_sect, imptab->ilt_chunk, str8_zero());
+
+  LNK_Chunk *null_ilt_chunk = lnk_section_push_chunk_bss(data_sect, ilt_table_chunk, import_size, str8_lit("zzzzzz"));
+  lnk_chunk_set_debugf(data_sect->arena, null_ilt_chunk, "%S: ILT terminator", dll_name);
+
+  String8 ilt_table_name = push_str8f(symtab->arena, "delayed.%S.ilt", dll_name);
+  LNK_Symbol *ilt_table_symbol = lnk_make_defined_symbol_chunk(symtab->arena, ilt_table_name, LNK_DefinedSymbolVisibility_Extern, 0, ilt_table_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, ilt_table_symbol);
+
+  // patch import address table voff
+  lnk_section_push_reloc(data_sect, imp_desc_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_DelayedImportEntry, iat_voff), iat_table_symbol);
+
+  // patch string table voff
+  lnk_section_push_reloc(data_sect, imp_desc_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_DelayedImportEntry, name_table_voff), ilt_table_symbol);
+
+  // emit bound table chunk
+  LNK_Chunk *biat_chunk = 0;
+  if (imptab->flags & LNK_ImportTableFlag_EmitBiat) {
+    biat_chunk = lnk_section_push_chunk_list(data_sect, imptab->biat_chunk, str8_zero());
+
+    String8 biat_symbol_name = push_str8f(symtab->arena, "delayed.%S.BIAT", dll_name);
+    LNK_Symbol *biat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, biat_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, biat_chunk, 0, 0, 0);
+    lnk_symbol_table_push(symtab, biat_symbol);
+
+    // patch BIAT field off
+    lnk_section_push_reloc(data_sect, imp_desc_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_DelayedImportEntry, bound_table_voff), biat_symbol);
+  }
+
+  // emit unload table chunk
+  LNK_Chunk *uiat_chunk = 0;
+  if (imptab->flags & LNK_ImportTableFlag_EmitUiat) {
+    uiat_chunk = lnk_section_push_chunk_list(data_sect, imptab->uiat_chunk, str8_zero());
+
+    String8 uiat_symbol_name = push_str8f(symtab->arena, "delayed.%S.UIAT", dll_name);
+    LNK_Symbol *uiat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, uiat_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, uiat_chunk, 0, 0, 0);
+    lnk_symbol_table_push(symtab, uiat_symbol);
+
+    // patch UIAT field voff
+    lnk_section_push_reloc(data_sect, imp_desc_chunk, LNK_Reloc_VIRT_OFF_32, OffsetOf(PE_DelayedImportEntry, unload_table_voff), uiat_symbol);
+  }
+
+  // emit chunk for DLL thunk/load code
+  LNK_Chunk *code_chunk = lnk_section_push_chunk_list(code_sect, imptab->code_chunk, str8_zero());
+  lnk_chunk_set_debugf(code_sect->arena, code_chunk, "code for %S", dll_name);
+
+  // emit tail merge
+  LNK_Chunk *tail_merge_chunk = 0;
+  switch (machine) {
+  case COFF_MachineType_X64: {
+    LNK_Symbol *delay_load_helper_symbol = lnk_make_undefined_symbol(symtab->arena, str8_lit(LNK_DELAY_LOAD_HELPER2_SYMBOL_NAME), LNK_SymbolScopeFlag_Main);
+    tail_merge_chunk = lnk_emit_tail_merge_thunk_x64(code_sect, code_chunk, imp_desc_symbol, delay_load_helper_symbol);
+  } break;
+  default: {
+    lnk_not_implemented("TODO: __tailMerge for %S", coff_string_from_machine_type(machine));
+  } break;
+  }
+
+  // fill out result
+  LNK_ImportDLL *dll = push_array(imptab->arena, LNK_ImportDLL, 1);
+  dll->dll_chunk = imp_desc_chunk;
+  dll->int_table_chunk = int_table_chunk;
+  dll->iat_table_chunk = iat_table_chunk;
+  dll->ilt_table_chunk = ilt_table_chunk;
+  dll->biat_table_chunk = biat_chunk;
+  dll->uiat_table_chunk = uiat_chunk;
+  dll->code_table_chunk = code_chunk;
+  dll->tail_merge_symbol = lnk_emit_tail_merge_symbol(symtab, tail_merge_chunk, dll_name);
+  dll->name = push_str8_copy(imptab->arena, dll_name);
+  dll->machine = machine;
+  dll->func_ht = hash_table_init(imptab->arena, LNK_IMPORT_FUNC_HASH_TABLE_BUCKET_COUNT);
+
+  lnk_import_table_push_dll_node(imptab, dll);
+
+  ProfEnd();
+  return dll;
+}
+
+// Adds one imported function (described by a COFF import header) to a DLL of a
+// regular import table.
+//
+// Emits matching ILT and IAT entries (an ordinal value, or a relocated offset of the
+// hint/name record), defines "__imp_<func>" on the IAT entry and, for code imports
+// on x64, an indirect jump thunk named after the function. Chunks belonging to the
+// import are associated with its IAT entry.
+internal LNK_ImportFunc *
+lnk_import_table_push_func_regular(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, LNK_ImportDLL *dll, COFF_ImportHeader *header)
+{
+  ProfBeginFunction();
+
+  Assert(header->machine == dll->machine); // TODO: error handling
+
+  LNK_Section *data_sect = imptab->data_sect;
+  LNK_Section *code_sect = imptab->code_sect;
+
+  LNK_Chunk *int_table_chunk = dll->int_table_chunk;
+  LNK_Chunk *ilt_table_chunk = dll->ilt_table_chunk;
+  LNK_Chunk *iat_table_chunk = dll->iat_table_chunk;
+  LNK_Chunk *code_table_chunk = dll->code_table_chunk;
+
+  LNK_Chunk *ilt_chunk = g_null_chunk_ptr;
+  LNK_Chunk *iat_chunk = g_null_chunk_ptr;
+
+  U64 import_size = coff_word_size_from_machine(dll->machine);
+
+  // generate sort index (optional)
+  String8 sort_index = str8_from_bits_u32(data_sect->arena, header->hint);
+
+  switch (header->name_type) {
+  case COFF_ImportHeaderNameType_ORDINAL: {
+    String8 ordinal_data = lnk_ordinal_data_from_hint(data_sect->arena, dll->machine, header->hint);
+    ilt_chunk = lnk_section_push_chunk_data(data_sect, ilt_table_chunk, ordinal_data, sort_index);
+    iat_chunk = lnk_section_push_chunk_data(data_sect, iat_table_chunk, ordinal_data, sort_index);
+
+    // associate chunks
+    lnk_section_associate_chunks(data_sect, iat_chunk, ilt_chunk);
+  } break;
+  case COFF_ImportHeaderNameType_NAME: {
+    // put together name look up entry
+    String8 int_data = coff_make_import_lookup(data_sect->arena, header->hint, header->func_name);
+    LNK_Chunk *int_chunk = lnk_section_push_chunk_data(data_sect, int_table_chunk, int_data, str8_zero());
+
+    // create symbol for lookup chunk
+    String8 int_symbol_name = push_str8f(symtab->arena, "regular.%S.%S.name", dll->name, header->func_name);
+    LNK_Symbol *int_symbol = lnk_make_defined_symbol_chunk(symtab->arena, int_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, int_chunk, 0, 0, 0);
+    lnk_symbol_table_push(symtab, int_symbol);
+
+    // in the file IAT mirrors ILT, dynamic linker later overwrites it with imported function addresses.
+    ilt_chunk = lnk_section_push_chunk_bss(data_sect, ilt_table_chunk, import_size, sort_index);
+    iat_chunk = lnk_section_push_chunk_bss(data_sect, iat_table_chunk, import_size, sort_index);
+    lnk_chunk_set_debugf(data_sect->arena, ilt_chunk, "ILT entry for %S.%S", dll->name, header->func_name);
+    lnk_chunk_set_debugf(data_sect->arena, iat_chunk, "IAT entry for %S.%S", dll->name, header->func_name);
+
+    // associate chunks
+    lnk_section_associate_chunks(data_sect, iat_chunk, ilt_chunk);
+    lnk_section_associate_chunks(data_sect, iat_chunk, int_chunk);
+
+    // patch IAT and ILT
+    lnk_section_push_reloc(data_sect, ilt_chunk, LNK_Reloc_VIRT_OFF_32, 0, int_symbol);
+    lnk_section_push_reloc(data_sect, iat_chunk, LNK_Reloc_VIRT_OFF_32, 0, int_symbol);
+  } break;
+  case COFF_ImportHeaderNameType_UNDECORATE: {
+    lnk_not_implemented("TODO: COFF_ImportHeaderNameType_UNDECORATE");
+  } break;
+  case COFF_ImportHeaderNameType_NAME_NOPREFIX: {
+    lnk_not_implemented("TODO: COFF_ImportHeaderNameType_NAME_NOPREFIX");
+  } break;
+  }
+
+  String8 ilt_symbol_name = push_str8f(symtab->arena, "regular.%S.%S.ilt", dll->name, header->func_name);
+  String8 iat_symbol_name = push_str8f(symtab->arena, "__imp_%S", header->func_name);
+  LNK_Symbol *ilt_symbol = lnk_make_defined_symbol_chunk(symtab->arena, ilt_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, ilt_chunk, 0, 0, 0);
+  LNK_Symbol *iat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, iat_symbol_name, LNK_DefinedSymbolVisibility_Extern, 0, iat_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, ilt_symbol);
+  lnk_symbol_table_push(symtab, iat_symbol);
+
+  // generate thunks
+  LNK_Symbol *jmp_thunk_symbol = g_null_symbol_ptr;
+  if (header->type == COFF_ImportHeaderType_CODE) {
+    switch (dll->machine) {
+    case COFF_MachineType_X64: {
+      // generate jump thunk
+      LNK_Chunk *jmp_thunk_chunk = lnk_emit_indirect_jump_thunk_x64(code_sect, code_table_chunk, iat_symbol);
+      lnk_section_associate_chunks(data_sect, iat_chunk, jmp_thunk_chunk);
+
+      // push jump thunk symbol; lnk_emit_jmp_thunk_symbol makes its own copy of the
+      // name in symtab->arena, so no extra copy is needed here
+      jmp_thunk_symbol = lnk_emit_jmp_thunk_symbol(symtab, jmp_thunk_chunk, header->func_name);
+      lnk_symbol_set_debugf(symtab->arena, jmp_thunk_symbol, "x64 jmp thunk %S.%S", dll->name, header->func_name);
+    } break;
+    default: lnk_not_implemented("TODO: support for machine 0x%X", dll->machine); break;
+    }
+  }
+
+  // fill out import
+  LNK_ImportFunc *func = push_array(imptab->arena, LNK_ImportFunc, 1);
+  func->name = push_str8_copy(imptab->arena, header->func_name);
+  func->thunk_symbol_name = push_str8_copy(imptab->arena, jmp_thunk_symbol->name);
+  func->iat_symbol_name = push_str8_copy(imptab->arena, iat_symbol->name);
+
+  lnk_import_table_push_func_node(imptab, dll, func);
+
+  ProfEnd();
+  return func;
+}
+
+// Adds one imported function to a DLL of a delay-load import table.
+//
+// For code imports on x64 this emits a jump thunk (named after the function) and a
+// load thunk ("__imp_load_<func>") that jumps to the DLL's tail-merge thunk. The IAT
+// entry is relocated to the load thunk address; the ILT entry holds the ordinal or
+// the relocated offset of the hint/name record. BIAT/UIAT entries are emitted when
+// the table flags ask for them. Chunks belonging to the import are associated with
+// its IAT entry.
+// NOTE(review): when header->type is not COFF_ImportHeaderType_CODE the IAT/BIAT/UIAT
+// relocations target g_null_symbol_ptr - confirm that is the intended behaviour for
+// data imports.
+internal LNK_ImportFunc *
+lnk_import_table_push_func_delayed(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, LNK_ImportDLL *dll, COFF_ImportHeader *header)
+{
+  ProfBeginFunction();
+
+  Assert(dll->machine == header->machine); // TODO: error handle
+
+  U64 import_size = coff_word_size_from_machine(dll->machine);
+
+  LNK_Section *data_sect = imptab->data_sect;
+  LNK_Section *code_sect = imptab->code_sect;
+
+  LNK_Chunk *int_table_chunk = dll->int_table_chunk;
+  LNK_Chunk *ilt_table_chunk = dll->ilt_table_chunk;
+  LNK_Chunk *iat_table_chunk = dll->iat_table_chunk;
+  LNK_Chunk *biat_table_chunk = dll->biat_table_chunk;
+  LNK_Chunk *uiat_table_chunk = dll->uiat_table_chunk;
+  LNK_Chunk *code_table_chunk = dll->code_table_chunk;
+
+  LNK_Chunk *ilt_chunk = g_null_chunk_ptr;
+  LNK_Chunk *iat_chunk = g_null_chunk_ptr;
+  LNK_Chunk *uiat_chunk = g_null_chunk_ptr;
+  LNK_Chunk *biat_chunk = g_null_chunk_ptr;
+
+  LNK_Symbol *int_symbol = 0;
+
+  // generate sort index (optional)
+  String8 sort_index = str8_from_bits_u32(data_sect->arena, header->hint);
+
+  // generate thunks
+  LNK_Symbol *jmp_thunk_symbol = g_null_symbol_ptr;
+  LNK_Symbol *load_thunk_symbol = g_null_symbol_ptr;
+  LNK_Chunk *jmp_thunk_chunk = 0;
+  LNK_Chunk *load_thunk_chunk = 0;
+  if (header->type == COFF_ImportHeaderType_CODE) {
+    switch (dll->machine) {
+    case COFF_MachineType_X64: {
+      String8 iat_symbol_name = push_str8f(symtab->arena, "__imp_%S", header->func_name);
+      LNK_Symbol *iat_symbol = lnk_make_undefined_symbol(symtab->arena, iat_symbol_name, LNK_SymbolScopeFlag_Main);
+      lnk_symbol_set_debugf(symtab->arena, iat_symbol, "x64 IAT (delayed) %S.%S", dll->name, header->func_name);
+
+      // emit jmp thunk chunk
+      jmp_thunk_chunk = lnk_emit_indirect_jump_thunk_x64(code_sect, code_table_chunk, iat_symbol);
+      jmp_thunk_symbol = lnk_emit_jmp_thunk_symbol(symtab, jmp_thunk_chunk, header->func_name);
+      lnk_symbol_set_debugf(symtab->arena, jmp_thunk_symbol, "x64 jmp thunk (delayed) %S.%S", dll->name, header->func_name);
+
+      // emit load thunk
+      load_thunk_chunk = lnk_emit_load_thunk_x64(code_sect, code_table_chunk, iat_symbol, dll->tail_merge_symbol);
+      load_thunk_symbol = lnk_emit_load_thunk_symbol(symtab, load_thunk_chunk, header->func_name);
+      lnk_symbol_set_debugf(symtab->arena, load_thunk_symbol, "x64 load thunk (delayed) %S.%S", dll->name, header->func_name);
+    } break;
+    default: lnk_not_implemented("TODO: support for machine 0x%X", dll->machine); break;
+    }
+  }
+
+  switch (header->name_type) {
+  case COFF_ImportHeaderNameType_ORDINAL: {
+    String8 ordinal_data = lnk_ordinal_data_from_hint(data_sect->arena, dll->machine, header->hint);
+    Assert(ordinal_data.size == import_size);
+    ilt_chunk = lnk_section_push_chunk_data(data_sect, ilt_table_chunk, ordinal_data, sort_index);
+    iat_chunk = lnk_section_push_chunk_bss(data_sect, iat_table_chunk, import_size, sort_index);
+    lnk_section_push_reloc(data_sect, iat_chunk, LNK_Reloc_ADDR_64, 0, load_thunk_symbol);
+
+    lnk_section_associate_chunks(data_sect, iat_chunk, ilt_chunk);
+    if (imptab->flags & LNK_ImportTableFlag_EmitBiat) {
+      biat_chunk = lnk_section_push_chunk_bss(data_sect, biat_table_chunk, import_size, sort_index);
+      lnk_section_push_reloc(data_sect, biat_chunk, LNK_Reloc_ADDR_64, 0, load_thunk_symbol);
+      lnk_section_associate_chunks(data_sect, iat_chunk, biat_chunk);
+    }
+    if (imptab->flags & LNK_ImportTableFlag_EmitUiat) {
+      uiat_chunk = lnk_section_push_chunk_bss(data_sect, uiat_table_chunk, import_size, sort_index);
+      lnk_section_push_reloc(data_sect, uiat_chunk, LNK_Reloc_ADDR_64, 0, load_thunk_symbol);
+      lnk_section_associate_chunks(data_sect, iat_chunk, uiat_chunk);
+    }
+  } break;
+  case COFF_ImportHeaderNameType_NAME: {
+    // put together name look up entry
+    String8 int_data = coff_make_import_lookup(data_sect->arena, header->hint, header->func_name);
+    LNK_Chunk *int_chunk = lnk_section_push_chunk_data(data_sect, int_table_chunk, int_data, str8_zero());
+
+    // create symbol for lookup chunk
+    String8 int_symbol_name = push_str8f(symtab->arena, "%S.%S.name.delayed", dll->name, header->func_name);
+    int_symbol = lnk_make_defined_symbol_chunk(symtab->arena, int_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, int_chunk, 0, 0, 0);
+    lnk_symbol_table_push(symtab, int_symbol);
+
+    // dynamic linker patches this voff on DLL load event
+    ilt_chunk = lnk_section_push_chunk_bss(data_sect, ilt_table_chunk, import_size, sort_index);
+    lnk_chunk_set_debugf(data_sect->arena, ilt_chunk, "ILT entry (delayed) %S.%S", dll->name, header->func_name);
+
+    // patch-in ILT with import voff
+    lnk_section_push_reloc(data_sect, ilt_chunk, LNK_Reloc_VIRT_OFF_32, 0, int_symbol);
+
+    // in the file IAT mirrors ILT, dynamic linker later overwrites it with imported function addresses.
+    iat_chunk = lnk_section_push_chunk_bss(data_sect, iat_table_chunk, import_size, sort_index);
+    lnk_chunk_set_debugf(data_sect->arena, iat_chunk, "IAT entre (delayed) %S.%S", dll->name, header->func_name);
+
+    // associate chunks
+    lnk_section_associate_chunks(data_sect, iat_chunk, ilt_chunk);
+    lnk_section_associate_chunks(data_sect, iat_chunk, int_chunk);
+
+    // patch-in thunk address
+    lnk_section_push_reloc(data_sect, iat_chunk, LNK_Reloc_ADDR_64, 0, load_thunk_symbol);
+
+    if (imptab->flags & LNK_ImportTableFlag_EmitBiat) {
+      biat_chunk = lnk_section_push_chunk_bss(data_sect, biat_table_chunk, import_size, sort_index);
+      lnk_chunk_set_debugf(data_sect->arena, biat_chunk, "%S.biat.%S (delayed)", dll->name, header->func_name);
+
+      // patch-in thunk address
+      lnk_section_push_reloc(data_sect, biat_chunk, LNK_Reloc_ADDR_64, 0, load_thunk_symbol);
+
+      // tie the BIAT entry to its IAT entry, same as the ordinal path does
+      lnk_section_associate_chunks(data_sect, iat_chunk, biat_chunk);
+    }
+
+    if (imptab->flags & LNK_ImportTableFlag_EmitUiat) {
+      uiat_chunk = lnk_section_push_chunk_bss(data_sect, uiat_table_chunk, import_size, sort_index);
+      lnk_chunk_set_debugf(data_sect->arena, uiat_chunk, "%S.uiat.%S (delayed)", dll->name, header->func_name);
+
+      // patch-in thunk address
+      lnk_section_push_reloc(data_sect, uiat_chunk, LNK_Reloc_ADDR_64, 0, load_thunk_symbol);
+
+      // tie the UIAT entry to its IAT entry, same as the ordinal path does
+      lnk_section_associate_chunks(data_sect, iat_chunk, uiat_chunk);
+    }
+  } break;
+  case COFF_ImportHeaderNameType_UNDECORATE: {
+    lnk_not_implemented("TODO: COFF_ImportHeaderNameType_UNDECORATE");
+  } break;
+  case COFF_ImportHeaderNameType_NAME_NOPREFIX: {
+    lnk_not_implemented("TODO: COFF_ImportHeaderNameType_NAME_NOPREFIX");
+  } break;
+  }
+
+  if (jmp_thunk_chunk) {
+    lnk_section_associate_chunks(data_sect, iat_chunk, jmp_thunk_chunk);
+  }
+  if (load_thunk_chunk) {
+    lnk_section_associate_chunks(data_sect, iat_chunk, load_thunk_chunk);
+  }
+
+  String8 iat_symbol_name = push_str8f(symtab->arena, "__imp_%S", header->func_name);
+  LNK_Symbol *iat_symbol = lnk_make_defined_symbol_chunk(symtab->arena, iat_symbol_name, LNK_DefinedSymbolVisibility_Extern, 0, iat_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, iat_symbol);
+
+  String8 ilt_symbol_name = push_str8f(symtab->arena, "%S.%S.ilt.delayed", dll->name, header->func_name);
+  LNK_Symbol *ilt_symbol = lnk_make_defined_symbol_chunk(symtab->arena, ilt_symbol_name, LNK_DefinedSymbolVisibility_Internal, 0, ilt_chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, ilt_symbol);
+
+  // fill out import
+  LNK_ImportFunc *func = push_array(imptab->arena, LNK_ImportFunc, 1);
+  func->name = push_str8_copy(imptab->arena, header->func_name);
+  func->thunk_symbol_name = push_str8_copy(imptab->arena, jmp_thunk_symbol->name);
+  func->iat_symbol_name = push_str8_copy(imptab->arena, iat_symbol->name);
+
+  lnk_import_table_push_func_node(imptab, dll, func);
+
+  ProfEnd();
+  return func;
+}
+
+// Builds the import-by-ordinal table entry for `hint`, allocated from `arena`:
+// a U64 for x64, a U32 for x86. Other machines go through lnk_not_implemented and
+// yield an empty string.
+internal String8
+lnk_ordinal_data_from_hint(Arena *arena, COFF_MachineType machine, U16 hint)
+{
+  String8 ordinal_data = str8_zero();
+  switch (machine) {
+  case COFF_MachineType_X64: {
+    U64 *ordinal = push_array(arena, U64, 1);
+    *ordinal = coff_make_ordinal_64(hint);
+    ordinal_data = str8_struct(ordinal);
+  } break;
+  case COFF_MachineType_X86: {
+    U32 *ordinal = push_array(arena, U32, 1);
+    *ordinal = coff_make_ordinal_32(hint);
+    ordinal_data = str8_struct(ordinal);
+  } break;
+  default: lnk_not_implemented("TODO: support for machine 0x%x", machine);
+  }
+  return ordinal_data;
+}
+
+// Emits a 6-byte x64 `jmp [rip+disp32]` thunk under `parent` and adds a REL32
+// relocation so the displacement resolves to `addr_ptr` (the import's IAT slot).
+internal LNK_Chunk *
+lnk_emit_indirect_jump_thunk_x64(LNK_Section *sect, LNK_Chunk *parent, LNK_Symbol *addr_ptr)
+{
+  ProfBeginFunction();
+
+  static U8 thunk[] = { 0xFF, 0x25, 0x00, 0x00, 0x00, 0x00 }; // jmp [__imp_]
+
+  // emit chunk
+  String8 jmp_data = push_str8_copy(sect->arena, str8_array_fixed(thunk));
+  LNK_Chunk *jmp_chunk = lnk_section_push_chunk_data(sect, parent, jmp_data, str8_zero());
+
+  // patch thunk with imports address (displacement follows the 2 opcode bytes)
+  static const U64 JMP_OPERAND_OFFSET = 2;
+  lnk_section_push_reloc(sect, jmp_chunk, LNK_Reloc_REL32, JMP_OPERAND_OFFSET, addr_ptr);
+
+  ProfEnd();
+  return jmp_chunk;
+}
+
+// Emits the x64 delay-load thunk for one import: loads the address of the import's
+// IAT slot into rax and jumps to the DLL's tail-merge thunk. Both operands are
+// filled in through REL32 relocations.
+internal LNK_Chunk *
+lnk_emit_load_thunk_x64(LNK_Section *sect, LNK_Chunk *parent, LNK_Symbol *imp_addr_ptr, LNK_Symbol *tail_merge)
+{
+  ProfBeginFunction();
+
+  static U8 load_thunk[] = {
+    0x48, 0x8D, 0x05, 0x00, 0x00, 0x00, 0x00, // lea rax, [__imp_]
+    0xE9, 0x00, 0x00, 0x00, 0x00 // jmp __tailMerge_
+  };
+
+  // emit load thunk chunk
+  String8 load_thunk_data = push_str8_copy(sect->arena, str8_array_fixed(load_thunk));
+  LNK_Chunk *load_thunk_chunk = lnk_section_push_chunk_data(sect, parent, load_thunk_data, str8_zero());
+
+  // patch lea with IAT entry (3 opcode bytes, then disp32)
+  static const U64 LEA_OPERAND_OFFSET = 3;
+  lnk_section_push_reloc(sect, load_thunk_chunk, LNK_Reloc_REL32, LEA_OPERAND_OFFSET, imp_addr_ptr);
+
+  // patch jmp __tailMerge_ (7-byte lea + 1 opcode byte)
+  static const U64 JMP_OPERAND_OFFSET = 8;
+  lnk_section_push_reloc(sect, load_thunk_chunk, LNK_Reloc_REL32, JMP_OPERAND_OFFSET, tail_merge);
+
+  ProfEnd();
+  return load_thunk_chunk;
+}
+
+// Emits the per-DLL x64 tail-merge thunk. It saves rcx/rdx/r8/r9 and xmm0-xmm3,
+// calls the delay-load helper with rcx = address of the DLL's delay import
+// descriptor and rdx = rax (the IAT slot address set up by the load thunk),
+// restores the registers and jumps to the address returned in rax.
+internal LNK_Chunk *
+lnk_emit_tail_merge_thunk_x64(LNK_Section *sect, LNK_Chunk *parent, LNK_Symbol *dll_import_descriptor, LNK_Symbol *delay_load_helper)
+{
+  ProfBeginFunction();
+
+  static U8 tail_merge[] = {
+    0x48, 0x89, 0x4C, 0x24, 0x08, // mov qword ptr [rsp+8],rcx
+    0x48, 0x89, 0x54, 0x24, 0x10, // mov qword ptr [rsp+10h],rdx
+    0x4C, 0x89, 0x44, 0x24, 0x18, // mov qword ptr [rsp+18h],r8
+    0x4C, 0x89, 0x4C, 0x24, 0x20, // mov qword ptr [rsp+20h],r9
+    0x48, 0x83, 0xEC, 0x68, // sub rsp,68h
+    0x66, 0x0F, 0x7F, 0x44, 0x24, 0x20, // movdqa xmmword ptr [rsp+20h],xmm0
+    0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x30, // movdqa xmmword ptr [rsp+30h],xmm1
+    0x66, 0x0F, 0x7F, 0x54, 0x24, 0x40, // movdqa xmmword ptr [rsp+40h],xmm2
+    0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x50, // movdqa xmmword ptr [rsp+50h],xmm3
+    0x48, 0x8B, 0xD0, // mov rdx,rax
+    0x48, 0x8D, 0x0D, 0x00, 0x00, 0x00, 0x00, // lea rcx,[__DELAY_IMPORT_DESCRIPTOR_]
+    0xE8, 0x00, 0x00, 0x00, 0x00, // call __delayLoadHelper2
+    0x66, 0x0F, 0x6F, 0x44, 0x24, 0x20, // movdqa xmm0,xmmword ptr [rsp+20h]
+    0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x30, // movdqa xmm1,xmmword ptr [rsp+30h]
+    0x66, 0x0F, 0x6F, 0x54, 0x24, 0x40, // movdqa xmm2,xmmword ptr [rsp+40h]
+    0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x50, // movdqa xmm3,xmmword ptr [rsp+50h]
+    0x48, 0x8B, 0x4C, 0x24, 0x70, // mov rcx,qword ptr [rsp+70h]
+    0x48, 0x8B, 0x54, 0x24, 0x78, // mov rdx,qword ptr [rsp+78h]
+    0x4C, 0x8B, 0x84, 0x24, 0x80, 0x00, 0x00, 0x00, // mov r8,qword ptr [rsp+80h]
+    0x4C, 0x8B, 0x8C, 0x24, 0x88, 0x00, 0x00, 0x00, // mov r9,qword ptr [rsp+88h]
+    0x48, 0x83, 0xC4, 0x68, // add rsp,68h
+    0xFF, 0xE0, // jmp rax
+  };
+
+  // emit tail merge chunk
+  String8 tail_merge_data = push_str8_copy(sect->arena, str8_array_fixed(tail_merge));
+  LNK_Chunk *tail_merge_chunk = lnk_section_push_chunk_data(sect, parent, tail_merge_data, str8_zero());
+
+  // patch lea __DELAY_IMPORT_DESCRIPTOR_ (lea starts at byte 51, disp32 follows its 3 opcode bytes)
+  static const U64 LEA_OPERAND_OFFSET = 54;
+  lnk_section_push_reloc(sect, tail_merge_chunk, LNK_Reloc_REL32, LEA_OPERAND_OFFSET, dll_import_descriptor);
+
+  // patch call __delayLoadHelper2 (call opcode at byte 58, rel32 right after)
+  static const U64 CALL_OPERAND_OFFSET = 59;
+  lnk_section_push_reloc(sect, tail_merge_chunk, LNK_Reloc_REL32, CALL_OPERAND_OFFSET, delay_load_helper);
+
+  ProfEnd();
+  return tail_merge_chunk;
+}
+
+// Defines and pushes the extern function/thunk symbol "__imp_load_<func_name>" on `chunk`.
+internal LNK_Symbol *
+lnk_emit_load_thunk_symbol(LNK_SymbolTable *symtab, LNK_Chunk *chunk, String8 func_name)
+{
+  ProfBeginFunction();
+  // emit load thunk symbol
+  String8 load_thunk_name = push_str8f(symtab->arena, "__imp_load_%S", func_name);
+  LNK_Symbol *load_thunk_symbol = lnk_make_defined_symbol_chunk(symtab->arena, load_thunk_name, LNK_DefinedSymbolVisibility_Extern, LNK_DefinedSymbolFlag_IsFunc|LNK_DefinedSymbolFlag_IsThunk, chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, load_thunk_symbol);
+  ProfEnd();
+  return load_thunk_symbol;
+}
+
+// Defines and pushes the extern function/thunk symbol named `func_name` on `chunk`.
+// The name is copied into symtab->arena, so callers need not copy it first.
+internal LNK_Symbol *
+lnk_emit_jmp_thunk_symbol(LNK_SymbolTable *symtab, LNK_Chunk *chunk, String8 func_name)
+{
+  ProfBeginFunction();
+  String8 jmp_thunk_name = push_str8f(symtab->arena, "%S", func_name);
+  LNK_Symbol *jmp_thunk_symbol = lnk_make_defined_symbol_chunk(symtab->arena, jmp_thunk_name, LNK_DefinedSymbolVisibility_Extern, LNK_DefinedSymbolFlag_IsFunc|LNK_DefinedSymbolFlag_IsThunk, chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, jmp_thunk_symbol);
+  ProfEnd();
+  return jmp_thunk_symbol;
+}
+
+// Defines and pushes the extern function/thunk symbol "__tailMerge_<func_name>" on
+// `chunk`; the delayed DLL path passes the DLL name as `func_name`.
+internal LNK_Symbol *
+lnk_emit_tail_merge_symbol(LNK_SymbolTable *symtab, LNK_Chunk *chunk, String8 func_name)
+{
+  ProfBeginFunction();
+  String8 tail_merge_name = push_str8f(symtab->arena, "__tailMerge_%S", func_name);
+  LNK_Symbol *tail_merge_symbol = lnk_make_defined_symbol_chunk(symtab->arena, tail_merge_name, LNK_DefinedSymbolVisibility_Extern, LNK_DefinedSymbolFlag_IsFunc|LNK_DefinedSymbolFlag_IsThunk, chunk, 0, 0, 0);
+  lnk_symbol_table_push(symtab, tail_merge_symbol);
+  ProfEnd();
+  return tail_merge_symbol;
+}
+
diff --git a/src/linker/lnk_import_table.h b/src/linker/lnk_import_table.h
new file mode 100644
index 00000000..ea28a0ac
--- /dev/null
+++ b/src/linker/lnk_import_table.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+#define LNK_IMPORT_DLL_HASH_TABLE_BUCKET_COUNT 512
+#define LNK_IMPORT_FUNC_HASH_TABLE_BUCKET_COUNT 2048
+
+typedef struct LNK_ImportFunc
+{
+  struct LNK_ImportFunc *next;
+  String8 name;
+  String8 thunk_symbol_name;
+  String8 iat_symbol_name;
+} LNK_ImportFunc;
+
+typedef struct LNK_ImportDLL
+{
+  struct LNK_ImportDLL *next;
+  struct LNK_ImportFunc *first_func;
+  struct LNK_ImportFunc *last_func;
+  LNK_Chunk *dll_chunk;
+  LNK_Chunk *int_table_chunk;
+  LNK_Chunk *ilt_table_chunk;
+  LNK_Chunk *iat_table_chunk;
+  LNK_Chunk *biat_table_chunk;
+  LNK_Chunk *uiat_table_chunk;
+  LNK_Chunk *code_table_chunk;
+  LNK_Symbol *tail_merge_symbol;
+  String8 name;
+  COFF_MachineType machine;
+  HashTable *func_ht;
+} LNK_ImportDLL;
+
+enum
+{
+  LNK_ImportTableFlag_EmitBiat = (1 << 0),
+  LNK_ImportTableFlag_EmitUiat = (1 << 1),
+};
+typedef U32 LNK_ImportTableFlags;
+
+typedef struct 
LNK_ImportTable +{ + Arena *arena; + COFF_MachineType machine; + LNK_ImportDLL *first_dll; + LNK_ImportDLL *last_dll; + LNK_Section *data_sect; + LNK_Section *code_sect; + LNK_Chunk *dll_table_chunk; + LNK_Chunk *int_chunk; + LNK_Chunk *handle_table_chunk; + LNK_Chunk *iat_chunk; + LNK_Chunk *ilt_chunk; + LNK_Chunk *biat_chunk; + LNK_Chunk *uiat_chunk; + LNK_Chunk *code_chunk; + LNK_ImportTableFlags flags; + HashTable *dll_ht; +} LNK_ImportTable; + +internal LNK_ImportTable * lnk_import_table_alloc_regular(LNK_SectionTable *st, LNK_SymbolTable *symtab, COFF_MachineType machine); +internal LNK_ImportTable * lnk_import_table_alloc_delayed(LNK_SectionTable *st, LNK_SymbolTable *symtab, COFF_MachineType machine, B32 is_unloadable, B32 is_bindable); +internal void lnk_import_table_release(LNK_ImportTable **imptab); +internal LNK_ImportDLL * lnk_import_table_push_dll_regular(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, String8 dll_name, COFF_MachineType machine); +internal LNK_ImportDLL * lnk_import_table_push_dll_delayed(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, String8 dll_name, COFF_MachineType machine); +internal LNK_ImportFunc * lnk_import_table_push_func_regular(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, LNK_ImportDLL *dll, COFF_ImportHeader *header); +internal LNK_ImportFunc * lnk_import_table_push_func_delayed(LNK_ImportTable *imptab, LNK_SymbolTable *symtab, LNK_ImportDLL *dll, COFF_ImportHeader *header); +internal LNK_ImportDLL * lnk_import_table_search_dll(LNK_ImportTable *imptab, String8 name); +internal LNK_ImportFunc * lnk_import_table_search_func(LNK_ImportDLL *dll, String8 name); + +internal String8 lnk_ordinal_data_from_hint(Arena *arena, COFF_MachineType machine, U16 hint); + +internal LNK_Chunk * lnk_emit_indirect_jump_thunk_x64(LNK_Section *sect, LNK_Chunk *parent, LNK_Symbol *addr_ptr); +internal LNK_Chunk * lnk_emit_load_thunk_x64(LNK_Section *sect, LNK_Chunk *parent, LNK_Symbol *imp_addr_ptr, LNK_Symbol *tail_merge); +internal 
LNK_Chunk * lnk_emit_tail_merge_thunk_x64(LNK_Section *sect, LNK_Chunk *parent, LNK_Symbol *dll_import_descriptor, LNK_Symbol *delay_load_helper); + +internal LNK_Symbol * lnk_emit_load_thunk_symbol(LNK_SymbolTable *symtab, LNK_Chunk *chunk, String8 func_name); +internal LNK_Symbol * lnk_emit_jmp_thunk_symbol(LNK_SymbolTable *symtab, LNK_Chunk *chunk, String8 func_name); +internal LNK_Symbol * lnk_emit_tail_merge_symbol(LNK_SymbolTable *symtab, LNK_Chunk *chunk, String8 func_name); + diff --git a/src/linker/lnk_lib.c b/src/linker/lnk_lib.c new file mode 100644 index 00000000..6221bf5a --- /dev/null +++ b/src/linker/lnk_lib.c @@ -0,0 +1,941 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal LNK_LibNode * +lnk_lib_list_reserve(Arena *arena, LNK_LibList *list, U64 count) +{ + LNK_LibNode *arr = 0; + if (count) { + arr = push_array(arena, LNK_LibNode, count); + for (LNK_LibNode *ptr = arr, *opl = arr + count; ptr < opl; ++ptr) { + SLLQueuePush(list->first, list->last, ptr); + } + list->count += count; + } + return arr; +} + +internal LNK_LibMemberNode * +lnk_lib_member_list_push(Arena *arena, LNK_LibMemberList *list, LNK_LibMember member) +{ + LNK_LibMemberNode *n = push_array_no_zero(arena, LNK_LibMemberNode, 1); + n->next = 0; + n->data = member; + + SLLQueuePush(list->first, list->last, n); + ++list->count; + + return n; +} + +internal LNK_LibMember * +lnk_lib_member_array_from_list(Arena *arena, LNK_LibMemberList list) +{ + ProfBeginFunction(); + LNK_LibMember *arr = push_array_no_zero(arena, LNK_LibMember, list.count); + LNK_LibMember *ptr = arr; + for (LNK_LibMemberNode *i = list.first; i != 0; i = i->next, ptr += 1) { + ptr->name = push_str8_copy(arena, i->data.name); + ptr->data = push_str8_copy(arena, i->data.data); + } + ProfEnd(); + return arr; +} + +internal LNK_LibSymbolNode * +lnk_lib_symbol_list_push(Arena *arena, LNK_LibSymbolList *list, LNK_LibSymbol symbol) +{ + 
LNK_LibSymbolNode *n = push_array_no_zero(arena, LNK_LibSymbolNode, 1); + n->next = 0; + n->data = symbol; + + SLLQueuePush(list->first, list->last, n); + ++list->count; + + return n; +} + +internal LNK_LibSymbol * +lnk_lib_symbol_array_from_list(Arena *arena, LNK_LibSymbolList list) +{ + LNK_LibSymbol *arr = push_array_no_zero(arena, LNK_LibSymbol, list.count + 2); + LNK_LibSymbol *ptr = arr + 1; + for (LNK_LibSymbolNode *i = list.first; i != 0; i = i->next, ptr += 1) { + ptr->name = push_str8_copy(arena, i->data.name); + ptr->member_idx = i->data.member_idx; + } + MemoryZeroStruct(&arr[0]); + MemoryZeroStruct(&arr[list.count+1]); + return arr; +} + +int +lnk_lib_symbol_name_compar(const void *raw_a, const void *raw_b) +{ + const LNK_Symbol *sa = (const LNK_Symbol *)raw_a; + const LNK_Symbol *sb = (const LNK_Symbol *)raw_b; + return str8_compar_case_sensetive(&sa->name, &sb->name); +} + +int +lnk_lib_symbol_name_compar_is_before(void *raw_a, void *raw_b) +{ + int compar = lnk_lib_symbol_name_compar(raw_a, raw_b); + int is_before = compar < 0; + return is_before; +} + +internal void +lnk_lib_symbol_array_sort(LNK_LibSymbol *arr, U64 count) +{ + Assert(count >= 2); + radsort(arr + 1, count - 2, lnk_lib_symbol_name_compar_is_before); +} + +//////////////////////////////// + +internal LNK_Lib +lnk_lib_from_data(Arena *arena, String8 data, String8 path) +{ + U64 symbol_count; + String8 string_table; + U32 *member_off_arr; + + // is data archive? 
+ COFF_ArchiveType type = coff_archive_type_from_data(data); + if (type == COFF_Archive_Null) { + lnk_not_implemented("TODO: data is not archive"); + } + + COFF_ArchiveParse parse = coff_archive_parse_from_data(arena, data); + + // try to init library from optional second member + if (parse.second_member.member_count) { + COFF_ArchiveSecondMember second_member = parse.second_member; + Assert(second_member.symbol_count == second_member.symbol_indices.size / sizeof(U16)); + Assert(second_member.member_count == second_member.member_offsets.size / sizeof(U32)); + + symbol_count = second_member.symbol_count; + string_table = second_member.string_table; + member_off_arr = push_array_no_zero(arena, U32, symbol_count); + + // decompress member offsets + U32 *comp_off_arr = (U32*)second_member.member_offsets.str; + U16 *off_number_arr = (U16*)second_member.symbol_indices.str; + for (U64 symbol_idx = 0; symbol_idx < symbol_count; symbol_idx += 1) { + U16 off_number = off_number_arr[symbol_idx]; + if (0 < off_number && off_number <= second_member.member_count) { + member_off_arr[symbol_idx] = comp_off_arr[off_number - 1]; + } else { + // TODO: log bad offset + member_off_arr[symbol_idx] = max_U32; + } + } + } + // first member is deprecated however tools emit it for compatibility reasons + // and lld-link with /DLL emits only first member + else if (parse.first_member.symbol_count) { + COFF_ArchiveFirstMember first_member = parse.first_member; + Assert(first_member.symbol_count == first_member.member_offsets.size / sizeof(U32)); + + symbol_count = first_member.symbol_count; + string_table = first_member.string_table; + member_off_arr = (U32*)first_member.member_offsets.str; + + // convert big endian offsets + for (U32 offset_idx = 0; offset_idx < symbol_count; offset_idx += 1) { + member_off_arr[offset_idx] = BE_U32(member_off_arr[offset_idx]); + } + } else { + symbol_count = 0; + string_table = str8(0,0); + member_off_arr = 0; + } + + // parse string table + String8List 
symbol_name_list = str8_split_by_string_chars(arena, string_table, str8_lit("\0"), StringSplitFlag_KeepEmpties); + Assert(symbol_name_list.node_count >= symbol_count); + symbol_count = Min(symbol_count, symbol_name_list.node_count); + + // init lib + LNK_Lib lib = {0}; + lib.path = push_str8_copy(arena, path); + lib.data = data; + lib.type = type; + lib.symbol_count = symbol_count; + lib.member_off_arr = member_off_arr; + lib.symbol_name_list = symbol_name_list; + lib.long_names = parse.long_names; + + return lib; +} + +THREAD_POOL_TASK_FUNC(lnk_lib_initer) +{ + LNK_LibIniter *task = raw_task; + LNK_LibNode *lib_node = task->node_arr + task_id; + LNK_Lib *lib = &lib_node->data; + String8 data = task->data_arr[task_id]; + String8 path = task->path_arr[task_id]; + + *lib = lnk_lib_from_data(arena, data, path); +} + +internal LNK_LibNodeArray +lnk_lib_list_push_parallel(TP_Context *tp, TP_Arena *arena, LNK_LibList *list, String8Array data_arr, String8Array path_arr) +{ + Assert(data_arr.count == path_arr.count); + U64 lib_count = data_arr.count; + + LNK_LibIniter lib_initer = {0}; + lib_initer.node_arr = lnk_lib_list_reserve(arena->v[0], list, lib_count); + lib_initer.data_arr = data_arr.v; + lib_initer.path_arr = path_arr.v; + tp_for_parallel(tp, arena, lib_count, lnk_lib_initer, &lib_initer); + + LNK_LibNodeArray arr; + arr.count = lib_count; + arr.v = lib_initer.node_arr; + return arr; +} + +#if 0 +internal LNK_LibNode * +lnk_lib_list_push(Arena *arena, LNK_LibList *list, String8 data, String8 path) +{ + ProfBeginFunction(); + + TP_Arena pool_arena = {0}; + pool_arena.count = 1; + pool_arena.v = &arena; + + String8Array data_arr = {0}; + data_arr.count = 1; + data_arr.v = &data; + + String8Array path_arr = {0}; + path_arr.count = 1; + path_arr.v = &path; + + LNK_LibNodeArray node_arr = lnk_lib_list_push_parallel(&pool_arena, list, data_arr, path_arr); + + ProfEnd(); + return node_arr.v; +} +#endif + +//////////////////////////////// + +internal LNK_LibWriter * 
+lnk_lib_writer_alloc(void) +{ + Arena *arena = arena_alloc(); + LNK_LibWriter *writer = push_array(arena, LNK_LibWriter, 1); + writer->arena = arena; + return writer; +} + +internal void +lnk_lib_writer_release(LNK_LibWriter **writer_ptr) +{ + arena_release((*writer_ptr)->arena); + *writer_ptr = 0; +} + +internal void +lnk_lib_writer_push_obj(LNK_LibWriter *writer, LNK_Obj *obj) +{ + ProfBeginFunction(); + + U64 member_idx = writer->member_list.count; + + // push obj member + LNK_LibMember member = {0}; + member.name = obj->path; + member.data = obj->data; + lnk_lib_member_list_push(writer->arena, &writer->member_list, member); + + // push external symbols + for (LNK_SymbolNode *node = obj->symbol_list.first; node != 0; node = node->next) { + LNK_Symbol *symbol = node->data; + B32 is_extern = symbol->type == LNK_Symbol_DefinedExtern; + if (is_extern) { + LNK_LibSymbol lib_symbol = {0}; + lib_symbol.name = symbol->name; + lib_symbol.member_idx = member_idx; + lnk_lib_symbol_list_push(writer->arena, &writer->symbol_list, lib_symbol); + } + } + + ProfEnd(); +} + +internal void +lnk_lib_writer_push_export(LNK_LibWriter *writer, COFF_MachineType machine, U64 time_stamp, String8 dll_name, LNK_Export *exp) +{ + ProfBeginFunction(); + + U64 member_idx = writer->member_list.count; + + // make import header + String8 import_data; + if (exp->name.size) { + U16 hint = safe_cast_u16(exp->id); + import_data = coff_make_import_header_by_name(writer->arena, dll_name, machine, time_stamp, exp->name, hint, exp->type); + } else { + U16 ordinal = safe_cast_u16(exp->id); + import_data = coff_make_import_header_by_ordinal(writer->arena, dll_name, machine, time_stamp, ordinal, exp->type); + } + + // push import member + LNK_LibMember member = {0}; + member.name = dll_name; + member.data = import_data; + lnk_lib_member_list_push(writer->arena, &writer->member_list, member); + + switch (exp->type) { + case COFF_ImportHeaderType_CODE: { + LNK_LibSymbol def_symbol = {0}; + def_symbol.name = 
push_str8_copy(writer->arena, exp->name); + def_symbol.member_idx = member_idx; + lnk_lib_symbol_list_push(writer->arena, &writer->symbol_list, def_symbol); + } + case COFF_ImportHeaderType_DATA: { + LNK_LibSymbol imp_symbol = {0}; + imp_symbol.name = push_str8f(writer->arena, "__imp_%S", exp->name); + imp_symbol.member_idx = member_idx; + lnk_lib_symbol_list_push(writer->arena, &writer->symbol_list, imp_symbol); + } break; + case COFF_ImportHeaderType_CONST: { + NotImplemented; + } break; + default: InvalidPath; + } + + ProfEnd(); +} + +internal LNK_LibBuild +lnk_lib_build_from_writer(Arena *arena, LNK_LibWriter *writer) +{ + ProfBeginFunction(); + + LNK_LibBuild lib = {0}; + lib.symbol_count = writer->symbol_list.count + 2; + lib.member_count = writer->member_list.count; + lib.symbol_array = lnk_lib_symbol_array_from_list(arena, writer->symbol_list); + lib.member_array = lnk_lib_member_array_from_list(arena, writer->member_list); + lnk_lib_symbol_array_sort(lib.symbol_array, lib.symbol_count); + + ProfEnd(); + return lib; +} + +internal String8List +lnk_coff_archive_from_lib_build(Arena *arena, LNK_LibBuild *lib, B32 emit_second_member, COFF_TimeStamp time_stamp, U32 mode) +{ + ProfBeginFunction(); + + Temp scratch = scratch_begin(&arena, 1); + + U64 symbol_count = lib->symbol_count - 2; + LNK_LibSymbol *symbol_arr = lib->symbol_array + 1; + + HashTable *name_ht = hash_table_init(scratch.arena, 1024); + U64 *member_off_arr = push_array_no_zero(scratch.arena, U64, lib->member_count); + String8List long_names_list = {0}; + String8List member_data_list = {0}; + + for (U64 member_idx = 0; member_idx < lib->member_count; member_idx += 1) { + LNK_LibMember *member = &lib->member_array[member_idx]; + + // make member name + String8 name; + U64 name_with_slash_size = member->name.size + 1; + if (name_with_slash_size > COFF_ARCHIVE_MAX_SHORT_NAME_SIZE) { + // have we seen this member name before? 
+ KeyValuePair *is_present = hash_table_search_string(name_ht, member->name); + if (is_present) { + name = is_present->value_string; + } else { + name = push_str8f(scratch.arena, "/%u", long_names_list.total_size); + str8_list_pushf(scratch.arena, &long_names_list, "%S/\n", member->name); + hash_table_push_string_string(scratch.arena, name_ht, member->name, name); + } + } else { + name = push_str8f(scratch.arena, "%S/", member->name); + } + + member_off_arr[member_idx] = member_data_list.total_size; + + String8 member_data = member->data; + String8 member_header = lnk_build_lib_member_header(arena, name, time_stamp, 0, 0, mode, member_data.size); + + str8_list_push(arena, &member_data_list, member_header); + str8_list_push(arena, &member_data_list, member_data); + str8_list_push_pad(arena, &member_data_list, member_data_list.total_size, COFF_ARCHIVE_ALIGN); + } + + // long names member + if (long_names_list.total_size > 0) { + String8 header = lnk_build_lib_member_header(arena, str8_lit("//"), time_stamp, 0, 0, mode, long_names_list.total_size); + String8 data = str8_list_join(arena, &long_names_list, 0); + U64 member_offset = member_data_list.total_size + data.size + header.size; + str8_list_push_pad_front(arena, &member_data_list, member_offset, COFF_ARCHIVE_ALIGN); + str8_list_push_front(arena, &member_data_list, data); + str8_list_push_front(arena, &member_data_list, header); + } + + // compute size for symbol string table + U32 name_buffer_size = 0; + for (LNK_LibSymbol *ptr = &symbol_arr[0], *opl = ptr + symbol_count; ptr < opl; ptr += 1) { + name_buffer_size += ptr->name.size; + name_buffer_size += 1; // null + } + + // write symbol name buffer + U8 *name_buffer = push_array_no_zero(scratch.arena, U8, name_buffer_size); + { + U64 name_cursor = 0; + for (LNK_LibSymbol *ptr = &symbol_arr[0], *opl = ptr + symbol_count; ptr < opl; ptr += 1) { + MemoryCopy(name_buffer + name_cursor, ptr->name.str, ptr->name.size); + name_buffer[name_cursor + ptr->name.size] = 
'\0'; + name_cursor += ptr->name.size + 1; + } + } + + U64 member_base_off; + { + U64 sizeof_first_header = COFF_ARCHIVE_MEMBER_HEADER_SIZE + sizeof(U32) + sizeof(U32) * symbol_count + name_buffer_size; + U64 sizeof_second_header = COFF_ARCHIVE_MEMBER_HEADER_SIZE + sizeof(U32) + sizeof(U32) * lib->member_count + sizeof(U32) + sizeof(U16) * symbol_count + name_buffer_size; + U64 sizeof_long_names = COFF_ARCHIVE_MEMBER_HEADER_SIZE + long_names_list.total_size; + + sizeof_first_header = AlignPow2(sizeof_first_header, COFF_ARCHIVE_ALIGN); + sizeof_second_header = AlignPow2(sizeof_second_header, COFF_ARCHIVE_ALIGN); + sizeof_long_names = AlignPow2(sizeof_long_names, COFF_ARCHIVE_ALIGN); + + member_base_off = sizeof(g_coff_archive_sig); + member_base_off += sizeof_first_header; + if (emit_second_member) { + member_base_off += sizeof_second_header; + } + if (long_names_list.total_size > 0) { + member_base_off += sizeof_long_names; + } + } + + // second linker member + if (emit_second_member) { + U32 member_count32 = safe_cast_u32(lib->member_count); + U32 symbol_count32 = safe_cast_u32(symbol_count); + + U32 *member_off32_arr = push_array_no_zero(scratch.arena, U32, lib->member_count); + U16 *member_idx16_arr = push_array_no_zero(scratch.arena, U16, symbol_count); + + // write member offset array + for (U64 member_idx = 0; member_idx < lib->member_count; member_idx += 1) { + U64 member_off = member_base_off + member_off_arr[member_idx]; + U32 member_off32 = safe_cast_u32(member_off); + member_off32_arr[member_idx] = member_off32; + } + + // write member offset indices for each symbol + for (U64 symbol_idx = 0; symbol_idx < symbol_count; symbol_idx += 1) { + // member offset indices are 1-based + U64 member_idx = symbol_arr[symbol_idx].member_idx + 1; + U16 member_idx16 = safe_cast_u16(member_idx); + member_idx16_arr[symbol_idx] = member_idx16; + } + + // layout second member data + String8List second_member_data_list = {0}; + str8_list_push(scratch.arena, 
&second_member_data_list, str8_struct(&member_count32)); + str8_list_push(scratch.arena, &second_member_data_list, str8_array(member_off32_arr, lib->member_count)); + str8_list_push(scratch.arena, &second_member_data_list, str8_struct(&symbol_count32)); + str8_list_push(scratch.arena, &second_member_data_list, str8_array(member_idx16_arr, symbol_count)); + str8_list_push(scratch.arena, &second_member_data_list, str8(name_buffer, name_buffer_size)); + + String8 member_data = str8_list_join(arena, &second_member_data_list, 0); + String8 member_header = lnk_build_lib_member_header(arena, str8_lit("/"), time_stamp, 0, 0, mode, member_data.size); + + U64 member_offset = member_data_list.total_size + member_data.size + member_header.size; + str8_list_push_pad_front(arena, &member_data_list, member_offset, COFF_ARCHIVE_ALIGN); + str8_list_push_front(arena, &member_data_list, member_data); + str8_list_push_front(arena, &member_data_list, member_header); + } + + // first linker member (obsolete, but kept for compatability reasons) + { + U32 symbol_count_be = BE_U32(symbol_count); + U32 *member_off32_arr = push_array_no_zero(scratch.arena, U32, symbol_count); + + for (U64 symbol_idx = 0; symbol_idx < symbol_count; symbol_idx += 1) { + LNK_LibSymbol *symbol = &symbol_arr[symbol_idx]; + + // write big endian member offset + U64 member_off = member_base_off + member_off_arr[symbol->member_idx]; + U32 member_off32 = BE_U32(safe_cast_u32(member_off)); + member_off32_arr[symbol_idx] = member_off32; + } + + // layout first member data + String8List first_member_data_list = {0}; + str8_list_push(scratch.arena, &first_member_data_list, str8_struct(&symbol_count_be)); + str8_list_push(scratch.arena, &first_member_data_list, str8_array(member_off32_arr, symbol_count)); + str8_list_push(scratch.arena, &first_member_data_list, str8(name_buffer, name_buffer_size)); + + String8 member_data = str8_list_join(arena, &first_member_data_list, 0); + String8 member_header = 
lnk_build_lib_member_header(arena, str8_lit("/"), time_stamp, 0, 0, mode, member_data.size); + + U64 member_offset = sizeof(g_coff_archive_sig) + member_header.size + member_data.size; + str8_list_push_pad_front(arena, &member_data_list, member_offset, COFF_ARCHIVE_ALIGN); + str8_list_push_front(arena, &member_data_list, member_data); + str8_list_push_front(arena, &member_data_list, member_header); + } + + // archive signature + str8_list_push_front(arena, &member_data_list, str8_struct(&g_coff_archive_sig)); + + scratch_end(scratch); + ProfEnd(); + return member_data_list; +} + +//////////////////////////////// + +internal LNK_LibBuild +lnk_build_lib(Arena *arena, COFF_MachineType machine, COFF_TimeStamp time_stamp, String8 dll_name, LNK_ObjList obj_list, LNK_ExportTable *exptab) +{ + ProfBeginFunction(); + LNK_LibWriter *writer = lnk_lib_writer_alloc(); + for (LNK_ObjNode *obj_node = obj_list.first; obj_node != 0; obj_node = obj_node->next) { + lnk_lib_writer_push_obj(writer, &obj_node->data); + } + for (LNK_Export *exp = exptab->name_export_list.first; exp != 0; exp = exp->next) { + lnk_lib_writer_push_export(writer, machine, time_stamp, dll_name, exp); + } + LNK_LibBuild lib = lnk_lib_build_from_writer(arena, writer); + lnk_lib_writer_release(&writer); + ProfEnd(); + return lib; +} + +internal String8List +lnk_build_import_entry_obj(Arena *arena, String8 dll_name, COFF_MachineType machine) +{ + ProfBeginFunction(); + + Assert(machine == COFF_MachineType_X64); + Assert(str8_match(str8_lit("dll"), str8_skip_last_dot(dll_name), StringMatchFlag_CaseInsensitive|StringMatchFlag_RightSideSloppy)); + + String8List list = {0}; + + COFF_Header *coff_header = push_array(arena, COFF_Header, 1); + coff_header->machine = machine; + str8_list_push(arena, &list, str8_struct(coff_header)); + + coff_header->section_count = 2; + COFF_SectionHeader *coff_sect_header_array = push_array(arena, COFF_SectionHeader, coff_header->section_count); + str8_list_push(arena, &list, 
str8_array(coff_sect_header_array, coff_header->section_count)); + + PE_ImportEntry *import_entry = push_array(arena, PE_ImportEntry, 1); + U64 import_entry_off = list.total_size; + str8_list_push(arena, &list, str8_struct(import_entry)); + + String8 dll_name_cstr = push_cstr(arena, dll_name); + U64 dll_name_off = list.total_size; + str8_list_push(arena, &list, dll_name_cstr); + + U32 import_entry_reloc_count = 3; + COFF_Reloc *import_entry_reloc_array = push_array(arena, COFF_Reloc, import_entry_reloc_count); + U64 import_entry_reloc_off = list.total_size; + str8_list_push(arena, &list, str8_array(import_entry_reloc_array, import_entry_reloc_count)); + + coff_header->symbol_count = 7; + COFF_Symbol16 *symbol_array = push_array(arena, COFF_Symbol16, coff_header->symbol_count); + coff_header->symbol_table_foff = safe_cast_u32(list.total_size); + str8_list_push(arena, &list, str8_array(symbol_array, coff_header->symbol_count)); + + U64 string_table_base = list.total_size; + U32 *string_table_size_ptr = push_array(arena, U32, 1); + str8_list_push(arena, &list, str8_struct(string_table_size_ptr)); + + // PE_ImportEntry + { + COFF_SectionHeader *sect = &coff_sect_header_array[0]; + sect->name[0] = '.'; + sect->name[1] = 'i'; + sect->name[2] = 'd'; + sect->name[3] = 'a'; + sect->name[4] = 't'; + sect->name[5] = 'a'; + sect->name[6] = '$'; + sect->name[7] = '2'; + sect->fsize = sizeof(PE_ImportEntry); + sect->foff = import_entry_off; + sect->reloc_count = import_entry_reloc_count; + sect->relocs_foff = import_entry_reloc_off; + sect->flags = COFF_SectionFlag_CNT_INITIALIZED_DATA|(COFF_SectionAlign_4BYTES << COFF_SectionFlag_ALIGN_SHIFT)|COFF_SectionFlag_MEM_READ|COFF_SectionFlag_MEM_WRITE; + } + { + COFF_Reloc *lookup_table_voff_reloc = &import_entry_reloc_array[0]; + lookup_table_voff_reloc->apply_off = OffsetOf(PE_ImportEntry, lookup_table_voff); + lookup_table_voff_reloc->isymbol = 3; + lookup_table_voff_reloc->type = COFF_RelocTypeX64_ADDR32NB; + + COFF_Reloc 
*name_voff_reloc = &import_entry_reloc_array[1]; + name_voff_reloc->apply_off = OffsetOf(PE_ImportEntry, name_voff); + name_voff_reloc->isymbol = 2; + name_voff_reloc->type = COFF_RelocTypeX64_ADDR32NB; + + COFF_Reloc *import_addr_table_voff = &import_entry_reloc_array[2]; + import_addr_table_voff->apply_off = OffsetOf(PE_ImportEntry, import_addr_table_voff); + import_addr_table_voff->isymbol = 4; + import_addr_table_voff->type = COFF_RelocTypeX64_ADDR32NB; + } + + // dll name + { + COFF_SectionHeader *sect = &coff_sect_header_array[1]; + sect->name[0] = '.'; + sect->name[1] = 'i'; + sect->name[2] = 'd'; + sect->name[3] = 'a'; + sect->name[4] = 't'; + sect->name[5] = 'a'; + sect->name[6] = '$'; + sect->name[7] = '6'; + sect->fsize = dll_name_cstr.size; + sect->foff = dll_name_off; + sect->flags = COFF_SectionFlag_CNT_INITIALIZED_DATA|(COFF_SectionAlign_2BYTES << COFF_SectionFlag_ALIGN_SHIFT)|COFF_SectionFlag_MEM_READ|COFF_SectionFlag_MEM_WRITE; + } + + // import descriptor + { + String8 dll_name_no_ext = str8_substr(dll_name, r1u64(0, dll_name.size - 4)); + String8 symbol_name = push_str8f(arena, "__IMPORT_DESCRIPTOR_%S", dll_name_no_ext); + + U64 symbol_name_off = (list.total_size - string_table_base); + str8_list_push(arena, &list, push_cstr(arena, symbol_name)); + + COFF_Symbol16 *symbol = &symbol_array[0]; + symbol->name.long_name.zeroes = 0; + symbol->name.long_name.string_table_offset = symbol_name_off; + symbol->section_number = 1; + symbol->storage_class = COFF_SymStorageClass_EXTERNAL; + } + + // .idata$2 + { + COFF_Symbol16 *symbol = &symbol_array[1]; + symbol->name.short_name[0] = '.'; + symbol->name.short_name[1] = 'i'; + symbol->name.short_name[2] = 'd'; + symbol->name.short_name[3] = 'a'; + symbol->name.short_name[4] = 't'; + symbol->name.short_name[5] = 'a'; + symbol->name.short_name[6] = '$'; + symbol->name.short_name[7] = '2'; + symbol->section_number = 1; + symbol->storage_class = COFF_SymStorageClass_SECTION; + } + + // .idata$6 + { + 
COFF_Symbol16 *symbol = &symbol_array[2]; + symbol->name.short_name[0] = '.'; + symbol->name.short_name[1] = 'i'; + symbol->name.short_name[2] = 'd'; + symbol->name.short_name[3] = 'a'; + symbol->name.short_name[4] = 't'; + symbol->name.short_name[5] = 'a'; + symbol->name.short_name[6] = '$'; + symbol->name.short_name[7] = '6'; + symbol->section_number = 2; + symbol->storage_class = COFF_SymStorageClass_STATIC; + } + + // .idata$4 + { + COFF_Symbol16 *symbol = &symbol_array[3]; + symbol->name.short_name[0] = '.'; + symbol->name.short_name[1] = 'i'; + symbol->name.short_name[2] = 'd'; + symbol->name.short_name[3] = 'a'; + symbol->name.short_name[4] = 't'; + symbol->name.short_name[5] = 'a'; + symbol->name.short_name[6] = '$'; + symbol->name.short_name[7] = '4'; + symbol->section_number = COFF_SYMBOL_UNDEFINED_SECTION; + symbol->storage_class = COFF_SymStorageClass_SECTION; + } + + // .idata$5 + { + COFF_Symbol16 *symbol = &symbol_array[4]; + symbol->name.short_name[0] = '.'; + symbol->name.short_name[1] = 'i'; + symbol->name.short_name[2] = 'd'; + symbol->name.short_name[3] = 'a'; + symbol->name.short_name[4] = 't'; + symbol->name.short_name[5] = 'a'; + symbol->name.short_name[6] = '$'; + symbol->name.short_name[7] = '5'; + symbol->section_number = COFF_SYMBOL_UNDEFINED_SECTION; + symbol->storage_class = COFF_SymStorageClass_SECTION; + } + + // __NULL_IMPORT_DESCRIPTOR + { + U64 symbol_name_off = (list.total_size - string_table_base); + str8_list_push(arena, &list, push_cstr(arena, str8_lit("__NULL_IMPORT_DESCRIPTOR"))); + + COFF_Symbol16 *symbol = &symbol_array[5]; + symbol->name.long_name.zeroes = 0; + symbol->name.long_name.string_table_offset = symbol_name_off; + symbol->section_number = COFF_SYMBOL_UNDEFINED_SECTION; + symbol->storage_class = COFF_SymStorageClass_EXTERNAL; + } + + // NULL_THUNK_DATA + { + String8 dll_name_no_ext = str8_substr(dll_name, r1u64(0, dll_name.size - 4)); + String8 symbol_name = push_str8f(arena, "\x7f%S_NULL_THUNK_DATA", 
// Builds a minimal COFF object that defines __NULL_IMPORT_DESCRIPTOR.
//
// The object is returned as a list of pieces allocated on `arena`, pushed in
// file order: COFF header, one section header, 20 zero bytes of section data,
// a one-entry symbol table, and a string table (4-byte size field followed by
// the symbol name). Offsets stored in the headers are taken from
// list.total_size at the moment each piece is pushed, so the push order below
// defines the file layout and must not be rearranged.
//
// NOTE(review): header fields are written after the header has already been
// pushed, so this relies on str8_list_push referencing the arena memory rather
// than copying it -- confirm.
internal String8List
lnk_build_null_import_descriptor_obj(Arena *arena, COFF_MachineType machine)
{
  ProfBeginFunction();

  String8List list = {0};

  // COFF header
  COFF_Header *coff_header = push_array(arena, COFF_Header, 1);
  coff_header->machine = machine;
  str8_list_push(arena, &list, str8_struct(coff_header));

  // section table (single section)
  coff_header->section_count = 1;
  COFF_SectionHeader *coff_sect_header_array = push_array(arena, COFF_SectionHeader, coff_header->section_count);
  str8_list_push(arena, &list, str8_array(coff_sect_header_array, coff_header->section_count));

  // section contents: zero-filled by push_array
  // NOTE(review): 20 is presumably the size of one import directory entry (the all-zero terminator) -- confirm
  U64 null_import_data_size = 20;
  U8 *null_import_data = push_array(arena, U8, null_import_data_size);
  U64 null_import_data_off = list.total_size;
  str8_list_push(arena, &list, str8(null_import_data, null_import_data_size));

  // symbol table (single symbol), file offset recorded in the header
  coff_header->symbol_count = 1;
  COFF_Symbol16 *symbol_array = push_array(arena, COFF_Symbol16, coff_header->symbol_count);
  coff_header->symbol_table_foff = safe_cast_u32(list.total_size);
  str8_list_push(arena, &list, str8_array(symbol_array, coff_header->symbol_count));

  // string table starts with its own size field, patched at the end
  U64 string_table_base = list.total_size;
  U32 *string_table_size_ptr = push_array(arena, U32, 1);
  str8_list_push(arena, &list, str8_struct(string_table_size_ptr));

  // .idata$3 section header pointing at the zeroed data above
  {
    COFF_SectionHeader *sect = &coff_sect_header_array[0];
    sect->name[0] = '.';
    sect->name[1] = 'i';
    sect->name[2] = 'd';
    sect->name[3] = 'a';
    sect->name[4] = 't';
    sect->name[5] = 'a';
    sect->name[6] = '$';
    sect->name[7] = '3';
    sect->fsize = null_import_data_size;
    sect->foff = null_import_data_off;
    sect->flags = COFF_SectionFlag_CNT_INITIALIZED_DATA|(COFF_SectionAlign_4BYTES << COFF_SectionFlag_ALIGN_SHIFT)|COFF_SectionFlag_MEM_READ|COFF_SectionFlag_MEM_WRITE;
  }

  // external symbol __NULL_IMPORT_DESCRIPTOR defined in section 1, named via the string table
  {
    // offset is relative to the start of the string table (size field included)
    U64 symbol_name_off = list.total_size - string_table_base;
    str8_list_push(arena, &list, push_cstr(arena, str8_lit("__NULL_IMPORT_DESCRIPTOR")));

    COFF_Symbol16 *symbol = &symbol_array[0];
    symbol->name.long_name.zeroes = 0;
    symbol->name.long_name.string_table_offset = symbol_name_off;
    symbol->section_number = 1;
    symbol->storage_class = COFF_SymStorageClass_EXTERNAL;
  }

  // update string table size
  *string_table_size_ptr = (list.total_size - string_table_base);

  ProfEnd();
  return list;
}
coff_header->symbol_count = 1; + COFF_Symbol16 *symbol_array = push_array(arena, COFF_Symbol16, coff_header->symbol_count); + coff_header->symbol_table_foff = safe_cast_u32(list.total_size); + str8_list_push(arena, &list, str8_array(symbol_array, coff_header->symbol_count)); + + U64 string_table_base = list.total_size; + U32 *string_table_size_ptr = push_array(arena, U32, 1); + str8_list_push(arena, &list, str8_struct(string_table_size_ptr)); + + { + COFF_SectionHeader *sect = &coff_sect_header_array[0]; + sect->name[0] = '.'; + sect->name[1] = 'i'; + sect->name[2] = 'd'; + sect->name[3] = 'a'; + sect->name[4] = 't'; + sect->name[5] = 'a'; + sect->name[6] = '$'; + sect->name[7] = '5'; + sect->fsize = lookup_entry_data_size; + sect->foff = lookup_entry_data_off; + sect->flags = COFF_SectionFlag_CNT_INITIALIZED_DATA|(COFF_SectionAlign_8BYTES << COFF_SectionFlag_ALIGN_SHIFT)|COFF_SectionFlag_MEM_READ|COFF_SectionFlag_MEM_WRITE; + } + + { + COFF_SectionHeader *sect = &coff_sect_header_array[1]; + sect->name[0] = '.'; + sect->name[1] = 'i'; + sect->name[2] = 'd'; + sect->name[3] = 'a'; + sect->name[4] = 't'; + sect->name[5] = 'a'; + sect->name[6] = '$'; + sect->name[7] = '4'; + sect->fsize = null_thunk_data_size; + sect->foff = null_thunk_data_off; + sect->flags = COFF_SectionFlag_CNT_INITIALIZED_DATA|(COFF_SectionAlign_8BYTES << COFF_SectionFlag_ALIGN_SHIFT)|COFF_SectionFlag_MEM_READ|COFF_SectionFlag_MEM_WRITE; + } + + { + String8 dll_name_no_ext = str8_substr(dll_name, r1u64(0, dll_name.size - 4)); + String8 symbol_name = push_str8f(arena, "\x7f%S_NULL_THUNK_DATA", dll_name_no_ext); + + U64 symbol_name_off = list.total_size - string_table_base; + str8_list_push(arena, &list, push_cstr(arena, symbol_name)); + + COFF_Symbol16 *symbol = &symbol_array[0]; + symbol->name.long_name.zeroes = 0; + symbol->name.long_name.string_table_offset = symbol_name_off; + symbol->section_number = 1; + symbol->storage_class = COFF_SymStorageClass_EXTERNAL; + } + + // update string table 
size + *string_table_size_ptr = (list.total_size - string_table_base); + + ProfEnd(); + return list; +} + +internal String8 +lnk_build_lib_member_header(Arena *arena, String8 name, COFF_TimeStamp time_stamp, U16 user_id, U16 group_id, U16 mode, U32 size) +{ + ProfBeginFunction(); + + Assert(name.size < 16); + Assert(user_id < 10000); + Assert(group_id < 10000); + Assert(mode < 10000); + Assert(size < 1000000000); + + //U64 sizeof_member_header = /* name */ 16 + /* time */ 12 + /* user_id */ 6 + /* group id */ 6 + /* mode */ 8 + /* size */ 10; + + Temp scratch = scratch_begin(&arena, 1); + String8List list = {0}; + str8_list_pushf(scratch.arena, &list, "%-16.*s", str8_varg(name)); + str8_list_pushf(scratch.arena, &list, "%-12u", time_stamp); + str8_list_pushf(scratch.arena, &list, "%-6u", user_id); + str8_list_pushf(scratch.arena, &list, "%-6u", group_id); + str8_list_pushf(scratch.arena, &list, "%-8u", mode); + str8_list_pushf(scratch.arena, &list, "%-10u", size); + str8_list_pushf(scratch.arena, &list, "`\n"); + String8 result = str8_list_join(arena, &list, 0); + + Assert(result.size == COFF_ARCHIVE_MEMBER_HEADER_SIZE); + scratch_end(scratch); + ProfEnd(); + return result; +} + +internal String8List +lnk_build_import_lib(TP_Context *tp, TP_Arena *arena, COFF_MachineType machine, COFF_TimeStamp time_stamp, String8 lib_name, String8 dll_name, LNK_ExportTable *exptab) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + dll_name = str8_skip_last_slash(dll_name); + + // These objects appear in first three members of any lib that linker produces with /dll. + // Objects are used by MSVC linker to build import table. 
+ String8List import_obj_array[3]; + import_obj_array[0] = lnk_build_import_entry_obj(scratch.arena, dll_name, machine); + import_obj_array[1] = lnk_build_null_import_descriptor_obj(scratch.arena, machine); + import_obj_array[2] = lnk_build_null_thunk_data_obj(scratch.arena, dll_name, machine); + + // build input list + LNK_InputObjList input_obj_list = {0}; + for (U64 i = 0; i < ArrayCount(import_obj_array); ++i) { + LNK_InputObj *input = lnk_input_obj_list_push(scratch.arena, &input_obj_list); + input->data = str8_list_join(scratch.arena, &import_obj_array[i], 0); + input->path = dll_name; + input->lib_path = lib_name; + } + + LNK_InputObj **inputs = lnk_array_from_input_obj_list(scratch.arena, input_obj_list); + LNK_SectionTable *st = lnk_section_table_alloc(0,0,0); + LNK_ObjList obj_list = {0}; + lnk_obj_list_push_parallel(tp, arena, &obj_list, st, input_obj_list.count, inputs); + + LNK_LibBuild import_lib = lnk_build_lib(scratch.arena, machine, time_stamp, dll_name, obj_list, exptab); + B32 emit_second_member = 0; // MSVC linker refuses to link with lib that has the second member. 
+ String8List coff_archive_data = lnk_coff_archive_from_lib_build(arena->v[0], &import_lib, emit_second_member, time_stamp, /* -rw-r--r-- */ 644); + + // cleanup memory + lnk_section_table_release(&st); + scratch_end(scratch); + + ProfEnd(); + return coff_archive_data; +} + diff --git a/src/linker/lnk_lib.h b/src/linker/lnk_lib.h new file mode 100644 index 00000000..3801e9fd --- /dev/null +++ b/src/linker/lnk_lib.h @@ -0,0 +1,134 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef struct LNK_Lib +{ + String8 path; + String8 data; + COFF_ArchiveType type; + U32 symbol_count; + U32 * member_off_arr; + String8List symbol_name_list; + String8 long_names; +} LNK_Lib; + +typedef struct LNK_LibNode +{ + struct LNK_LibNode *next; + LNK_Lib data; +} LNK_LibNode; + +typedef struct LNK_LibNodeArray +{ + U64 count; + LNK_LibNode *v; +} LNK_LibNodeArray; + +typedef struct LNK_LibList +{ + U64 count; + struct LNK_LibNode *first; + struct LNK_LibNode *last; +} LNK_LibList; + +//////////////////////////////// + +typedef struct LNK_LibMember +{ + String8 name; + String8 data; +} LNK_LibMember; + +typedef struct LNK_LibMemberNode +{ + struct LNK_LibMemberNode *next; + LNK_LibMember data; +} LNK_LibMemberNode; + +typedef struct LNK_LibMemberList +{ + U64 count; + LNK_LibMemberNode *first; + LNK_LibMemberNode *last; +} LNK_LibMemberList; + +typedef struct LNK_LibSymbol +{ + String8 name; + U64 member_idx; +} LNK_LibSymbol; + +typedef struct LNK_LibSymbolNode +{ + struct LNK_LibSymbolNode *next; + LNK_LibSymbol data; +} LNK_LibSymbolNode; + +typedef struct LNK_LibSymbolList +{ + U64 count; + LNK_LibSymbolNode *first; + LNK_LibSymbolNode *last; +} LNK_LibSymbolList; + +typedef struct LNK_LibWriter +{ + Arena *arena; + LNK_LibMemberList member_list; + LNK_LibSymbolList symbol_list; +} LNK_LibWriter; + +typedef struct LNK_LibBuild +{ + U64 symbol_count; + U64 member_count; + LNK_LibSymbol 
*symbol_array; + LNK_LibMember *member_array; +} LNK_LibBuild; + +//////////////////////////////// + +typedef struct +{ + LNK_LibNode *node_arr; + String8 *data_arr; + String8 *path_arr; +} LNK_LibIniter; + +//////////////////////////////// + +internal LNK_LibNode * lnk_lib_list_reserve(Arena *arena, LNK_LibList *list, U64 count); +internal LNK_LibMemberNode * lnk_lib_member_list_push(Arena *arena, LNK_LibMemberList *list, LNK_LibMember member); +internal LNK_LibMember * lnk_lib_member_array_from_list(Arena *arena, LNK_LibMemberList list); +internal LNK_LibSymbolNode * lnk_lib_symbol_list_push(Arena *arena, LNK_LibSymbolList *list, LNK_LibSymbol symbol); + +internal LNK_LibSymbol * lnk_lib_symbol_array_from_list(Arena *arena, LNK_LibSymbolList list); +internal void lnk_lib_symbol_array_sort(LNK_LibSymbol *arr, U64 count); + +//////////////////////////////// + +internal LNK_Lib lnk_lib_from_data(Arena *arena, String8 data, String8 path); + +internal LNK_LibNodeArray lnk_lib_list_push_parallel(TP_Context *tp, TP_Arena *arena, LNK_LibList *list, String8Array data_arr, String8Array path_arr); +internal LNK_LibNode * lnk_lib_list_push(Arena *arena, LNK_LibList *list, String8 data, String8 path); + +//////////////////////////////// + +internal LNK_LibWriter * lnk_lib_writer_alloc(void); +internal void lnk_lib_writer_release(LNK_LibWriter **writer_ptr); +internal void lnk_lib_writer_push_obj(LNK_LibWriter *writer, LNK_Obj *obj); +internal void lnk_lib_writer_push_export(LNK_LibWriter *writer, COFF_MachineType machine, U64 time_stamp, String8 dll_name, LNK_Export *exp); +internal LNK_LibBuild lnk_lib_build_from_writer(Arena *arena, LNK_LibWriter *writer); +internal String8List lnk_coff_archive_from_lib_build(Arena *arena, LNK_LibBuild *lib, B32 emit_second_member, COFF_TimeStamp time_stamp, U32 mode); + +//////////////////////////////// + +internal LNK_LibBuild lnk_build_lib(Arena *arena, COFF_MachineType machine, COFF_TimeStamp time_stamp, String8 dll_name, LNK_ObjList 
obj_list, LNK_ExportTable *exptab); +internal String8List lnk_build_import_entry_obj(Arena *arena, String8 dll_name, COFF_MachineType machine); +internal String8List lnk_build_null_import_descriptor_obj(Arena *arena, COFF_MachineType machine); +internal String8List lnk_build_null_thunk_data_obj(Arena *arena, String8 dll_name, COFF_MachineType machine); +internal String8 lnk_build_lib_member_header(Arena *arena, String8 name, COFF_TimeStamp time_stamp, U16 user_id, U16 group_id, U16 mode, U32 size); +internal String8List lnk_build_import_lib(TP_Context *tp, TP_Arena *arena, COFF_MachineType machine, COFF_TimeStamp time_stamp, String8 lib_name, String8 dll_name, LNK_ExportTable *exptab); + diff --git a/src/linker/lnk_log.c b/src/linker/lnk_log.c new file mode 100644 index 00000000..a90c2dec --- /dev/null +++ b/src/linker/lnk_log.c @@ -0,0 +1,59 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal void +lnk_set_log_status(LNK_LogType type, B32 is_enabled) +{ + g_log_status[type] = is_enabled; +} + +internal B32 +lnk_get_log_status(LNK_LogType type) +{ + B32 status = g_log_status[type]; + return status; +} + +internal void +lnk_log(LNK_LogType type, char *fmt, ...) 
+{ + B32 is_log_enabled = g_log_status[type]; + if (is_log_enabled) { + Temp scratch = scratch_begin(0,0); + va_list args; + va_start(args, fmt); + String8 string = push_str8fv(scratch.arena, fmt, args); + fprintf(stdout, "%.*s\n", str8_varg(string)); + va_end(args); + scratch_end(scratch); + } +} + +internal LNK_LogType +lnk_log_type_from_string(String8 string) +{ + static struct { + char *name; + LNK_LogType type; + } map[] = { + "Null", LNK_Log_Null, + "Debug", LNK_Log_Debug, + "InputObj", LNK_Log_InputObj, + "InputLib", LNK_Log_InputLib, + "IO", LNK_Log_IO, + "SizeBreakdown", LNK_Log_SizeBreakdown, + "LinkStats", LNK_Log_LinkStats, + "Timers", LNK_Log_Timers, + }; + Assert(ArrayCount(map) == LNK_Log_Count); + + for (U64 i = 0; i < ArrayCount(map); ++i) { + if (str8_match(str8_cstring(map[i].name), string, StringMatchFlag_CaseInsensitive)) { + return map[i].type; + } + } + + return LNK_Log_Null; +} + + diff --git a/src/linker/lnk_log.h b/src/linker/lnk_log.h new file mode 100644 index 00000000..fe0f27d1 --- /dev/null +++ b/src/linker/lnk_log.h @@ -0,0 +1,24 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef enum +{ + LNK_Log_Null, + LNK_Log_Debug, + LNK_Log_InputObj, + LNK_Log_InputLib, + LNK_Log_IO, + LNK_Log_SizeBreakdown, + LNK_Log_LinkStats, + LNK_Log_Timers, + LNK_Log_Count +} LNK_LogType; + +internal void set_log_level(LNK_LogType type, B32 is_enabled); +internal B32 lnk_get_log_status(LNK_LogType type); +internal void lnk_log(LNK_LogType type, char *fmt, ...); + +internal LNK_LogType lnk_log_type_from_string(String8 string); + diff --git a/src/linker/lnk_obj.c b/src/linker/lnk_obj.c new file mode 100644 index 00000000..79250547 --- /dev/null +++ b/src/linker/lnk_obj.c @@ -0,0 +1,940 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +//////////////////////////////// + +internal void 
+lnk_error_obj(LNK_ErrorCode code, LNK_Obj *obj, char *fmt, ...) +{ + Temp scratch = scratch_begin(0, 0); + va_list args; + va_start(args, fmt); + String8 text = push_str8fv(scratch.arena, fmt, args); + + if (obj->lib_path.size) { + lnk_error(code, "%S(%S): %S", obj->lib_path, obj->path, text); + } else { + lnk_error(code, "%S: %S", obj->path, text); + } + + va_end(args); + scratch_end(scratch); +} + +//////////////////////////////// + +internal void +lnk_input_obj_list_push_node(LNK_InputObjList *list, LNK_InputObj *node) +{ + SLLQueuePush(list->first, list->last, node); + ++list->count; +} + +internal LNK_InputObj * +lnk_input_obj_list_push(Arena *arena, LNK_InputObjList *list) +{ + LNK_InputObj *node = push_array(arena, LNK_InputObj, 1); + lnk_input_obj_list_push_node(list, node); + return node; +} + +internal LNK_InputObj ** +lnk_array_from_input_obj_list(Arena *arena, LNK_InputObjList list) +{ + LNK_InputObj **result = push_array_no_zero(arena, LNK_InputObj *, list.count); + U64 i = 0; + for (LNK_InputObj *n = list.first; n != 0; n = n->next, ++i) { + Assert(i < list.count); + result[i] = n; + } + return result; +} + +internal void +lnk_input_obj_list_concat_in_place(LNK_InputObjList *list, LNK_InputObjList *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal int +lnk_input_obj_compar(const void *raw_a, const void *raw_b) +{ + const LNK_InputObj **a = (const LNK_InputObj **) raw_a; + const LNK_InputObj **b = (const LNK_InputObj **) raw_b; + int cmp = str8_compar_case_sensetive(&(*a)->path, &(*b)->path); + return cmp; +} + +internal int +lnk_input_obj_compar_is_before(void *raw_a, void *raw_b) +{ + LNK_InputObj **a = raw_a; + LNK_InputObj **b = raw_b; + int cmp = str8_compar_case_sensetive(&(*a)->path, &(*b)->path); + int is_before = cmp < 0; + return is_before; +} + +internal LNK_InputObjList +lnk_list_from_input_obj_arr(LNK_InputObj **arr, U64 count) +{ + LNK_InputObjList list = {0}; + for (U64 i = 0; i < count; ++i) { + SLLQueuePush(list.first, 
list.last, arr[i]); + ++list.count; + } + return list; +} + +internal LNK_InputObjList +lnk_input_obj_list_from_string_list(Arena *arena, String8List list) +{ + LNK_InputObjList input_list = {0}; + for (String8Node *path = list.first; path != 0; path = path->next) { + LNK_InputObj *input = lnk_input_obj_list_push(arena, &input_list); + input->is_thin = 1; + input->dedup_id = path->string; + input->path = path->string; + } + return input_list; +} + +//////////////////////////////// + +internal LNK_Obj ** +lnk_obj_arr_from_list(Arena *arena, LNK_ObjList list) +{ + LNK_Obj **arr = push_array_no_zero(arena, LNK_Obj *, list.count); + U64 idx = 0; + for (LNK_ObjNode *node = list.first; node != 0; node = node->next, ++idx) { + arr[idx] = &node->data; + } + return arr; +} + +internal LNK_ObjNodeArray +lnk_obj_list_reserve(Arena *arena, LNK_ObjList *list, U64 count) +{ + LNK_ObjNodeArray arr = {0}; + if (count) { + arr.count = count; + arr.v = push_array(arena, LNK_ObjNode, count); + for (LNK_ObjNode *ptr = arr.v, *opl = arr.v + arr.count; ptr < opl; ++ptr) { + SLLQueuePush(list->first, list->last, ptr); + } + list->count += count; + } else { + MemoryZeroStruct(&arr); + } + + return arr; +} + +internal LNK_ChunkList +lnk_obj_search_chunks(Arena *arena, LNK_Obj *obj, String8 name, String8 postfix, B32 collect_discarded) +{ + LNK_ChunkList list = {0}; + for (U64 sect_idx = 0; sect_idx < obj->chunk_count; ++sect_idx) { + String8 obj_sect_name = obj->sect_name_arr[sect_idx]; + String8 obj_sect_sort = obj->sect_sort_arr[sect_idx]; + + B32 is_match = str8_match(obj_sect_name, name, 0) && + str8_match(obj_sect_sort, postfix, 0); + + if (is_match) { + LNK_ChunkPtr chunk = &obj->chunk_arr[sect_idx]; + + if (!collect_discarded && lnk_chunk_is_discarded(chunk)) { + continue; + } + + LNK_ChunkNode *node = push_array_no_zero(arena, LNK_ChunkNode, 1); + node->next = 0; + node->data = chunk; + + SLLQueuePush(list.first, list.last, node); + ++list.count; + } + } + return list; +} + 
+internal +THREAD_POOL_TASK_FUNC(lnk_collect_obj_chunks_task) +{ + U64 obj_idx = task_id; + LNK_CollectObjChunksTaskData *task = raw_task; + LNK_Obj *obj = task->obj_arr[obj_idx]; + LNK_ChunkList *list_ptr = &task->list_arr[obj_idx]; + *list_ptr = lnk_obj_search_chunks(arena, obj, task->name, task->postfix, task->collect_discarded); +} + +internal LNK_ChunkList * +lnk_collect_obj_chunks(TP_Context *tp, TP_Arena *arena, U64 obj_count, LNK_Obj **obj_arr, String8 name, String8 postfix, B32 collect_discarded) +{ + LNK_CollectObjChunksTaskData task_data = {0}; + task_data.obj_arr = obj_arr; + task_data.name = name; + task_data.postfix = postfix; + task_data.list_arr = push_array_no_zero(arena->v[0], LNK_ChunkList, obj_count); + task_data.collect_discarded = collect_discarded; + tp_for_parallel(tp, arena, obj_count, lnk_collect_obj_chunks_task, &task_data); + return task_data.list_arr; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_symbol_collector) +{ + LNK_SymbolCollector *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + LNK_SymbolList *list = &task->out_arr[task_id]; + for (U64 obj_idx = range.min; obj_idx < range.max; ++obj_idx) { + LNK_Obj *obj = &task->in_arr.v[obj_idx].data; + for (LNK_SymbolNode *node = obj->symbol_list.first; node != 0; node = node->next) { + if (node->data->type == task->type) { + lnk_symbol_list_push(arena, list, node->data); + } + } + } +} + +internal LNK_SymbolList +lnk_run_symbol_collector(TP_Context *tp, TP_Arena *arena, LNK_ObjNodeArray arr, LNK_SymbolType symbol_type) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + LNK_SymbolCollector task_data; + task_data.type = symbol_type; + task_data.range_arr = tp_divide_work(scratch.arena, arr.count, tp->worker_count); + task_data.in_arr = arr; + task_data.out_arr = push_array(scratch.arena, LNK_SymbolList, tp->worker_count); + + tp_for_parallel(tp, arena, tp->worker_count, lnk_symbol_collector, &task_data); + + LNK_SymbolList list = {0}; + for (U64 ithread = 0; ithread 
< tp->worker_count; ++ithread) { + lnk_symbol_list_concat_in_place(&list, &task_data.out_arr[ithread]); + } + + scratch_end(scratch); + ProfEnd(); + return list; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_default_lib_collector) +{ + LNK_DefaultLibCollector *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + String8List *result = &task->out_arr[task_id]; + for (U64 obj_idx = range.min; obj_idx < range.max; obj_idx += 1) { + LNK_Obj *obj = &task->in_arr.v[obj_idx].data; + String8List list = lnk_parse_default_lib_directive(arena, &obj->directive_info.v[LNK_Directive_DefaultLib]); + str8_list_concat_in_place(result, &list); + } +} + +internal LNK_InputLibList +lnk_collect_default_lib_obj_arr(TP_Context *tp, TP_Arena *arena, LNK_ObjNodeArray arr) +{ + Temp scratch = scratch_begin(0,0); + + LNK_DefaultLibCollector task_data; + task_data.range_arr = tp_divide_work(scratch.arena, arr.count, tp->worker_count); + task_data.in_arr = arr; + task_data.out_arr = push_array(scratch.arena, LNK_InputLibList, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, lnk_default_lib_collector, &task_data); + + String8List result = str8_list_arr_concat(task_data.out_arr, tp->worker_count); + + scratch_end(scratch); + return result; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_manifest_dependency_collector) +{ + LNK_ManifestDependencyCollector *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + String8List *list = &task->out_arr[task_id]; + + LNK_Obj **obj_ptr = &task->in_arr[range.min]; + LNK_Obj **obj_opl = &task->in_arr[range.max]; + + for (; obj_ptr < obj_opl; obj_ptr += 1) { + LNK_Obj *obj = *obj_ptr; + LNK_DirectiveList *dirs = &obj->directive_info.v[LNK_Directive_ManifestDependency]; + for (LNK_Directive *dir = dirs->first; dir != 0; dir = dir->next) { + String8List dep = str8_list_copy(arena, &dir->value_list); + str8_list_concat_in_place(list, &dep); + } + } +} + +internal String8List +lnk_collect_manifest_dependency_list(TP_Context *tp, TP_Arena 
*arena, LNK_Obj **obj_arr, U64 obj_count) +{ + Temp scratch = scratch_begin(0,0); + + LNK_ManifestDependencyCollector task_data = {0}; + task_data.in_arr = obj_arr; + task_data.out_arr = push_array(scratch.arena, String8List, tp->worker_count); + task_data.range_arr = tp_divide_work(scratch.arena, obj_count, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, lnk_manifest_dependency_collector, &task_data); + + String8List result = str8_list_arr_concat(task_data.out_arr, tp->worker_count); + + scratch_end(scratch); + return result; +} + +internal void +lnk_sect_defn_list_push_node(LNK_SectDefnList *list, LNK_SectDefn *node) +{ + SLLQueuePush(list->first, list->last, node); + ++list->count; +} + +internal LNK_SectDefn * +lnk_sect_defn_list_push(Arena *arena, LNK_SectDefnList *list, LNK_Obj *obj, String8 name, U64 idx, COFF_SectionFlags flags) +{ + LNK_SectDefn *node = push_array_no_zero(arena, LNK_SectDefn, 1); + node->next = 0; + node->obj = obj; + node->name = name; + node->idx = idx; + node->flags = flags; + lnk_sect_defn_list_push_node(list, node); + return node; +} + +internal void +lnk_sect_defn_list_concat_in_place(LNK_SectDefnList *list, LNK_SectDefnList *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal void +lnk_sect_defn_list_concat_in_place_arr(LNK_SectDefnList *list, LNK_SectDefnList *to_concat_arr, U64 count) +{ + SLLConcatInPlaceArray(list, to_concat_arr, count); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_obj_initer) +{ + Temp scratch = scratch_begin(&arena, 1); + + LNK_ObjIniter *task = raw_task; + LNK_InputObj *input = task->inputs[task_id]; + U64 obj_idx = task->obj_id_base + task_id; + LNK_ObjNode *obj_node = task->obj_node_arr + task_id; + LNK_Obj *obj = &obj_node->data; + + //Assert(coff_data.size > 0); + + // cache path, we need it for error reports and debug stuff + String8 cached_path = push_str8_copy(arena, input->path); + String8 cached_lib_path = push_str8_copy(arena, input->lib_path); + + // parse coff obj + 
COFF_HeaderInfo coff_info = coff_header_info_from_data(input->data); + COFF_SectionHeader *coff_sect_arr = (COFF_SectionHeader *)(input->data.str + coff_info.section_array_off); + COFF_Symbol32Array coff_symbols = coff_symbol_array_from_data(scratch.arena, input->data, coff_info.symbol_off, coff_info.symbol_count, coff_info.symbol_size); + + // handle machines we dont support + if (coff_info.machine != COFF_MachineType_UNKNOWN && + coff_info.machine != COFF_MachineType_X64) { + lnk_error(LNK_Error_UnsupportedMachine, "%S: %S machine is supported", input->path, coff_string_from_machine_type(coff_info.machine)); + } + + U64 chunk_count = 0; + chunk_count += coff_info.section_count_no_null; + chunk_count += 1; // :common_block + + String8 *sect_name_arr = push_array_no_zero(arena, String8, chunk_count); + String8 *sect_sort_arr = push_array_no_zero(arena, String8, chunk_count); + LNK_Chunk *chunk_arr = push_array_no_zero(arena, LNK_Chunk, chunk_count); + + // init section name and postfix array + for (U64 sect_idx = 0; sect_idx < coff_info.section_count_no_null; sect_idx += 1) { + COFF_SectionHeader *coff_sect = &coff_sect_arr[sect_idx]; + + // read name + String8 sect_name = coff_section_header_get_name(coff_sect, input->data, coff_info.string_table_off); + + // parse section name + String8 name, postfix; + coff_parse_section_name(sect_name, &name, &postfix); + + // fill out + sect_name_arr[sect_idx] = name; + sect_sort_arr[sect_idx] = postfix; + } + + // :common_block + U64 common_block_idx = chunk_count - 1; + sect_name_arr[common_block_idx] = str8_lit(".bss"); + sect_sort_arr[common_block_idx] = str8_lit("~"); + + for (U64 sect_idx = 0; sect_idx < coff_info.section_count_no_null; sect_idx += 1) { + COFF_SectionHeader *coff_sect = &coff_sect_arr[sect_idx]; + + String8 data; + if (coff_sect->flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA) { + data = str8(0, coff_sect->fsize); + } else { + data = str8(input->data.str + coff_sect->foff, coff_sect->fsize); + } + + 
LNK_Chunk *chunk = &chunk_arr[sect_idx]; + chunk->ref = lnk_chunk_ref(0,0); // :chunk_ref_assign + chunk->align = coff_align_size_from_section_flags(coff_sect->flags); + chunk->is_discarded = !!(coff_sect->flags & COFF_SectionFlag_LNK_REMOVE); + chunk->sort_chunk = 1; + chunk->type = LNK_Chunk_Leaf; + chunk->sort_idx = sect_sort_arr[sect_idx]; + chunk->input_idx = LNK_MakeChunkInputIdx(obj_idx, sect_idx); + chunk->flags = coff_sect->flags; + chunk->associate = 0; + chunk->u.leaf = data; + lnk_chunk_set_debugf(arena, chunk, "%S: name: %S, isect: 0x%llX", path, sect_name_arr[sect_idx], sect_idx); + } + + // :common_block + LNK_Chunk *master_common_block = &chunk_arr[common_block_idx]; + master_common_block->ref = lnk_chunk_ref(0,0); // :chunk_ref_assign + master_common_block->align = 1; + master_common_block->is_discarded = 0; + master_common_block->sort_chunk = 0; + master_common_block->type = LNK_Chunk_List; + master_common_block->sort_idx = sect_sort_arr[common_block_idx]; + master_common_block->input_idx = LNK_MakeChunkInputIdx(obj_idx, common_block_idx); + master_common_block->flags = LNK_BSS_SECTION_FLAGS; + master_common_block->associate = 0; + master_common_block->u.list = push_array(arena, LNK_ChunkList, 1); + lnk_chunk_set_debugf(arena, master_common_block, "%S: master common block", path); + + // convert from coff + LNK_SymbolArray symbol_arr = lnk_symbol_array_from_coff(arena, input->data, cached_path, coff_info.string_table_off, coff_info.section_count_no_null, coff_sect_arr, coff_symbols, chunk_arr, master_common_block); + LNK_SymbolList symbol_list = lnk_symbol_list_from_array(arena, symbol_arr); + LNK_RelocList *reloc_list_arr = lnk_reloc_list_array_from_coff(arena, coff_info.machine, input->data, coff_info.section_count_no_null, coff_sect_arr, chunk_arr, symbol_arr); + + // fill out obj + obj->data = input->data; + obj->path = cached_path; + obj->lib_path = cached_lib_path; + obj->machine = coff_info.machine; + obj->chunk_count = chunk_count; + 
obj->sect_count = coff_info.section_count_no_null; + obj->sect_name_arr = sect_name_arr; + obj->sect_sort_arr = sect_sort_arr; + obj->chunk_arr = chunk_arr; + obj->symbol_list = symbol_list; + obj->sect_reloc_list_arr = reloc_list_arr; + obj->directive_info = lnk_init_directives(arena, cached_path, coff_info.section_count_no_null, sect_name_arr, chunk_arr); + + // parse exports + LNK_ExportParseList export_parse = {0}; + for (LNK_Directive *dir = obj->directive_info.v[LNK_Directive_Export].first; dir != 0; dir = dir->next) { + lnk_parse_export_direcive(arena, &obj->export_parse, dir->value_list, obj); + } + + // push /export symbols + for (LNK_ExportParse *exp = export_parse.first; exp != 0; exp = exp->next) { + LNK_Symbol *symbol = lnk_make_undefined_symbol(arena, exp->name, LNK_SymbolScopeFlag_Main); + lnk_symbol_list_push(arena, &obj->symbol_list, symbol); + } + + // push /include symbols + for (LNK_Directive *dir = obj->directive_info.v[LNK_Directive_Include].first; dir != 0; dir = dir->next) { + str8_list_concat_in_place(&obj->include_symbol_list, &dir->value_list); + } + + // parse /alternatename + for (LNK_Directive *dir = obj->directive_info.v[LNK_Directive_AlternateName].first; dir != 0; dir = dir->next) { + String8 *invalid_string = lnk_parse_alt_name_directive_list(arena, dir->value_list, &obj->alt_name_list); + if (invalid_string != 0) { + lnk_error_obj(LNK_Error_Cmdl, obj, "invalid syntax \"%S\", expected format \"FROM=TO\"", *invalid_string); + } + } + + scratch_end(scratch); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_obj_new_sect_scanner) +{ + LNK_ObjNewSectScanner *task = raw_task; + + Rng1U64 range = task->range_arr[task_id]; + HashTable *ht = hash_table_init(arena, 128); + + for (U64 obj_idx = range.min; obj_idx < range.max; obj_idx += 1) { + LNK_Obj *obj = &task->obj_node_arr[obj_idx].data; + + for (U64 chunk_idx = 0; chunk_idx < obj->chunk_count; chunk_idx += 1) { + String8 sect_name = obj->sect_name_arr[chunk_idx]; + COFF_SectionFlags sect_flags 
= obj->chunk_arr[chunk_idx].flags & ~COFF_SectionFlags_LNK_FLAGS; + + KeyValuePair *is_present = hash_table_search_string(ht, sect_name); + if (is_present) { + if (lnk_is_error_code_active(LNK_Warning_SectionFlagsConflict)) { + LNK_SectDefn *defn = is_present->value_raw; + if (defn->flags != sect_flags) { + lnk_sect_defn_list_push(arena, &task->defn_arr[task_id], obj, sect_name, chunk_idx, sect_flags); + } + } + } else { + LNK_SectDefn *defn = lnk_sect_defn_list_push(arena, &task->defn_arr[task_id], obj, sect_name, chunk_idx, sect_flags); + hash_table_push_string_raw(arena, ht, sect_name, defn); + } + } + } +} + +LNK_CHUNK_VISITOR_SIG(lnk_chunk_get_count_cb) +{ + U64 *counter = (U64 *)ud; + *counter += 1; + return 0; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_chunk_counter) +{ + U64 obj_idx = task_id; + LNK_ChunkCounter *task = raw_task; + LNK_Obj *obj = &task->obj_arr[obj_idx].data; + for (U64 chunk_idx = 0; chunk_idx < obj->chunk_count; chunk_idx += 1) { + String8 name = obj->sect_name_arr[chunk_idx]; + LNK_Chunk *chunk = &obj->chunk_arr[chunk_idx]; + LNK_Section *sect = lnk_section_table_search(task->st, name); + + U64 count = 0; + lnk_visit_chunks(0, chunk, lnk_chunk_get_count_cb, &count); + + task->chunk_count_arr_arr[sect->id][obj_idx] += count; + } +} + +internal +LNK_CHUNK_VISITOR_SIG(lnk_chunk_ref_assign) +{ + LNK_ChunkRefAssign *ctx = ud; + + // alloc chunk id + U64 chunk_id = ctx->chunk_id_arr_arr[sect_id][ctx->obj_idx]; + ctx->chunk_id_arr_arr[sect_id][ctx->obj_idx] += 1; + + // set chunk ref + chunk->ref = lnk_chunk_ref(sect_id, chunk_id); + + // keep visiting chunks + return 0; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_chunk_ref_assigner) +{ + LNK_ChunkRefAssigner *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + + for (U64 obj_idx = range.min; obj_idx < range.max; obj_idx += 1) { + LNK_Obj *obj = &task->obj_arr[obj_idx].data; + + for (U64 chunk_idx = 0; chunk_idx < obj->chunk_count; chunk_idx += 1) { + String8 name = 
obj->sect_name_arr[chunk_idx]; + String8 sort = obj->sect_sort_arr[chunk_idx]; + LNK_Chunk *chunk = &obj->chunk_arr[chunk_idx]; + + // :find_chunk_section + LNK_Section *sect = lnk_section_table_search(task->st, name); + Assert(sect); + + // :chunk_ref_assign + LNK_ChunkRefAssign ctx; + ctx.cman = sect->cman; + ctx.chunk_id_arr_arr = task->chunk_id_arr_arr; + ctx.obj_idx = obj_idx; + lnk_visit_chunks(sect->id, chunk, lnk_chunk_ref_assign, &ctx); + + // push to section chunk list + LNK_ChunkList **chunk_list_arr_arr = sort.size ? task->chunk_list_arr_arr : task->nosort_chunk_list_arr_arr; + lnk_chunk_list_push(arena, &chunk_list_arr_arr[sect->id][task_id], chunk); + } + } +} + +internal LNK_ObjNodeArray +lnk_obj_list_push_parallel(TP_Context *tp, TP_Arena *arena, LNK_ObjList *obj_list, LNK_SectionTable *st, U64 input_count, LNK_InputObj **inputs) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + U64 obj_id_base = obj_list->count; + LNK_ObjNodeArray obj_arr = lnk_obj_list_reserve(arena->v[0], obj_list, input_count); + + ProfBegin("Obj Initer"); + { + LNK_ObjIniter task = {0}; + task.inputs = inputs; + task.obj_id_base = obj_id_base; + task.obj_node_arr = obj_arr.v; + tp_for_parallel(tp, arena, input_count, lnk_obj_initer, &task); + } + ProfEnd(); + + if (st) { + ProfBegin("Section Table Update"); + { + TP_Temp temp = tp_temp_begin(arena); + + LNK_ObjNewSectScanner task; + task.range_arr = tp_divide_work(arena->v[0], obj_arr.count, tp->worker_count); + task.obj_node_arr = obj_arr.v; + task.defn_arr = push_array(arena->v[0], LNK_SectDefnList, tp->worker_count); + task.conf_arr = push_array(arena->v[0], LNK_SectDefnList, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, lnk_obj_new_sect_scanner, &task); + + + LNK_SectDefnList defn_list = {0}; + LNK_SectDefnList conf_list = {0}; + lnk_sect_defn_list_concat_in_place_arr(&defn_list, task.defn_arr, tp->worker_count); + lnk_sect_defn_list_concat_in_place_arr(&conf_list, 
task.conf_arr, tp->worker_count); + + + HashTable *ht = hash_table_init(arena->v[0], 128); + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = sect_node->next) { + LNK_Section *sect = &sect_node->data; + hash_table_push_string_u64(arena->v[0], ht, sect->name, sect->flags); + } + + + LNK_SectDefnList new_list = {0}; + for (LNK_SectDefn *curr = defn_list.first, *next; curr != 0; curr = next) { + next = curr->next; + curr->next = 0; + + KeyValuePair *is_present = hash_table_search_string(ht, curr->name); + if (is_present) { + if (lnk_is_error_code_active(LNK_Warning_SectionFlagsConflict)) { + COFF_SectionFlags flags = is_present->value_u64; + if (flags != curr->flags) { + lnk_sect_defn_list_push_node(&conf_list, curr); + } else { + // section is present or is in new_list + } + } + } else { + lnk_sect_defn_list_push_node(&new_list, curr); + hash_table_push_string_u64(arena->v[0], ht, curr->name, curr->flags); + } + } + + + for (LNK_SectDefn *defn = conf_list.first; defn != 0; defn = defn->next) { + KeyValuePair *is_present = hash_table_search_string(ht, defn->name); + if (!is_present) { + InvalidPath; + } + U64 sect_number = (defn->idx + 1); + COFF_SectionFlags expected_flags = is_present->value_u64; + String8 expected_flags_str = coff_string_from_section_flags(scratch.arena, expected_flags); + String8 current_flags_str = coff_string_from_section_flags(scratch.arena, defn->flags); + lnk_error_obj(LNK_Warning_SectionFlagsConflict, defn->obj, "detected section flags conflict in %S(No. 
%X); expected {%S} but got {%S}", defn->name, sect_number, expected_flags_str, current_flags_str); + } + + + // push new sections for :find_chunk_section + for (LNK_SectDefn *curr = new_list.first; curr != 0; curr = curr->next) { + lnk_section_table_push(st, curr->name, curr->flags & ~COFF_SectionFlags_LNK_FLAGS); + } + + tp_temp_end(temp); + } + ProfEnd(); + + ProfBegin("Count Chunks Per Section"); + U64 **chunk_id_arr_arr; + { + U64 **chunk_count_arr_arr = push_array_no_zero(scratch.arena, U64 *, st->id_max); + for (U64 sect_idx = 0; sect_idx < st->id_max; sect_idx += 1) { + chunk_count_arr_arr[sect_idx] = push_array(scratch.arena, U64, obj_arr.count); + } + + LNK_ChunkCounter task; + task.st = st; + task.obj_arr = obj_arr.v; + task.chunk_count_arr_arr = chunk_count_arr_arr; + tp_for_parallel(tp, 0, obj_arr.count, lnk_chunk_counter, &task); + + chunk_id_arr_arr = chunk_count_arr_arr; + for (U64 sect_idx = 1; sect_idx < st->id_max; sect_idx += 1) { + LNK_Section *sect = lnk_section_table_search_id(st, sect_idx); + if (!sect) continue; + for (U64 obj_idx = 0; obj_idx < obj_arr.count; obj_idx += 1) { + U64 chunk_id_base = sect->cman->total_chunk_count; + sect->cman->total_chunk_count += chunk_count_arr_arr[sect_idx][obj_idx]; + chunk_id_arr_arr[sect_idx][obj_idx] = chunk_id_base; + } + } + } + ProfEnd(); + + ProfBegin("Assign Chunk Refs"); + { + LNK_ChunkRefAssigner task; + task.st = st; + task.range_arr = tp_divide_work(scratch.arena, obj_arr.count, tp->worker_count); + task.chunk_id_arr_arr = chunk_id_arr_arr; + task.obj_arr = obj_arr.v; + task.nosort_chunk_list_arr_arr = lnk_make_chunk_list_arr_arr(scratch.arena, st->id_max, tp->worker_count); + task.chunk_list_arr_arr = lnk_make_chunk_list_arr_arr(scratch.arena, st->id_max, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, lnk_chunk_ref_assigner, &task); + + // merge chunks + for (LNK_SectionNode *sect_node = st->list.first; sect_node != 0; sect_node = sect_node->next) { + LNK_Section *sect = 
§_node->data; + lnk_chunk_list_concat_in_place_arr(sect->nosort_chunk->u.list, task.nosort_chunk_list_arr_arr[sect->id], tp->worker_count); + lnk_chunk_list_concat_in_place_arr(sect->root->u.list, task.chunk_list_arr_arr[sect->id], tp->worker_count); + } + } + ProfEnd(); + } + + ProfEnd(); + scratch_end(scratch); + return obj_arr; +} + +internal LNK_SymbolArray +lnk_symbol_array_from_coff(Arena *arena, + String8 coff_data, + String8 obj_path, + U64 string_table_off, + U64 sect_count, + COFF_SectionHeader *coff_sect_arr, + COFF_Symbol32Array coff_symbols, + LNK_Chunk *chunk_arr, + LNK_Chunk *master_common_block) +{ + LNK_SymbolList weak_symbol_list = {0}; + + LNK_SymbolArray symbol_array = {0}; + symbol_array.count = coff_symbols.count; + symbol_array.v = push_array(arena, LNK_Symbol, symbol_array.count); + + for (U64 symbol_idx = 0; symbol_idx < coff_symbols.count; symbol_idx += 1) { + COFF_Symbol32 *coff_symbol = &coff_symbols.v[symbol_idx]; + LNK_Symbol *symbol = &symbol_array.v[symbol_idx]; + lnk_symbol_set_debug(symbol, obj_path); + + String8 name = coff_read_symbol_name(coff_data, string_table_off, &coff_symbol->name); + + // TODO: we convert 16-bit symbols and copy them to arena; symbols with short names + // are stored in the symbol itself and becuase converted symbols are pushed to scratch + // that memory is discarded after obj is processed + name = push_str8_copy(arena, name); + + COFF_SymbolValueInterpType interp = coff_interp_symbol(coff_symbol); + switch (interp) { + case COFF_SymbolValueInterp_REGULAR: { + if (coff_symbol->section_number == 0 || coff_symbol->section_number > sect_count) { + lnk_error(LNK_Error_IllData, "%S: out ouf bounds section index in symbol \"%S (%u)\"", obj_path, name, coff_symbol->section_number); + break; + } + + LNK_DefinedSymbolVisibility visibility = LNK_DefinedSymbolVisibility_Static; + if (coff_symbol->storage_class == COFF_SymStorageClass_EXTERNAL) { + visibility = LNK_DefinedSymbolVisibility_Extern; + } + + 
LNK_DefinedSymbolFlags flags = 0; + if (coff_symbol->type.u.lsb == COFF_SymType_NULL && coff_symbol->type.u.msb == COFF_SymDType_FUNC) { + flags |= LNK_DefinedSymbolFlag_IsFunc; + } + + COFF_ComdatSelectType selection = COFF_ComdatSelectType_ANY; + U64 check_sum = 0; + { + B32 is_comdat = !!(coff_sect_arr[coff_symbol->section_number - 1].flags & COFF_SectionFlag_LNK_COMDAT); + B32 has_static_def = is_comdat && + coff_symbol->value == 0 && + coff_symbol->type.u.lsb == COFF_SymType_NULL && + coff_symbol->storage_class == COFF_SymStorageClass_STATIC && + coff_symbol->aux_symbol_count == 1; + if (has_static_def) { + COFF_SymbolSecDef *secdef = (COFF_SymbolSecDef *)(coff_symbol + 1); + selection = secdef->selection; + check_sum = secdef->check_sum; + + if (secdef->selection == COFF_ComdatSelectType_ASSOCIATIVE) { + LNK_Chunk *head_chunk = &chunk_arr[secdef->number - 1]; + LNK_Chunk *associate_chunk = &chunk_arr[coff_symbol->section_number - 1]; + lnk_chunk_associate(arena, head_chunk, associate_chunk); + } + } + } + + LNK_Chunk *chunk = &chunk_arr[coff_symbol->section_number - 1]; + U64 offset = coff_symbol->value; + lnk_init_defined_symbol_chunk(symbol, name, visibility, flags, chunk, offset, selection, check_sum); + } break; + case COFF_SymbolValueInterp_UNDEFINED: { + lnk_init_undefined_symbol(symbol, name, LNK_SymbolScopeFlag_Main); + } break; + case COFF_SymbolValueInterp_COMMON: { + // :common_block + LNK_Chunk *chunk = push_array_no_zero(arena, LNK_Chunk, 1); + chunk->ref = lnk_chunk_ref(0,0); // :chunk_ref_assign + chunk->align = 1; + chunk->is_discarded = 0; + chunk->sort_chunk = 1; + chunk->type = LNK_Chunk_Leaf; + chunk->sort_idx = str8(0,0); + chunk->input_idx = LNK_MakeChunkInputIdx(0, lnk_chunk_list_get_node_count(master_common_block)); + chunk->flags = LNK_BSS_SECTION_FLAGS; + chunk->associate = 0; + chunk->u.leaf = str8(0, coff_symbol->value); + lnk_chunk_set_debugf(arena, chunk, "common block %S", name); + lnk_chunk_list_push(arena, 
master_common_block->u.list, chunk); + + LNK_DefinedSymbolVisibility visibility = LNK_DefinedSymbolVisibility_Extern; + + LNK_DefinedSymbolFlags flags = 0; + if (coff_symbol->type.u.lsb == COFF_SymType_NULL && coff_symbol->type.u.msb == COFF_SymDType_FUNC) { + flags |= LNK_DefinedSymbolFlag_IsFunc; + } + + lnk_init_defined_symbol_chunk(symbol, name, visibility, flags, chunk, 0, COFF_ComdatSelectType_LARGEST, 0); + } break; + case COFF_SymbolValueInterp_WEAK: { + if (coff_symbol->aux_symbol_count == 0 || symbol_idx + 1 >= coff_symbols.count) { + lnk_error(LNK_Error_IllData, "%S: Weak symbol \"%S (%u)\" must at least one aux symbol", obj_path, name, symbol_idx); + break; + } + + COFF_SymbolWeakExt *weak_ext = (COFF_SymbolWeakExt*)(coff_symbol + 1); + if (weak_ext->tag_index >= symbol_array.count) { + lnk_error(LNK_Error_IllData, "%S: Weak symbol \"%S (%u)\" points to out of bounds symbol", obj_path, name, symbol_idx); + break; + } +#if 0 + if (symbol_array.v[weak_ext->tag_index] == NULL) { + lnk_error(LNK_ERROR_ILL_DATA, "%S: Weak symbol \"%S (%u)\" tags auxiliary symbol %u", + obj_path, name, symbol_idx, weak_ext->tag_index); + break; + } +#endif + + lnk_init_weak_symbol(symbol, name, weak_ext->characteristics, &symbol_array.v[weak_ext->tag_index]); + + lnk_symbol_list_push(arena, &weak_symbol_list, symbol); + } break; + case COFF_SymbolValueInterp_ABS: { + // Never code or data, synthetic symbol. COFF spec says bits in value are used + // as flags in symbol @feat.00, other symbols like @comp.id and @vol.md are undocumented. + // LLVM uses undocumented mask 0x4800 on @feat.00 to tell if object was compiled with /guard:cf. 
+ + LNK_DefinedSymbolVisibility visibility = LNK_DefinedSymbolVisibility_Static; + if (coff_symbol->storage_class == COFF_SymStorageClass_EXTERNAL) { + visibility = LNK_DefinedSymbolVisibility_Extern; + } + + lnk_init_defined_symbol_va(symbol, name, visibility, 0, coff_symbol->value); + } break; + case COFF_SymbolValueInterp_DEBUG: { + // ignore + } break; + } + + // skip aux symbols + symbol_idx += coff_symbol->aux_symbol_count; + } + + return symbol_array; +} + +internal LNK_RelocList * +lnk_reloc_list_array_from_coff(Arena *arena, COFF_MachineType machine, String8 coff_data, U64 sect_count, COFF_SectionHeader *coff_sect_arr, LNK_Chunk *chunk_arr, LNK_SymbolArray symbol_array) +{ + LNK_RelocList *reloc_list_arr = push_array_no_zero(arena, LNK_RelocList, sect_count); + for (U64 sect_idx = 0; sect_idx < sect_count; sect_idx += 1) { + COFF_SectionHeader *coff_header = &coff_sect_arr[sect_idx]; + COFF_RelocInfo coff_reloc_info = coff_reloc_info_from_section_header(coff_data, coff_header); + COFF_Reloc *coff_reloc_v = (COFF_Reloc *)(coff_data.str + coff_reloc_info.array_off); + LNK_Chunk *sect_chunk = &chunk_arr[sect_idx]; + reloc_list_arr[sect_idx] = lnk_reloc_list_from_coff_reloc_array(arena, machine, sect_chunk, symbol_array, coff_reloc_v, coff_reloc_info.count); + } + return reloc_list_arr; +} + +internal LNK_DirectiveInfo +lnk_init_directives(Arena *arena, String8 obj_path, U64 chunk_count, String8 *sect_name_arr, LNK_Chunk *chunk_arr) +{ + LNK_DirectiveInfo directive_info = {0}; + for (U64 chunk_idx = 0; chunk_idx < chunk_count; chunk_idx += 1) { + String8 sect_name = sect_name_arr[chunk_idx]; + LNK_Chunk *sect_chunk = &chunk_arr[chunk_idx]; + Assert(sect_chunk->type == LNK_Chunk_Leaf); + + if (!str8_match(sect_name, str8_lit(".drectve"), 0)) { + continue; + } + if (sect_chunk->u.leaf.size < 3) { + lnk_error(LNK_Warning_IllData, "%S: can't parse %S", obj_path, sect_name); + continue; + } + if (~sect_chunk->flags & COFF_SectionFlag_LNK_INFO) { + 
lnk_error(LNK_Warning_IllData, "%S: %S missing COFF_SectionFlag_LNK_INFO.", obj_path, sect_name); + } + + // TODO: warn if section has relocations + + lnk_parse_directives(arena, &directive_info, sect_chunk->u.leaf, obj_path); + int bad_vs = 0; (void)bad_vs; + } + return directive_info; +} + +internal COFF_FeatFlags +lnk_obj_get_features(LNK_Obj *obj) +{ + COFF_FeatFlags result = 0; + LNK_Symbol *sym = lnk_symbol_list_search(obj->symbol_list, str8_lit("@feat.00"), 0); + if (sym) { + Assert(LNK_Symbol_IsDefined(sym->type)); + Assert(sym->u.defined.value_type == LNK_DefinedSymbolValue_VA); + result = sym->u.defined.u.va; + } + return result; +} + +internal U32 +lnk_obj_get_comp_id(LNK_Obj *obj) +{ + U32 result = 0; + LNK_Symbol *sym = lnk_symbol_list_search(obj->symbol_list, str8_lit("@comp.id"), 0); + if (sym) { + Assert(LNK_Symbol_IsDefined(sym->type)); + Assert(sym->u.defined.value_type == LNK_DefinedSymbolValue_VA); + result = sym->u.defined.u.va; + } + return result; +} + +internal U32 +lnk_obj_get_vol_md(LNK_Obj *obj) +{ + U32 result = 0; + LNK_Symbol *sym = lnk_symbol_list_search(obj->symbol_list, str8_lit("@vol.md"), 0); + if (sym) { + Assert(LNK_Symbol_IsDefined(sym->type)); + Assert(sym->u.defined.value_type == LNK_DefinedSymbolValue_VA); + result = sym->u.defined.u.va; + } + return result; +} + diff --git a/src/linker/lnk_obj.h b/src/linker/lnk_obj.h new file mode 100644 index 00000000..21cd8629 --- /dev/null +++ b/src/linker/lnk_obj.h @@ -0,0 +1,190 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +//////////////////////////////// + +typedef struct LNK_InputObj +{ + struct LNK_InputObj *next; + B32 is_thin; + B32 has_disk_read_failed; + String8 dedup_id; + String8 path; + String8 data; + String8 lib_path; +} LNK_InputObj; + +typedef struct LNK_InputObjList +{ + U64 count; + LNK_InputObj *first; + LNK_InputObj *last; +} LNK_InputObjList; + +//////////////////////////////// 
+ +#define LNK_MakeChunkInputIdx(obj_idx, sect_idx) (((U64)(obj_idx) << 32) | (U64)((sect_idx) & max_U32)) + +typedef struct LNK_Obj +{ + String8 data; + String8 path; + String8 lib_path; + U64 common_symbol_size; + COFF_MachineType machine; + U64 chunk_count; + U64 sect_count; + String8 *sect_name_arr; + String8 *sect_sort_arr; + LNK_RelocList *sect_reloc_list_arr; + LNK_Chunk *chunk_arr; + LNK_SymbolList symbol_list; + LNK_DirectiveInfo directive_info; + LNK_ExportParseList export_parse; + String8List include_symbol_list; + LNK_AltNameList alt_name_list; +} LNK_Obj; + +typedef struct LNK_ObjNode +{ + struct LNK_ObjNode *next; + LNK_Obj data; +} LNK_ObjNode; + +typedef struct LNK_ObjList +{ + U64 count; + LNK_ObjNode *first; + LNK_ObjNode *last; +} LNK_ObjList; + +typedef struct LNK_ObjNodeArray +{ + U64 count; + LNK_ObjNode *v; +} LNK_ObjNodeArray; + +//////////////////////////////// + +typedef struct LNK_SectDefn +{ + struct LNK_SectDefn *next; + LNK_Obj *obj; + String8 name; + COFF_SectionFlags flags; + U64 idx; +} LNK_SectDefn; + +typedef struct +{ + U64 count; + LNK_SectDefn *first; + LNK_SectDefn *last; +} LNK_SectDefnList; + +typedef struct +{ + LNK_InputObj **inputs; + LNK_ObjNode *obj_node_arr; + U64 obj_id_base; + LNK_SectDefnList *defn_arr; + LNK_SectionTable *st; +} LNK_ObjIniter; + +typedef struct +{ + Rng1U64 *range_arr; + LNK_ObjNode *obj_node_arr; + LNK_SectDefnList *defn_arr; + LNK_SectDefnList *conf_arr; +} LNK_ObjNewSectScanner; + +typedef struct +{ + LNK_SectionTable *st; + LNK_ObjNode *obj_arr; + U64 **chunk_count_arr_arr; +} LNK_ChunkCounter; + +typedef struct +{ + LNK_ChunkManager *cman; + U64 **chunk_id_arr_arr; + U64 obj_idx; +} LNK_ChunkRefAssign; + +typedef struct +{ + LNK_SectionTable *st; + Rng1U64 *range_arr; + U64 **chunk_id_arr_arr; + LNK_ObjNode *obj_arr; + LNK_ChunkList **nosort_chunk_list_arr_arr; + LNK_ChunkList **chunk_list_arr_arr; +} LNK_ChunkRefAssigner; + +typedef struct +{ + LNK_SymbolType type; + LNK_ObjNodeArray in_arr; 
+ LNK_SymbolList *out_arr; + Rng1U64 *range_arr; +} LNK_SymbolCollector; + +typedef struct +{ + LNK_Obj **obj_arr; + String8 name; + String8 postfix; + B32 collect_discarded; + LNK_ChunkList *list_arr; +} LNK_CollectObjChunksTaskData; + +typedef struct +{ + Rng1U64 *range_arr; + LNK_ObjNodeArray in_arr; + String8List *out_arr; +} LNK_DefaultLibCollector; + +typedef struct +{ + LNK_Obj **in_arr; + String8List *out_arr; + Rng1U64 *range_arr; +} LNK_ManifestDependencyCollector; + +//////////////////////////////// + +internal void lnk_error_obj(LNK_ErrorCode code, LNK_Obj *obj, char *fmt, ...); + +//////////////////////////////// + +internal void lnk_input_obj_list_push_node(LNK_InputObjList *list, LNK_InputObj *node); +internal void lnk_input_obj_list_concat_in_place(LNK_InputObjList *list, LNK_InputObjList *to_concat); +internal LNK_InputObj * lnk_input_obj_list_push(Arena *arena, LNK_InputObjList *list); +internal LNK_InputObj ** lnk_array_from_input_obj_list(Arena *arena, LNK_InputObjList list); +internal LNK_InputObjList lnk_input_obj_list_from_string_list(Arena *arena, String8List list); +internal LNK_InputObjList lnk_list_from_input_obj_arr(LNK_InputObj **arr, U64 count); + +//////////////////////////////// + +internal LNK_InputObjList lnk_input_obj_list_from_string_list(Arena *arena, String8List list); + +internal LNK_Obj ** lnk_obj_arr_from_list(Arena *arena, LNK_ObjList list); +internal LNK_ObjNodeArray lnk_obj_list_reserve(Arena *arena, LNK_ObjList *list, U64 count); +internal LNK_ChunkList * lnk_collect_obj_chunks(TP_Context *tp, TP_Arena *arena, U64 obj_count, LNK_Obj **obj_arr, String8 name, String8 postfix, B32 collect_discarded); +internal LNK_ObjNodeArray lnk_obj_list_push_parallel(TP_Context *tp, TP_Arena *tp_arena, LNK_ObjList *obj_list, LNK_SectionTable *st, U64 input_count, LNK_InputObj **inputs); + +internal LNK_Chunk * lnk_sect_chunk_array_from_coff(Arena *arena, U64 obj_id, String8 obj_path, String8 coff_data, U64 sect_count, COFF_SectionHeader 
*coff_sect_arr, String8 *sect_name_arr, String8 *sect_postfix_arr); +internal LNK_SymbolArray lnk_symbol_array_from_coff(Arena *arena, String8 coff_data, String8 obj_path, U64 string_table_off, U64 sect_count, COFF_SectionHeader *coff_sect_arr, COFF_Symbol32Array coff_symbols, LNK_Chunk *chunk_arr, LNK_Chunk *master_common_block); +internal LNK_RelocList lnk_reloc_list_from_coff_reloc_array(Arena *arena, COFF_MachineType machine, LNK_Chunk *chunk, LNK_SymbolArray symbol_array, COFF_Reloc *reloc_v, U64 reloc_count); +internal LNK_RelocList * lnk_reloc_list_array_from_coff(Arena *arena, COFF_MachineType machine, String8 coff_data, U64 sect_count, COFF_SectionHeader *coff_sect_arr, LNK_Chunk *sect_chunk_arr, LNK_SymbolArray symbol_array); +internal LNK_DirectiveInfo lnk_init_directives(Arena *arena, String8 obj_path, U64 chunk_count, String8 *sect_name_arr, LNK_Chunk *chunk_arr); + +internal U32 lnk_obj_get_features(LNK_Obj *obj); +internal U32 lnk_obj_get_comp_id(LNK_Obj *obj); +internal U32 lnk_obj_get_vol_md(LNK_Obj *obj); + diff --git a/src/linker/lnk_reloc.c b/src/linker/lnk_reloc.c new file mode 100644 index 00000000..e22c31d4 --- /dev/null +++ b/src/linker/lnk_reloc.c @@ -0,0 +1,153 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal LNK_Reloc * +lnk_reloc_list_reserve(Arena *arena, LNK_RelocList *list, U64 count) +{ + LNK_Reloc *arr = NULL; + if (count) { + arr = push_array(arena, LNK_Reloc, count); + for (LNK_Reloc *ptr = arr, *opl = arr + count; ptr < opl; ++ptr) { + SLLQueuePush(list->first, list->last, ptr); + } + list->count += count; + } + return arr; +} + +internal LNK_Reloc * +lnk_reloc_list_push(Arena *arena, LNK_RelocList *list) +{ + LNK_Reloc *node = push_array(arena, LNK_Reloc, 1); + SLLQueuePush(list->first, list->last, node); + list->count += 1; + return node; +} + +internal LNK_RelocList +lnk_reloc_list_copy(Arena *arena, LNK_RelocList *list) +{ + LNK_RelocList result = 
{0}; + for (LNK_Reloc *n = list->first; n != NULL; n = n->next) { + LNK_Reloc *r = lnk_reloc_list_push(arena, &result); + r->chunk = n->chunk; + r->type = n->type; + r->apply_off = n->apply_off; + r->symbol = n->symbol; + } + return result; +} + +internal void +lnk_reloc_list_concat_in_place(LNK_RelocList *list, LNK_RelocList *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal void +lnk_reloc_list_concat_in_place_arr(LNK_RelocList *list, LNK_RelocList *arr, U64 count) +{ + SLLConcatInPlaceArray(list, arr, count); +} + +internal LNK_RelocList ** +lnk_make_reloc_list_arr_arr(Arena *arena, U64 slot_count, U64 per_count) +{ + LNK_RelocList **arr_arr = push_array_no_zero(arena, LNK_RelocList *, slot_count); + for (U64 i = 0; i < slot_count; i += 1) { + arr_arr[i] = push_array(arena, LNK_RelocList, per_count); + } + return arr_arr; +} + +internal LNK_RelocList +lnk_reloc_list_from_coff_reloc_array(Arena *arena, COFF_MachineType machine, LNK_Chunk *chunk, LNK_SymbolArray symbol_array, COFF_Reloc *reloc_v, U64 reloc_count) +{ + LNK_RelocList reloc_list = {0}; + LNK_Reloc *reloc_arr = lnk_reloc_list_reserve(arena, &reloc_list, reloc_count); + LNK_Reloc *reloc_ptr = reloc_arr; + LNK_Reloc *reloc_opl = reloc_arr + reloc_count; + COFF_Reloc *coff_reloc_ptr = reloc_v; + for (; reloc_ptr < reloc_opl; reloc_ptr += 1, coff_reloc_ptr += 1) { + Assert(coff_reloc_ptr->isymbol < symbol_array.count); + reloc_ptr->chunk = chunk; + reloc_ptr->type = lnk_ext_reloc_type_from_coff(machine, coff_reloc_ptr->type); + reloc_ptr->apply_off = coff_reloc_ptr->apply_off; + reloc_ptr->symbol = symbol_array.v + coff_reloc_ptr->isymbol; + } + return reloc_list; +} + +internal LNK_Reloc ** +lnk_reloc_array_from_list(Arena *arena, LNK_RelocList list) +{ + LNK_Reloc **arr = push_array_no_zero(arena, LNK_Reloc *, list.count); + U64 count = 0; + for (LNK_Reloc *node = list.first; node != 0; node = node->next) { + Assert(count < list.count); + arr[count++] = node; + } + return arr; +} + 
+internal LNK_RelocType +lnk_ext_reloc_type_from_coff(COFF_MachineType machine, U32 type) +{ + LNK_RelocType result = LNK_Reloc_NULL; + switch (machine) { + case COFF_MachineType_UNKNOWN: break; + case COFF_MachineType_X64: { + switch (type) { + case COFF_RelocTypeX64_ABS: result = LNK_Reloc_NULL; break; + case COFF_RelocTypeX64_ADDR64: result = LNK_Reloc_ADDR_64; break; + case COFF_RelocTypeX64_ADDR32: result = LNK_Reloc_ADDR_32; break; + case COFF_RelocTypeX64_ADDR32NB: result = LNK_Reloc_VIRT_OFF_32; break; + case COFF_RelocTypeX64_REL32: result = LNK_Reloc_REL32; break; + case COFF_RelocTypeX64_REL32_1: result = LNK_Reloc_REL32_1; break; + case COFF_RelocTypeX64_REL32_2: result = LNK_Reloc_REL32_2; break; + case COFF_RelocTypeX64_REL32_3: result = LNK_Reloc_REL32_3; break; + case COFF_RelocTypeX64_REL32_4: result = LNK_Reloc_REL32_4; break; + case COFF_RelocTypeX64_REL32_5: result = LNK_Reloc_REL32_5; break; + case COFF_RelocTypeX64_SECTION: result = LNK_Reloc_SECT_IDX; break; + case COFF_RelocTypeX64_SECREL: result = LNK_Reloc_SECT_REL; break; + case COFF_RelocTypeX64_SECREL7: lnk_not_implemented("TODO: COFF_RelocTypeX64_SECREL7"); break; + case COFF_RelocTypeX64_TOKEN: lnk_not_implemented("TODO: COFF_RelocTypeX64_TOKEN"); break; + case COFF_RelocTypeX64_SREL32: lnk_not_implemented("TODO: COFF_RelocTypeX64_SREL32"); break; + case COFF_RelocTypeX64_PAIR: lnk_not_implemented("TODO: COFF_RelocTypeX64_PAIR"); break; + case COFF_RelocTypeX64_SSPAN32: lnk_not_implemented("TODO: COFF_RelocTypeX64_SSPAN32"); break; + default: lnk_invalid_path("unknown relocation type 0x%X", type); + } + } break; + default: lnk_not_implemented("TODO: define remap for coff reloc types"); break; + } + return result; +} + +internal U32 +lnk_ext_reloc_type_to_coff(COFF_MachineType machine, LNK_RelocType type) +{ + U32 result = 0; + switch (machine) { + case COFF_MachineType_X64: { + switch (type) { + case LNK_Reloc_NULL: result = COFF_RelocTypeX64_ABS; break; + case LNK_Reloc_ADDR_64: 
result = COFF_RelocTypeX64_ADDR64; break; + case LNK_Reloc_ADDR_32: result = COFF_RelocTypeX64_ADDR32; break; + case LNK_Reloc_VIRT_OFF_32: result = COFF_RelocTypeX64_ADDR32NB; break; + case LNK_Reloc_REL32: result = COFF_RelocTypeX64_REL32; break; + case LNK_Reloc_REL32_1: result = COFF_RelocTypeX64_REL32_1; break; + case LNK_Reloc_REL32_2: result = COFF_RelocTypeX64_REL32_2; break; + case LNK_Reloc_REL32_3: result = COFF_RelocTypeX64_REL32_3; break; + case LNK_Reloc_REL32_4: result = COFF_RelocTypeX64_REL32_4; break; + case LNK_Reloc_REL32_5: result = COFF_RelocTypeX64_REL32_5; break; + case LNK_Reloc_SECT_IDX: result = COFF_RelocTypeX64_SECTION; break; + case LNK_Reloc_SECT_REL: result = COFF_RelocTypeX64_SECREL; break; + default: InvalidPath; + } + } break; + default: lnk_not_implemented("TODO: support for machine 0x%X", machine); break; + } + return result; +} + + diff --git a/src/linker/lnk_reloc.h b/src/linker/lnk_reloc.h new file mode 100644 index 00000000..c814123a --- /dev/null +++ b/src/linker/lnk_reloc.h @@ -0,0 +1,56 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef enum +{ + LNK_Reloc_NULL, + LNK_Reloc_ADDR_16, + LNK_Reloc_ADDR_32, + LNK_Reloc_ADDR_64, + LNK_Reloc_CHUNK_SIZE_FILE_16, + LNK_Reloc_CHUNK_SIZE_FILE_32, + LNK_Reloc_CHUNK_SIZE_VIRT_32, + LNK_Reloc_FILE_ALIGN_32, + LNK_Reloc_FILE_OFF_15, + LNK_Reloc_FILE_OFF_32, + LNK_Reloc_FILE_OFF_64, + LNK_Reloc_REL32, + LNK_Reloc_REL32_1, + LNK_Reloc_REL32_2, + LNK_Reloc_REL32_3, + LNK_Reloc_REL32_4, + LNK_Reloc_REL32_5, + LNK_Reloc_SECT_REL, + LNK_Reloc_SECT_IDX, + LNK_Reloc_VIRT_ALIGN_32, + LNK_Reloc_VIRT_OFF_32, +} LNK_RelocType; + +typedef struct LNK_Reloc +{ + struct LNK_Reloc *next; + LNK_Chunk *chunk; + LNK_RelocType type; + U64 apply_off; + struct LNK_Symbol *symbol; +} LNK_Reloc; + +typedef struct LNK_RelocList +{ + U64 count; + LNK_Reloc *first; + LNK_Reloc *last; +} LNK_RelocList; + +internal LNK_Reloc 
* lnk_reloc_list_reserve(Arena *arena, LNK_RelocList *list, U64 count); +internal LNK_Reloc * lnk_reloc_list_push(Arena *arena, LNK_RelocList *list); +internal LNK_RelocList lnk_reloc_list_copy(Arena *arena, LNK_RelocList *list); +internal void lnk_reloc_list_concat_in_place(LNK_RelocList *list, LNK_RelocList *to_concat); +internal void lnk_reloc_list_concat_in_place_arr(LNK_RelocList *list, LNK_RelocList *arr, U64 count); +internal LNK_RelocList ** lnk_make_reloc_list_arr_arr(Arena *arena, U64 slot_count, U64 per_count); +internal LNK_Reloc ** lnk_reloc_array_from_list(Arena *arena, LNK_RelocList list); +internal LNK_RelocType lnk_ext_reloc_type_from_coff(COFF_MachineType machine, U32 type); +internal U32 lnk_ext_reloc_type_to_coff(COFF_MachineType machine, LNK_RelocType type); + diff --git a/src/linker/lnk_section_table.c b/src/linker/lnk_section_table.c new file mode 100644 index 00000000..275b792b --- /dev/null +++ b/src/linker/lnk_section_table.c @@ -0,0 +1,855 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal LNK_SectionNode * +lnk_section_list_remove(LNK_SectionList *list, String8 name) +{ + LNK_SectionNode *section = lnk_section_list_search_node(list, name); + + if (list->count > 0) { + if (list->first == section) { + list->first = list->first->next; + list->count -= 1; + + if (list->last == section) { + list->last = NULL; + } + } else { + for (LNK_SectionNode *curr = list->first, *prev = NULL; curr != NULL; prev = curr, curr = curr->next) { + if (curr == section) { + prev->next = curr->next; + list->count -= 1; + + if (list->last == curr) { + list->last = prev; + } + + break; + } + } + } + } + return section; +} + +internal LNK_SectionNode * +lnk_section_list_search_node(LNK_SectionList *list, String8 name) +{ + LNK_SectionNode *node; + for (node = list->first; node != 0; node = node->next) { + if (str8_match(node->data.name, name, 0)) { + break; + } + } + return node; +} + 
+internal LNK_Section * +lnk_section_list_search(LNK_SectionList *list, String8 name) +{ + LNK_SectionNode *node = lnk_section_list_search_node(list, name); + return node != NULL ? &node->data : NULL; +} + +internal LNK_SectionArray +lnk_section_array_from_list(Arena *arena, LNK_SectionList list) +{ + LNK_SectionArray result; + result.count = 0; + result.v = push_array_no_zero(arena, LNK_Section, list.count); + for (LNK_SectionNode *node = list.first; node != 0; node = node->next) { + result.v[result.count] = node->data; + result.count += 1; + } + return result; +} + +internal LNK_SectionPtrArray +lnk_section_ptr_array_from_list(Arena *arena, LNK_SectionList list) +{ + LNK_SectionPtrArray result; + result.count = 0; + result.v = push_array_no_zero(arena, LNK_Section *, list.count); + for (LNK_SectionNode *node = list.first; node != 0; node = node->next) { + result.v[result.count] = &node->data; + result.count += 1; + } + return result; +} + +internal String8 +lnk_make_section_sort_index(Arena *arena, String8 name, COFF_SectionFlags flags, U64 section_index) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + // pack sections with run-time data closer + String8List sort_index_list = {0}; + if (flags & COFF_SectionFlag_MEM_DISCARDABLE) { + str8_list_pushf(scratch.arena, &sort_index_list, "b"); + } else { + str8_list_pushf(scratch.arena, &sort_index_list, "a"); + } + + if (str8_match(name, str8_lit(".null"), 0)) { + // null section always first + str8_list_pushf(scratch.arena, &sort_index_list, "a"); + } else if (str8_match(name, str8_lit(".rsrc"), 0)) { + // section with resource data must be last because during runtime windows might append pages + str8_list_pushf(scratch.arena, &sort_index_list, "c"); + } else { + str8_list_pushf(scratch.arena, &sort_index_list, "b"); + } + + // sort sections based on the contents + if (flags & COFF_SectionFlag_CNT_CODE) { + str8_list_pushf(scratch.arena, &sort_index_list, "a"); + if (str8_match(name, 
str8_lit(".text"), 0)) { + str8_list_pushf(scratch.arena, &sort_index_list, "a"); + } else { + str8_list_pushf(scratch.arena, &sort_index_list, "b"); + } + } else if (flags & COFF_SectionFlag_CNT_INITIALIZED_DATA) { + str8_list_pushf(scratch.arena, &sort_index_list, "b"); + if (str8_match(name, str8_lit(".data"), 0)) { + str8_list_pushf(scratch.arena, &sort_index_list, "a"); + } else if (str8_match(name, str8_lit(".rdata"), 0)) { + str8_list_pushf(scratch.arena, &sort_index_list, "b"); + } else if (str8_match(name, str8_lit(".tls"), 0)) { + str8_list_pushf(scratch.arena, &sort_index_list, "c"); + } else { + str8_list_pushf(scratch.arena, &sort_index_list, "d"); + } + } else if (flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA) { + str8_list_pushf(scratch.arena, &sort_index_list, "c"); + } else { + str8_list_pushf(scratch.arena, &sort_index_list, "d"); + } + + // sort sections based on read/write access so final section layout looks cleaner + if (flags & COFF_SectionFlag_MEM_READ && ~flags & COFF_SectionFlag_MEM_WRITE) { + str8_list_pushf(scratch.arena, &sort_index_list, "a"); + } else { + str8_list_pushf(scratch.arena, &sort_index_list, "b"); + } + + String8 order_index = str8_from_bits_u32(scratch.arena, safe_cast_u32(section_index)); + str8_list_push(scratch.arena, &sort_index_list, order_index); + + String8 result = str8_list_join(arena, &sort_index_list, 0); + scratch_end(scratch); + ProfEnd(); + return result; +} + +internal void +lnk_section_associate_chunks(LNK_Section *sect, LNK_Chunk *head, LNK_Chunk *associate) +{ + lnk_chunk_associate(sect->arena, head, associate); +} + +internal LNK_Chunk * +lnk_section_push_chunk_raw(LNK_Section *sect, LNK_Chunk *parent, void *raw_ptr, U64 raw_size, String8 sort_index) +{ + return lnk_chunk_push_leaf(sect->arena, sect->cman, parent, sort_index, raw_ptr, raw_size); +} + +internal LNK_Chunk * +lnk_section_push_chunk_data(LNK_Section *sect, LNK_Chunk *parent, String8 data, String8 sort_index) +{ + return 
lnk_section_push_chunk_raw(sect, parent, data.str, data.size, sort_index); +} + +internal LNK_Chunk * +lnk_section_push_chunk_u32(LNK_Section *sect, LNK_Chunk *parent, U32 value, String8 sort_index) +{ + U32 *ptr = push_array_no_zero(sect->arena, U32, 1); + *ptr = value; + return lnk_section_push_chunk_raw(sect, parent, ptr, sizeof(*ptr), sort_index); +} + +internal LNK_Chunk * +lnk_section_push_chunk_u64(LNK_Section *sect, LNK_Chunk *parent, U32 value, String8 sort_index) +{ + U64 *ptr = push_array_no_zero(sect->arena, U64, 1); + *ptr = value; + return lnk_section_push_chunk_raw(sect, parent, ptr, sizeof(*ptr), sort_index); +} + +internal LNK_Chunk * +lnk_section_push_chunk_bss(LNK_Section *sect, LNK_Chunk *parent, U64 size, String8 sort_index) +{ + return lnk_section_push_chunk_raw(sect, parent, 0, size, sort_index); +} + +internal LNK_Chunk * +lnk_section_push_chunk_list(LNK_Section *sect, LNK_Chunk *parent, String8 sort_index) +{ + return lnk_chunk_push_list(sect->arena, sect->cman, parent, sort_index); +} + +internal LNK_Reloc * +lnk_section_push_reloc(LNK_Section *sect, LNK_Chunk *chunk, LNK_RelocType type, U64 apply_off, LNK_Symbol *symbol) +{ + Assert(symbol); + LNK_Reloc *reloc = lnk_reloc_list_push(sect->arena, &sect->reloc_list); + reloc->chunk = chunk; + reloc->type = type; + reloc->apply_off = apply_off; + reloc->symbol = symbol; + return reloc; +} + +internal LNK_Reloc * +lnk_section_push_reloc_undefined(LNK_Section *sect, LNK_Chunk *chunk, LNK_RelocType type, U64 apply_off, String8 undefined_symbol_name, LNK_SymbolScopeFlags scope_flags) +{ + LNK_Symbol *symbol = lnk_make_undefined_symbol(sect->arena, undefined_symbol_name, scope_flags); + LNK_Reloc *reloc = lnk_section_push_reloc(sect, chunk, type, apply_off, symbol); + return reloc; +} + +internal void +lnk_section_merge(LNK_Section *dst, LNK_Section *src) +{ + ProfBeginFunction(); + + // set merge info + src->is_merged = 1; + src->merge_sect_id = dst->id; + src->id_map = push_array_no_zero(src->arena, 
U64, src->cman->total_chunk_count);
+
+  // hang the source root off a fresh wrapper list under the destination root;
+  // the wrapper gives it a unique sort index, otherwise sort indices might
+  // conflict once the sections are merged
+  LNK_Chunk *src_root_wrapper = lnk_section_push_chunk_list(dst, dst->cman->root, str8(0,0));
+
+  // merge roots
+  lnk_merge_chunks(dst->arena, dst->cman, src_root_wrapper, src->cman->root, src->id_map, src->cman->total_chunk_count);
+
+  // append source relocations to the destination list
+  lnk_reloc_list_concat_in_place(&dst->reloc_list, &src->reloc_list);
+
+  ProfEnd();
+}
+
+// Returns the byte used to fill alignment gaps in code sections for `machine`.
+// X64 and X86 use 0xCC; any other machine goes through lnk_not_implemented
+// and yields 0.
+internal U8
+lnk_code_align_byte_from_machine(COFF_MachineType machine)
+{
+  U8 fill = 0;
+  if (machine == COFF_MachineType_X64 || machine == COFF_MachineType_X86) {
+    fill = 0xCC;
+  } else {
+    lnk_not_implemented("TODO: set align value for machine %S", coff_string_from_machine_type(machine));
+  }
+  return fill;
+}
+
+// Builds the chunk layout of a section that is still loose and takes part in
+// layout, then clears is_loose so the work is not repeated.
+internal void
+lnk_section_build_data(LNK_Section *sect, COFF_MachineType machine)
+{
+  if (!sect->is_loose || !sect->has_layout) {
+    return;
+  }
+
+  // align fill: machine-specific byte for code, zero for everything else
+  B32 is_code = !!(sect->flags & COFF_SectionFlag_CNT_CODE);
+  U8 align_byte = is_code ? lnk_code_align_byte_from_machine(machine) : 0;
+
+  sect->layout = lnk_build_chunk_layout(sect->arena, sect->cman, sect->flags, align_byte);
+  sect->is_loose = 0;
+}
+
+// Allocates a section table inside its own arena and stores the base virtual
+// offset and the two alignment values on it.
+internal LNK_SectionTable *
+lnk_section_table_alloc(U64 section_virt_off, U64 sect_align, U64 file_align)
+{
+  ProfBeginFunction();
+  Arena *table_arena = arena_alloc();
+  LNK_SectionTable *table = push_array(table_arena, LNK_SectionTable, 1);
+  table->arena            = table_arena;
+  table->section_virt_off = section_virt_off;
+  table->sect_align       = sect_align;
+  table->file_align       = file_align;
+  ProfEnd();
+  return table;
+}
+
+// Releases the table's arena and nulls out the caller's pointer.
+internal void
+lnk_section_table_release(LNK_SectionTable **st_ptr)
+{
+  ProfBeginFunction();
+  LNK_SectionTable *table = *st_ptr;
+  arena_release(table->arena);
+  *st_ptr = NULL;
+  ProfEnd();
+}
+
+internal LNK_Section *
+lnk_section_table_push(LNK_SectionTable *st, String8
name, COFF_SectionFlags flags)
+{
+  ProfBeginFunction();
+  LNK_SectionList *sect_list = &st->list;
+
+  LNK_SectionNode *sect_node = push_array(st->arena, LNK_SectionNode, 1);
+  String8 sort_index = lnk_make_section_sort_index(st->arena, name, flags, st->id_max);
+
+  // keep the list ordered by sort index: insert the new node in front of the
+  // first existing section whose sort index compares greater
+  B32 found = 0;
+  for (LNK_SectionNode *curr = sect_list->first, *prev = NULL; curr != NULL; prev = curr, curr = curr->next) {
+    LNK_Section *sect = &curr->data;
+    int cmp = str8_compar_case_sensetive(&sort_index, &sect->sort_index);
+    if (cmp < 0) {
+      if (prev == NULL) {
+        SLLQueuePushFront(sect_list->first, sect_list->last, sect_node);
+      } else {
+        prev->next = sect_node;
+        sect_node->next = curr;
+      }
+      found = 1;
+      break;
+    }
+  }
+
+  // nothing sorts after the new section, so it goes to the end
+  if (!found) {
+    SLLQueuePush(sect_list->first, sect_list->last, sect_node);
+  }
+  sect_list->count += 1;
+
+  U64 sect_id = st->id_max;
+  st->id_max += 1;
+
+  // every section owns its arena and chunk manager
+  LNK_Section *sect = &sect_node->data;
+  sect->arena = arena_alloc();
+  sect->id = sect_id;
+  sect->name = push_str8_copy(sect->arena, name);
+  sect->sort_index = sort_index;
+  sect->flags = flags;
+  sect->cman = lnk_chunk_manager_alloc(sect->arena, sect_id, st->file_align);
+  sect->root = sect->cman->root;
+  sect->nosort_chunk = lnk_chunk_push_list(sect->arena, sect->cman, sect->root, str8(0,0));
+  sect->nosort_chunk->sort_chunk = 0;
+  sect->emit_header = 1;
+  sect->has_layout = 1;
+  sect->is_loose = 1;
+
+  lnk_chunk_set_debugf(sect->arena, sect->root, "root chunk for %S", name);
+
+  ProfEnd();
+  return sect;
+}
+
+// Pushes the table's null section node to the front of the output list and
+// returns the section stored in it. st->null_sect must already be set.
+internal LNK_Section *
+lnk_section_table_push_null(LNK_SectionTable *st)
+{
+  LNK_SectionList *list = &st->list;
+  SLLQueuePushFront(list->first, list->last, st->null_sect);
+  list->count += 1;
+  return &st->null_sect->data;
+}
+
+// Chunk visitor: `ud` points at a B32 "no data" flag. The first leaf chunk that
+// is not discarded and has a non-zero size clears the flag and returns 1;
+// every other chunk returns 0.
+LNK_CHUNK_VISITOR_SIG(lnk_chunk_has_leaf)
+{
+  B32 stop = 0;
+  if (chunk->type == LNK_Chunk_Leaf) {
+    B32 has_data = !lnk_chunk_is_discarded(chunk) && chunk->u.leaf.size > 0;
+    if (has_data) {
+      B32 *no_data = (B32*)ud;
+      *no_data = 0;
+      stop = 1;
+    }
+  }
+  return stop;
+}
+
+// Chunk visitor: flags the visited chunk as discarded and always returns 0.
+LNK_CHUNK_VISITOR_SIG(lnk_chunk_mark_discarded)
+{
+  chunk->is_discarded = 1;
+  B32 stop = 0;
+  return stop;
+}
+
+// Takes the section called `name` out of the output list, removes its symbol
+// from the internal scope of `symtab`, marks all of its chunks as discarded
+// and parks the node on the empties list.
+internal void
+lnk_section_table_remove(LNK_SectionTable *st, LNK_SymbolTable *symtab, String8 name)
+{
+  ProfBeginFunction();
+
+  // remove node from list
+  LNK_SectionNode *sect_node = lnk_section_list_remove(&st->list, name);
+
+  // NOTE(review): lnk_section_list_remove is not visible here; the guard
+  // assumes it returns NULL when no section has this name.
+  if (sect_node != NULL) {
+    LNK_Section *sect = &sect_node->data;
+
+    // remove symbol for section root chunk
+    lnk_symbol_table_remove(symtab, LNK_SymbolScopeIndex_Internal, sect->symbol_name);
+
+    // mark chunks as discarded
+    lnk_visit_chunks(sect->id, sect->root, lnk_chunk_mark_discarded, NULL);
+
+    // push to empties
+    SLLQueuePush(st->empties_list.first, st->empties_list.last, sect_node);
+    st->empties_list.count += 1;
+  }
+
+  ProfEnd();
+}
+
+// Looks a section up by name in the output list only.
+internal LNK_Section *
+lnk_section_table_search(LNK_SectionTable *st, String8 name)
+{
+  return lnk_section_list_search(&st->list, name);
+}
+
+// Looks a section up by id in the output list only; returns NULL on a miss.
+internal LNK_Section *
+lnk_section_table_search_id(LNK_SectionTable *st, U64 id)
+{
+  for (LNK_SectionNode *node = st->list.first; node != NULL; node = node->next) {
+    if (node->data.id == id) {
+      return &node->data;
+    }
+  }
+  return NULL;
+}
+
+// Applies the merge directives in `merge_list` to the table in order.
+internal void
+lnk_section_table_merge(LNK_SectionTable *st, LNK_MergeDirectiveList merge_list)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+  // section id -> section it was merged into (or itself when only renamed)
+  LNK_Section **src_dst = push_array(scratch.arena, LNK_Section *, st->id_max);
+  for (LNK_MergeDirectiveNode *merge_node = merge_list.first; merge_node != NULL; merge_node = merge_node->next) {
+    LNK_MergeDirective *merge = &merge_node->data;
+
+    // are we trying to merge section that was already merged?
+    LNK_Section *merge_sect = lnk_section_list_search(&st->merge_list, merge->src);
+    if (merge_sect) {
+      // warn only when the earlier merge went to a different destination
+      LNK_Section *dst = src_dst[merge_sect->id];
+      B32 is_ambiguous_merge = !str8_match(dst->name, merge->dst, 0);
+      if (is_ambiguous_merge) {
+        lnk_error(LNK_Warning_AmbiguousMerge, "Detected ambiguous section merge:");
+        lnk_supplement_error("%S => %S (Merged)", merge_sect->name, dst->name);
+        lnk_supplement_error("%S => %S", merge_sect->name, merge->dst);
+      }
+      continue;
+    }
+
+    // find source section
+    LNK_Section *src = lnk_section_table_search(st, merge->src);
+    if (src == NULL) {
+      lnk_error(LNK_Warning_IllData, "Can't find section \"%S\" to merge with \"%S\"", merge->src, merge->dst);
+      // TODO: supplement obj path if applicable
+      continue;
+    }
+
+    // handle case where destination section doesn't exist:
+    // the source is renamed to the destination name and stays in the list
+    LNK_Section *dst = lnk_section_table_search(st, merge->dst);
+    if (dst == NULL) {
+      src->name = push_str8_copy(src->arena, merge->dst);
+      src_dst[src->id] = src;
+      continue;
+    }
+
+    // a directive that names the same section on both sides is a no-op;
+    // merging a section into itself would mark it as merged into its own id
+    if (src == dst) {
+      continue;
+    }
+
+    // update map
+    src_dst[src->id] = dst;
+
+    // merge section with destination
+    lnk_section_merge(dst, src);
+
+    // remove from output section list
+    LNK_SectionNode *src_node = lnk_section_list_remove(&st->list, src->name);
+
+    // push section to merged list
+    SLLQueuePush(st->merge_list.first, st->merge_list.last, src_node);
+    st->merge_list.count += 1;
+  }
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Removes every output section in which the chunk walk finds no live leaf
+// with a non-zero size.
+internal void
+lnk_section_table_remove_empties(LNK_SectionTable *st, LNK_SymbolTable *symtab)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  // collect names first; removal edits st->list, so it is done after the walk
+  String8List name_list = {0};
+  for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; sect_node = sect_node->next) {
+    LNK_Section *sect = &sect_node->data;
+
+    B32 no_data = 1;
+    lnk_visit_chunks(sect->id, sect->root, lnk_chunk_has_leaf, (void*)&no_data);
+
+    if (no_data) {
+      String8 name = push_str8_copy(scratch.arena, sect->name);
+      str8_list_push(scratch.arena, &name_list, name);
+    }
+  }
+
+  for (String8Node
*name = name_list.first; name != NULL; name = name->next) {
+    lnk_section_table_remove(st, symtab, name->string);
+  }
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Copies every output-list section that has both emit_header and has_layout
+// set into an array allocated from `arena`. The array is reserved for the
+// whole list and the unfilled tail is popped off the arena again.
+internal LNK_SectionArray
+lnk_section_table_get_output_sections(Arena *arena, LNK_SectionTable *st)
+{
+  LNK_SectionArray out = {0};
+  out.count = 0;
+  out.v = push_array(arena, LNK_Section, st->list.count);
+
+  for (LNK_SectionNode *n = st->list.first; n != 0; n = n->next) {
+    B32 is_output = n->data.emit_header && n->data.has_layout;
+    if (!is_output) {
+      continue;
+    }
+    Assert(out.count < st->list.count);
+    out.v[out.count] = n->data;
+    out.count += 1;
+  }
+
+  // hand back the entries that were reserved but never filled
+  U64 unused_entry_count = st->list.count - out.count;
+  arena_pop(arena, unused_entry_count * sizeof(out.v[0]));
+
+  return out;
+}
+
+// Thread pool task: builds data for the sections in this task's index range.
+internal
+THREAD_POOL_TASK_FUNC(lnk_section_data_builder)
+{
+  LNK_SectionDataBuilder *builder = raw_task;
+  Rng1U64 span = builder->range_arr[task_id];
+  for (U64 i = span.min; i < span.max; ++i) {
+    lnk_section_build_data(builder->sect_arr[i], builder->machine);
+  }
+}
+
+// Splits the output sections into one range per worker and builds their data
+// on the thread pool.
+internal void
+lnk_section_table_build_data(TP_Context *tp, LNK_SectionTable *st, COFF_MachineType machine)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  LNK_SectionPtrArray sections = lnk_section_ptr_array_from_list(scratch.arena, st->list);
+
+  LNK_SectionDataBuilder builder = {0};
+  builder.machine   = machine;
+  builder.range_arr = tp_divide_work(scratch.arena, sections.count, tp->worker_count);
+  builder.sect_arr  = sections.v;
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_section_data_builder, &builder);
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Walks the output list and assigns each laid-out section its virtual offset,
+// starting at st->section_virt_off.
+internal void
+lnk_section_table_assign_virtual_offsets(LNK_SectionTable *st)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+  LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st);
+  U64 cursor = st->section_virt_off;
+  Assert(cursor >= 0x1000);
+  for (LNK_SectionNode *sect_node =
st->list.first; sect_node != NULL; sect_node = sect_node->next) {
+    if (sect_node == st->null_sect) continue;
+    LNK_Section *sect = &sect_node->data;
+    if (!sect->has_layout) continue;
+    sect->virt_off = cursor;
+    // advance by the virtual size of the root chunk, then round up to the
+    // section alignment
+    U64 sect_size = lnk_virt_size_from_chunk_ref(sect_id_map, sect->root->ref);
+    cursor += sect_size;
+    cursor = AlignPow2(cursor, st->sect_align);
+  }
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Assigns file offsets to the output sections in list order, starting at 0.
+// Sections flagged as uninitialized data and sections without a layout are
+// skipped; each remaining section advances the cursor by the file size of its
+// root chunk.
+internal void
+lnk_section_table_assign_file_offsets(LNK_SectionTable *st)
+{
+  ProfBeginFunction();
+  U64 cursor = 0;
+  for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; sect_node = sect_node->next) {
+    LNK_Section *sect = &sect_node->data;
+    if (sect->flags & COFF_SectionFlag_CNT_UNINITIALIZED_DATA) {
+      continue;
+    }
+    if (!sect->has_layout) continue;
+    sect->file_off = cursor;
+    U64 root_size = sect->layout.chunk_file_size_array[sect->root->ref.chunk_id];
+    cursor += root_size;
+  }
+  ProfEnd();
+}
+
+// Numbers the sections that emit a header with consecutive indices, starting
+// at 0, in output list order.
+internal void
+lnk_section_table_assign_indices(LNK_SectionTable *st)
+{
+  ProfBeginFunction();
+  U64 isect = 0;
+  for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; sect_node = sect_node->next) {
+    LNK_Section *sect = &sect_node->data;
+    if (sect->emit_header) {
+      sect->isect = isect++;
+    }
+  }
+  ProfEnd();
+}
+
+// Joins the layout data of every section in the output list, in list order,
+// into one string allocated from `arena`.
+internal String8
+lnk_section_table_serialize(Arena *arena, LNK_SectionTable *st)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena, 1);
+  String8List image_list = {0};
+  for (LNK_SectionNode *sect_node = st->list.first; sect_node != NULL; sect_node = sect_node->next) {
+    LNK_Section *sect = &sect_node->data;
+    str8_list_push(scratch.arena, &image_list, sect->layout.data);
+  }
+  String8 result = str8_list_join(arena, &image_list, NULL);
+  scratch_end(scratch);
+  ProfEnd();
+  return result;
+}
+
+// Builds an array with st->id_max slots, indexed by section id, that holds a
+// chunk id map for every section in the output list.
+internal LNK_ChunkPtr **
+lnk_chunk_id_map_from_section_table(Arena *arena, LNK_SectionTable *st)
+{
+  ProfBeginFunction();
+  LNK_ChunkPtr **chunk_id_map = push_array(arena, LNK_ChunkPtr *, st->id_max);
+  for (LNK_SectionNode *node
= st->list.first; node != 0; node = node->next) {
+    LNK_Section *sect = &node->data;
+    chunk_id_map[sect->id] = lnk_make_chunk_id_map(arena, sect->cman);
+  }
+  // when the list does not start with section 0, slot 0 gets a one-entry map
+  // that holds the null chunk; an empty list is treated the same way instead
+  // of dereferencing a null first node, and slot 0 is only written when the
+  // array actually has it
+  B32 first_is_not_sect0 = (st->list.first == NULL || st->list.first->data.id != 0);
+  if (first_is_not_sect0 && st->id_max > 0) {
+    chunk_id_map[0] = push_array(arena, LNK_ChunkPtr, 1);
+    chunk_id_map[0][0] = g_null_chunk_ptr;
+  }
+  ProfEnd();
+  return chunk_id_map;
+}
+
+// Builds an array with st->id_max slots, indexed by section id, that points at
+// every section in the output, merged and empties lists. When no section has
+// id 0, slot 0 gets a zeroed placeholder section with one-element layout arrays.
+internal LNK_Section **
+lnk_sect_id_map_from_section_table(Arena *arena, LNK_SectionTable *st)
+{
+  ProfBeginFunction();
+  LNK_Section **map = push_array(arena, LNK_Section *, st->id_max);
+  LNK_SectionList *list_arr[] = { &st->list, &st->merge_list, &st->empties_list };
+  for (U64 list_idx = 0; list_idx < ArrayCount(list_arr); ++list_idx) {
+    for (LNK_SectionNode *sect_node = list_arr[list_idx]->first; sect_node != NULL; sect_node = sect_node->next) {
+      LNK_Section *sect = &sect_node->data;
+      Assert(sect->id < st->id_max);
+      // a section id may show up in only one of the three lists
+      Assert(map[sect->id] == NULL);
+      map[sect->id] = sect;
+    }
+  }
+  if (map[0] == NULL) {
+    LNK_Section *sect = push_array(arena, LNK_Section, 1);
+    sect->layout.chunk_off_array       = push_array(arena, U64, 1);
+    sect->layout.chunk_file_size_array = push_array(arena, U64, 1);
+    sect->layout.chunk_virt_size_array = push_array(arena, U64, 1);
+    map[0] = sect;
+  }
+  ProfEnd();
+  return map;
+}
+
+// Redirects a chunk ref that points into a merged section to the section it
+// was merged into, using the source section's id_map for the chunk id.
+internal LNK_ChunkRef
+lnk_get_final_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref)
+{
+  LNK_ChunkRef final_chunk_ref = chunk_ref;
+  if (sect_id_map[chunk_ref.sect_id]->is_merged) {
+    final_chunk_ref.sect_id = sect_id_map[chunk_ref.sect_id]->merge_sect_id;
+    final_chunk_ref.chunk_id = sect_id_map[chunk_ref.sect_id]->id_map[chunk_ref.chunk_id];
+    // we don't support sections that were merged more than once.
+ Assert(!sect_id_map[final_chunk_ref.sect_id]->is_merged); + } + return final_chunk_ref; +} + +internal LNK_Section * +lnk_sect_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef input_chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, input_chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + return sect; +} + +internal LNK_Chunk * +lnk_chunk_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkPtr **chunk_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Chunk *chunk = chunk_id_map[final_chunk_ref.sect_id][final_chunk_ref.chunk_id]; + return chunk; +} + +internal U64 +lnk_isect_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_Section *sect = lnk_sect_from_chunk_ref(sect_id_map, chunk_ref); + U64 isect = sect->isect; + return isect; +} + +internal U64 +lnk_off_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + U64 off = sect->layout.chunk_off_array[final_chunk_ref.chunk_id]; + return off; +} + +internal U64 +lnk_virt_off_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + U64 off = sect->layout.chunk_off_array[final_chunk_ref.chunk_id]; + U64 virt_off = off + sect->virt_off; + return virt_off; +} + +internal U64 +lnk_file_off_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + U64 off = sect->layout.chunk_off_array[final_chunk_ref.chunk_id]; + U64 file_off = off + sect->file_off; + return file_off; +} + +internal U64 
+lnk_virt_size_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + U64 virt_size = sect->layout.chunk_virt_size_array[final_chunk_ref.chunk_id]; + return virt_size; +} + +internal U64 +lnk_file_size_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + U64 file_size = sect->layout.chunk_file_size_array[final_chunk_ref.chunk_id]; + return file_size; +} + +internal String8 +lnk_data_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + U64 chunk_off = lnk_off_from_chunk_ref(sect_id_map, chunk_ref); + U64 chunk_size = lnk_file_size_from_chunk_ref(sect_id_map, chunk_ref); + String8 chunk_data = str8_substr(sect->layout.data, r1u64(chunk_off, chunk_off + chunk_size)); + return chunk_data; +} + +internal String8 +lnk_data_from_chunk_ref_no_pad(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + LNK_ChunkRef final_chunk_ref = lnk_get_final_chunk_ref(sect_id_map, chunk_ref); + LNK_Section *sect = sect_id_map[final_chunk_ref.sect_id]; + U64 chunk_off = lnk_off_from_chunk_ref(sect_id_map, chunk_ref); + U64 chunk_size = lnk_virt_size_from_chunk_ref(sect_id_map, chunk_ref); + String8 chunk_data = str8_substr(sect->layout.data, r1u64(chunk_off, chunk_off + chunk_size)); + return chunk_data; +} + +internal ISectOff +lnk_sc_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref) +{ + ISectOff sc = {0}; + sc.isect = lnk_isect_from_chunk_ref(sect_id_map, chunk_ref); + sc.off = lnk_off_from_chunk_ref(sect_id_map, chunk_ref); + return sc; +} + +internal U64 +lnk_virt_off_from_reloc(LNK_Section 
**sect_id_map, LNK_Reloc *reloc) +{ + U64 virt_off = lnk_virt_off_from_chunk_ref(sect_id_map, reloc->chunk->ref); + virt_off += reloc->apply_off; + return virt_off; +} + +internal U64 +lnk_isect_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol) +{ + Assert(LNK_Symbol_IsDefined(symbol->type)); + LNK_ChunkRef symbol_chunk_ref = symbol->u.defined.u.chunk->ref; + U64 symbol_isect = lnk_isect_from_chunk_ref(sect_id_map, symbol_chunk_ref); + return symbol_isect; +} + +internal U64 +lnk_sect_off_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol) +{ + Assert(LNK_Symbol_IsDefined(symbol->type)); + LNK_ChunkRef symbol_chunk_ref = symbol->u.defined.u.chunk->ref; + U64 chunk_off = lnk_off_from_chunk_ref(sect_id_map, symbol_chunk_ref); + U64 symbol_off = chunk_off + symbol->u.defined.u.chunk_offset; + return symbol_off; +} + +internal U64 +lnk_virt_off_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol) +{ + Assert(LNK_Symbol_IsDefined(symbol->type)); + LNK_ChunkRef symbol_chunk_ref = symbol->u.defined.u.chunk->ref; + U64 chunk_voff = lnk_virt_off_from_chunk_ref(sect_id_map, symbol_chunk_ref); + U64 symbol_voff = chunk_voff + symbol->u.defined.u.chunk_offset; + return symbol_voff; +} + +internal U64 +lnk_file_off_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol) +{ + Assert(LNK_Symbol_IsDefined(symbol->type)); + LNK_ChunkRef symbol_chunk_ref = symbol->u.defined.u.chunk->ref; + U64 chunk_foff = lnk_file_off_from_chunk_ref(sect_id_map, symbol_chunk_ref); + U64 symbol_foff = chunk_foff + symbol->u.defined.u.chunk_offset; + return symbol_foff; +} + +internal U64 +lnk_virt_size_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol) +{ + Assert(LNK_Symbol_IsDefined(symbol->type)); + U64 symbol_chunk_virt_size = lnk_virt_size_from_chunk_ref(sect_id_map, symbol->u.defined.u.chunk->ref); + return symbol_chunk_virt_size; +} + +internal U64 +lnk_file_size_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol) +{ + 
Assert(LNK_Symbol_IsDefined(symbol->type)); + U64 symbol_chunk_file_size = lnk_file_size_from_chunk_ref(sect_id_map, symbol->u.defined.u.chunk->ref); + return symbol_chunk_file_size; +} + +#if LNK_DEBUG_CHUNKS +internal void +lnk_dump_chunks(LNK_SectionTable *st) +{ + Temp scratch = scratch_begin(0, 0); + LNK_ChunkPtr **chunk_id_map = lnk_chunk_id_map_from_section_table(scratch.arena, st); + LNK_Section **sect_id_map = lnk_sect_id_map_from_section_table(scratch.arena, st); + for (U64 sect_id = 0; sect_id < st->id_max; ++sect_id) { + LNK_Section *sect = sect_id_map[sect_id]; + if (!sect) continue; + if (sect->is_merged) continue; + if (str8_match(sect->name, str8_lit(".text"), 0)) { + for (U64 chunk_id = 0; chunk_id < sect->cman->total_chunk_count; ++chunk_id) { + LNK_ChunkRef chunk_ref = { sect_id, chunk_id }; + LNK_Chunk *chunk = lnk_chunk_from_chunk_ref(sect_id_map, chunk_id_map, chunk_ref); + U64 chunk_foff = sect->file_off + sect->layout.chunk_off_array[chunk_id]; + printf("%llu {%04llX,%04llX} 0x%08llX %.*s\n", chunk_foff, sect_id, chunk_id, chunk_foff, str8_varg(chunk->debug)); + } + } + } + scratch_end(scratch); +} +#endif + diff --git a/src/linker/lnk_section_table.h b/src/linker/lnk_section_table.h new file mode 100644 index 00000000..37fd1405 --- /dev/null +++ b/src/linker/lnk_section_table.h @@ -0,0 +1,151 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef struct LNK_Section +{ + Arena *arena; + U64 id; + String8 name; + String8 symbol_name; + COFF_SectionFlags flags; + String8 sort_index; + + LNK_ChunkManager *cman; + LNK_Chunk *root; + + // overwhelming number of chunks don't have sort index and grouping + // them speeds up sort step + LNK_Chunk *nosort_chunk; + + LNK_RelocList reloc_list; + + B32 emit_header; // TODO: this is a hack to make reloc serializer work in resource converter + B32 has_layout; + B32 is_loose; + + B32 is_merged; + U64 merge_sect_id; + 
U64 *id_map; + + U64 isect; + U64 virt_off; + U64 file_off; + LNK_ChunkLayout layout; +} LNK_Section; + +typedef struct LNK_SectionNode +{ + struct LNK_SectionNode *next; + LNK_Section data; +} LNK_SectionNode; + +typedef struct LNK_SectionList +{ + U64 count; + LNK_SectionNode *first; + LNK_SectionNode *last; +} LNK_SectionList; + +typedef struct LNK_SectionArray +{ + U64 count; + LNK_Section *v; +} LNK_SectionArray; + +typedef struct LNK_SectionPtrArray +{ + U64 count; + LNK_Section **v; +} LNK_SectionPtrArray; + +typedef struct LNK_SectionTable +{ + Arena *arena; + U64 section_virt_off; + U64 sect_align; + U64 file_align; + U64 id_max; + LNK_SectionList list; + LNK_SectionList merge_list; + LNK_SectionList empties_list; + LNK_SectionNode *null_sect; +} LNK_SectionTable; + +//////////////////////////////// + +typedef struct +{ + COFF_MachineType machine; + Rng1U64 *range_arr; + LNK_Section **sect_arr; +} LNK_SectionDataBuilder; + +//////////////////////////////// + +internal LNK_SectionNode * lnk_section_list_remove(LNK_SectionList *list, String8 name); +internal LNK_SectionNode * lnk_section_list_search_node(LNK_SectionList *list, String8 name); +internal LNK_Section * lnk_section_list_search(LNK_SectionList *list, String8 name); + +internal LNK_SectionArray lnk_section_array_from_list(Arena *arena, LNK_SectionList list); +internal LNK_SectionPtrArray lnk_section_ptr_array_from_list(Arena *arena, LNK_SectionList list); + +internal void lnk_section_associate_chunks(LNK_Section *sect, LNK_Chunk *head, LNK_Chunk *associate); + +internal LNK_Reloc * lnk_section_push_reloc(LNK_Section *sect, LNK_Chunk *chunk, LNK_RelocType type, U64 apply_off, LNK_Symbol *symbol); +internal LNK_Reloc * lnk_section_push_reloc_undefined(LNK_Section *sect, LNK_Chunk *chunk, LNK_RelocType type, U64 apply_off, String8 undefined_symbol_name, LNK_SymbolScopeFlags scope_flags); + +internal void lnk_section_merge(LNK_Section *dst, LNK_Section *src); +internal void 
lnk_section_build_data(LNK_Section *sect, COFF_MachineType machine); + +internal String8 lnk_make_section_sort_index(Arena *arena, String8 name, COFF_SectionFlags flags, U64 section_index); + +internal LNK_Chunk * lnk_section_push_chunk_raw(LNK_Section *sect, LNK_Chunk *parent, void *data_ptr, U64 data_size, String8 sort_index); +internal LNK_Chunk * lnk_section_push_chunk_data(LNK_Section *sect, LNK_Chunk *parent, String8 data, String8 sort_index); +internal LNK_Chunk * lnk_section_push_chunk_u32(LNK_Section *sect, LNK_Chunk *parent, U32 value, String8 sort_index); +internal LNK_Chunk * lnk_section_push_chunk_u64(LNK_Section *sect, LNK_Chunk *parent, U32 value, String8 sort_index); +internal LNK_Chunk * lnk_section_push_chunk_bss(LNK_Section *sect, LNK_Chunk *parent, U64 size, String8 sort_index); +internal LNK_Chunk * lnk_section_push_chunk_list(LNK_Section *sect, LNK_Chunk *parent, String8 sort_index); + +internal LNK_SectionTable * lnk_section_table_alloc(U64 section_virt_off, U64 sect_align, U64 file_align); +internal void lnk_section_table_release(LNK_SectionTable **st_ptr); +internal LNK_Section * lnk_section_table_push(LNK_SectionTable *st, String8 name, COFF_SectionFlags flags); +internal LNK_Section * lnk_section_table_push_null(LNK_SectionTable *st); +internal void lnk_section_table_remove(LNK_SectionTable *st, LNK_SymbolTable *symtab, String8 name); +internal LNK_Section * lnk_section_table_search(LNK_SectionTable *st, String8 name); +internal LNK_Section * lnk_section_table_search_id(LNK_SectionTable *st, U64 id); +internal void lnk_section_table_merge(LNK_SectionTable *st, LNK_MergeDirectiveList merge_list); +internal void lnk_section_table_remove_empties(LNK_SectionTable *st, LNK_SymbolTable *symtab); +internal void lnk_section_table_build_data(TP_Context *tp, LNK_SectionTable *st, COFF_MachineType machine); +internal void lnk_section_table_assign_virtual_offsets(LNK_SectionTable *st); +internal void 
lnk_section_table_assign_file_offsets(LNK_SectionTable *st); +internal void lnk_section_table_assign_indices(LNK_SectionTable *st); +internal String8 lnk_section_table_serialize(Arena *arena, LNK_SectionTable *st); + +internal LNK_ChunkPtr ** lnk_chunk_id_map_from_section_table(Arena *arena, LNK_SectionTable *st); +internal LNK_Section ** lnk_sect_id_map_from_section_table(Arena *arena, LNK_SectionTable *st); +internal LNK_ChunkRef lnk_get_final_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal LNK_Section * lnk_sect_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal LNK_Chunk * lnk_chunk_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkPtr **chunk_id_map, LNK_ChunkRef chunk_ref); +internal U64 lnk_isect_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal U64 lnk_off_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal U64 lnk_virt_off_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal U64 lnk_file_off_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal U64 lnk_virt_size_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal U64 lnk_file_size_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal String8 lnk_data_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal String8 lnk_data_from_chunk_ref_no_pad(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal ISectOff lnk_sc_from_chunk_ref(LNK_Section **sect_id_map, LNK_ChunkRef chunk_ref); +internal U64 lnk_virt_off_from_reloc(LNK_Section **sect_id_map, LNK_Reloc *reloc); +internal U64 lnk_isect_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol); +internal U64 lnk_sect_off_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol); +internal U64 lnk_virt_off_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol); +internal U64 lnk_file_off_from_symbol(LNK_Section **sect_id_map, 
LNK_Symbol *symbol); +internal U64 lnk_virt_size_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol); +internal U64 lnk_file_size_from_symbol(LNK_Section **sect_id_map, LNK_Symbol *symbol); + +#if LNK_DEBUG_CHUNKS +internal void lnk_dump_chunks(LNK_SectionTable *st); +#endif + diff --git a/src/linker/lnk_symbol_table.c b/src/linker/lnk_symbol_table.c new file mode 100644 index 00000000..f6addf1f --- /dev/null +++ b/src/linker/lnk_symbol_table.c @@ -0,0 +1,832 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +global read_only LNK_Symbol g_null_symbol = { str8_lit_comp("NULL"), LNK_Symbol_DefinedStatic }; +global read_only LNK_Symbol *g_null_symbol_ptr = &g_null_symbol; + +internal void +lnk_init_symbol(LNK_Symbol *symbol, String8 name, LNK_SymbolType type) +{ + symbol->name = name; + symbol->type = type; +} + +internal void +lnk_init_defined_symbol(LNK_Symbol *symbol, String8 name, LNK_DefinedSymbolVisibility visibility, LNK_DefinedSymbolFlags flags) +{ + switch (visibility) { + case LNK_DefinedSymbolVisibility_Static: lnk_init_symbol(symbol, name, LNK_Symbol_DefinedStatic); break; + case LNK_DefinedSymbolVisibility_Extern: lnk_init_symbol(symbol, name, LNK_Symbol_DefinedExtern); break; + case LNK_DefinedSymbolVisibility_Internal: lnk_init_symbol(symbol, name, LNK_Symbol_DefinedInternal); break; + } + LNK_DefinedSymbol *def = &symbol->u.defined; + def->flags = flags; + def->value_type = LNK_DefinedSymbolValue_Null; +} + +internal void +lnk_init_defined_symbol_chunk(LNK_Symbol *symbol, String8 name, LNK_DefinedSymbolVisibility visibility, LNK_DefinedSymbolFlags flags, LNK_Chunk *chunk, U64 offset, COFF_ComdatSelectType selection, U32 check_sum) +{ + lnk_init_defined_symbol(symbol, name, visibility, flags); + LNK_DefinedSymbol *def = &symbol->u.defined; + def->value_type = LNK_DefinedSymbolValue_Chunk; + def->u.chunk = chunk; + def->u.chunk_offset = offset; + def->u.check_sum = check_sum; + 
def->u.selection = selection; +} + +internal void +lnk_init_defined_symbol_va(LNK_Symbol *symbol, String8 name, LNK_DefinedSymbolVisibility visibility, LNK_DefinedSymbolFlags flags, U64 va) +{ + lnk_init_defined_symbol(symbol, name, visibility, flags); + LNK_DefinedSymbol *def = &symbol->u.defined; + def->value_type = LNK_DefinedSymbolValue_VA; + def->u.va = va; +} + +internal void +lnk_init_undefined_symbol(LNK_Symbol *symbol, String8 name, LNK_SymbolScopeFlags scope_flags) +{ + lnk_init_symbol(symbol, name, LNK_Symbol_Undefined); + symbol->u.undefined.scope_flags = scope_flags; +} + +internal void +lnk_init_weak_symbol(LNK_Symbol *symbol, String8 name, COFF_WeakExtType lookup, LNK_Symbol *fallback) +{ + lnk_init_symbol(symbol, name, LNK_Symbol_Weak); + symbol->u.weak.scope_flags = LNK_SymbolScopeFlag_Defined; + symbol->u.weak.lookup_type = lookup; + symbol->u.weak.fallback_symbol = fallback; +} + +internal void +lnk_init_lazy_symbol(LNK_Symbol *symbol, String8 name, LNK_Lib *lib, U64 member_offset) +{ + lnk_init_symbol(symbol, name, LNK_Symbol_Lazy); + symbol->u.lazy.lib = lib; + symbol->u.lazy.member_offset = member_offset; +} + +internal LNK_Symbol * +lnk_make_defined_symbol(Arena *arena, String8 name, LNK_DefinedSymbolVisibility visibility, LNK_DefinedSymbolFlags flags) +{ + LNK_Symbol *symbol = push_array_no_zero(arena, LNK_Symbol, 1); + lnk_init_defined_symbol(symbol, name, visibility, flags); + return symbol; +} + +internal LNK_Symbol * +lnk_make_defined_symbol_chunk(Arena *arena, String8 name, LNK_DefinedSymbolVisibility visibility, LNK_DefinedSymbolFlags flags, LNK_Chunk *chunk, U64 offset, COFF_ComdatSelectType selection, U32 check_sum) +{ + LNK_Symbol *symbol = push_array_no_zero(arena, LNK_Symbol, 1); + lnk_init_defined_symbol_chunk(symbol, name, visibility, flags, chunk, offset, selection, check_sum); + return symbol; +} + +internal LNK_Symbol * +lnk_make_defined_symbol_va(Arena *arena, String8 name, LNK_DefinedSymbolVisibility visibility, 
LNK_DefinedSymbolFlags flags, U64 va) +{ + LNK_Symbol *symbol = push_array_no_zero(arena, LNK_Symbol, 1); + lnk_init_defined_symbol_va(symbol, name, visibility, flags, va); + return symbol; +} + +internal LNK_Symbol * +lnk_make_undefined_symbol(Arena *arena, String8 name, LNK_SymbolScopeFlags flags) +{ + LNK_Symbol *symbol = push_array_no_zero(arena, LNK_Symbol, 1); + lnk_init_undefined_symbol(symbol, name, flags); + return symbol; +} + +internal LNK_Symbol * +lnk_make_weak_symbol(Arena *arena, String8 name, COFF_WeakExtType lookup, LNK_Symbol *fallback) +{ + LNK_Symbol *symbol = push_array_no_zero(arena, LNK_Symbol, 1); + lnk_init_weak_symbol(symbol, name, lookup, fallback); + return symbol; +} + +internal LNK_Symbol * +lnk_make_lazy_symbol(Arena *arena, String8 name, LNK_Lib *lib, U64 member_offset) +{ + LNK_Symbol *symbol = push_array_no_zero(arena, LNK_Symbol, 1); + lnk_init_lazy_symbol(symbol, name, lib, member_offset); + return symbol; +} + +internal LNK_Chunk * +lnk_defined_symbol_get_chunk(LNK_DefinedSymbol *symbol) +{ + if (symbol->value_type == LNK_DefinedSymbolValue_Chunk) { + return symbol->u.chunk; + } + return 0; +} + +internal void +lnk_symbol_list_push_node(LNK_SymbolList *list, LNK_SymbolNode *node) +{ + DLLPushBack(list->first, list->last, node); + list->count += 1; +} + +internal LNK_SymbolNode * +lnk_symbol_list_push(Arena *arena, LNK_SymbolList *list, LNK_Symbol *symbol) +{ + LNK_SymbolNode *node = push_array(arena, LNK_SymbolNode, 1); + node->data = symbol; + lnk_symbol_list_push_node(list, node); + return node; +} + +internal void +lnk_symbol_list_push_list(LNK_SymbolList *list, LNK_SymbolList *to_push) +{ + if (to_push->count) { + if (list->count) { + list->last->next = to_push->first; + to_push->first->prev = list->last; + list->last = to_push->last; + list->count += to_push->count; + } else { + *list = *to_push; + } + MemoryZeroStruct(to_push); + } +} + +internal void +lnk_symbol_list_insert_after(LNK_SymbolList *list, LNK_SymbolNode 
*node, LNK_SymbolNode *insert) +{ + DLLInsert(list->first, list->last, node, insert); + list->count += 1; +} + +internal LNK_SymbolNode * +lnk_symbol_list_pop_node(LNK_SymbolList *list) +{ + LNK_SymbolNode *node = 0; + if (list->count) { + node = list->first; + DLLRemove(list->first, list->last, node); + node->next = 0; + node->prev = 0; + list->count -= 1; + } + return node; +} + +internal LNK_Symbol * +lnk_symbol_list_pop(LNK_SymbolList *list) +{ + LNK_SymbolNode *node = lnk_symbol_list_pop_node(list); + return node ? node->data : 0; +} + +internal void +lnk_symbol_list_remove(LNK_SymbolList *list, LNK_SymbolNode *node) +{ + Assert(list->count > 0); + + list->count -= 1; + DLLRemove(list->first, list->last, node); + + node->next = 0; + node->prev = 0; +} + +internal void +lnk_symbol_list_concat_in_place(LNK_SymbolList *list, LNK_SymbolList *to_concat) +{ + DLLConcatInPlace(list, to_concat); +} + +internal LNK_SymbolList +lnk_symbol_list_copy(Arena *arena, LNK_SymbolList list) +{ + LNK_SymbolList result = {0}; + LNK_SymbolNode *node_arr = push_array_no_zero(arena, LNK_SymbolNode, list.count); + for (LNK_SymbolNode *i = list.first; i != 0; i = i->next) { + Assert(result.count < list.count); + LNK_SymbolNode *n = &node_arr[result.count++]; + n->data = i->data; + SLLQueuePush(result.first, result.last, n); + } + return result; +} + +internal LNK_SymbolNode * +lnk_symbol_list_search_node(LNK_SymbolList list, String8 name, StringMatchFlags flags) +{ + for (LNK_SymbolNode *node = list.first; node != 0; node = node->next) { + if (str8_match(node->data->name, name, flags)) { + return node; + } + } + return 0; +} + +internal LNK_Symbol * +lnk_symbol_list_search(LNK_SymbolList list, String8 name, StringMatchFlags flags) +{ + LNK_SymbolNode *node = lnk_symbol_list_search_node(list, name, flags); + return node ? 
node->data : 0; +} + +internal LNK_SymbolList +lnk_symbol_list_from_array(Arena *arena, LNK_SymbolArray arr) +{ + LNK_SymbolList list = {0}; + LNK_SymbolNode *node_arr = push_array_no_zero(arena, LNK_SymbolNode, arr.count); + for (U64 i = 0; i < arr.count; i += 1) { + LNK_SymbolNode *node = &node_arr[i]; + node->prev = node->next = 0; + node->data = &arr.v[i]; + lnk_symbol_list_push_node(&list, node); + } + return list; +} + +internal LNK_SymbolNodeArray +lnk_symbol_node_array_from_list(Arena *arena, LNK_SymbolList list) +{ + LNK_SymbolNodeArray result = {0}; + result.count = 0; + result.v = push_array_no_zero(arena, LNK_SymbolNode *, list.count); + for (LNK_SymbolNode *i = list.first; i != 0; i = i->next, ++result.count) { + result.v[result.count] = i; + } + return result; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_symbol_node_ptr_hasher) +{ + LNK_SymbolNodePtrHasher *hasher = raw_task; + Rng1U64 range = hasher->range_arr[task_id]; + for (U64 symbol_idx = range.min; symbol_idx < range.max; symbol_idx += 1) { + LNK_SymbolNode *symbol_node = hasher->input_arr[symbol_idx]; + symbol_node->hash = lnk_symbol_table_hash(symbol_node->data->name); + } +} + +internal void +lnk_symbol_node_ptr_array_hash(TP_Context *tp, LNK_SymbolNode **arr, U64 count) +{ + Temp scratch = scratch_begin(0, 0); + LNK_SymbolNodePtrHasher hasher = {0}; + hasher.input_arr = arr; + hasher.range_arr = tp_divide_work(scratch.arena, count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_symbol_node_ptr_hasher, &hasher); + scratch_end(scratch); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_symbol_node_hasher) +{ + LNK_SymbolNodeHasher *hasher = raw_task; + Rng1U64 range = hasher->range_arr[task_id]; + for (U64 symbol_idx = range.min; symbol_idx < range.max; symbol_idx += 1) { + LNK_SymbolNode *symbol_node = &hasher->input_arr[symbol_idx]; + symbol_node->hash = lnk_symbol_table_hash(symbol_node->data->name); + } +} + +internal void +lnk_symbol_node_array_hash(TP_Context *tp, LNK_SymbolNode 
*arr, U64 count) +{ + Temp scratch = scratch_begin(0, 0); + LNK_SymbolNodeHasher hasher = {0}; + hasher.input_arr = arr; + hasher.range_arr = tp_divide_work(scratch.arena, count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_symbol_node_hasher, &hasher); + scratch_end(scratch); +} + +internal LNK_SymbolArray +lnk_symbol_array_from_list(Arena *arena, LNK_SymbolList list) +{ + LNK_SymbolArray arr = {0}; + arr.count = 0; + arr.v = push_array_no_zero(arena, LNK_Symbol, list.count); + for (LNK_SymbolNode *node = list.first; node != 0; node = node->next) { + arr.v[arr.count++] = *node->data; + } + return arr; +} + +internal LNK_Symbol * +lnk_symbol_array_search(LNK_SymbolArray symarr, String8 name, StringMatchFlags flags) +{ + for (U64 isym = 0; isym < symarr.count; ++isym) { + LNK_Symbol *sym = &symarr.v[isym]; + if (str8_match(sym->name, name, flags)) { + return sym; + } + } + return 0; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_symbol_name_hasher) +{ + LNK_SymbolNameHasher *task = raw_task; + Rng1U64 range = task->range_arr[task_id]; + for (U64 symbol_idx = range.min; symbol_idx < range.max; symbol_idx += 1) { + LNK_Symbol *symbol = &task->symbol_arr[symbol_idx]; + task->hash_arr[symbol_idx] = lnk_symbol_table_hash(symbol->name); + } +} + +internal U64 * +lnk_symbol_array_hash(TP_Context *tp, Arena *arena, LNK_Symbol *arr, U64 count) +{ + Temp scratch = scratch_begin(&arena, 1); + + U64 stride = CeilIntegerDiv(count, tp->worker_count); + Rng1U64 *range_arr = push_array_no_zero(scratch.arena, Rng1U64, tp->worker_count); + for (U64 thread_idx = 0; thread_idx < tp->worker_count; thread_idx += 1) { + Rng1U64 *range = &range_arr[thread_idx]; + range->min = Min(count, stride * thread_idx); + range->max = Min(count, range->min + stride); + } + + LNK_SymbolNameHasher hasher_ctx = {0}; + hasher_ctx.symbol_arr = arr; + hasher_ctx.range_arr = range_arr; + hasher_ctx.hash_arr = push_array_no_zero(arena, U64, count); + tp_for_parallel(tp, 0, tp->worker_count, 
lnk_symbol_name_hasher, &hasher_ctx); + + scratch_end(scratch); + return hasher_ctx.hash_arr; +} + +internal LNK_SymbolTable * +lnk_symbol_table_alloc(void) +{ + return lnk_symbol_table_alloc_ex(0x1000, 0x100, 0x500, 0x1000); +} + +internal LNK_SymbolTable * +lnk_symbol_table_alloc_ex(U64 defined_cap, U64 internal_cap, U64 weak_cap, U64 lib_cap) +{ + ProfBeginDynamic("Alloc Symbol Table [Defined: 0x%llx, Internal: 0x%llx, Weak: 0x%llx, Lib: 0x%llx]", defined_cap, internal_cap, weak_cap, lib_cap); + Arena *arena = arena_alloc(); + LNK_SymbolTable *symtab = push_array(arena, LNK_SymbolTable, 1); + symtab->arena = arena; + symtab->bucket_count[LNK_SymbolScopeIndex_Defined] = defined_cap; + symtab->bucket_count[LNK_SymbolScopeIndex_Internal] = internal_cap; + symtab->bucket_count[LNK_SymbolScopeIndex_Weak] = weak_cap; + symtab->bucket_count[LNK_SymbolScopeIndex_Lib] = lib_cap; + for (U64 iscope = 0; iscope < ArrayCount(symtab->buckets); ++iscope) { + symtab->buckets[iscope] = push_array(symtab->arena, LNK_SymbolList, symtab->bucket_count[iscope]); + } + ProfEnd(); + return symtab; +} + +internal void +lnk_symbol_table_release(LNK_SymbolTable **symtab) +{ + ProfBeginFunction(); + arena_release((*symtab)->arena); + *symtab = 0; + ProfEnd(); +} + +internal U64 +lnk_symbol_table_hash(String8 string) +{ + return hash_from_str8(string); +} + +internal LNK_SymbolNode * +lnk_symbol_table_search_bucket(LNK_SymbolTable *symtab, LNK_SymbolScopeIndex scope_idx, U64 bucket_idx, String8 name, U64 hash) +{ + for (LNK_SymbolNode *node = symtab->buckets[scope_idx][bucket_idx].first; node != 0; node = node->next) { + if (hash == node->hash && str8_match(node->data->name, name, 0)) { + return node; + } + } + return 0; +} + +internal LNK_SymbolNode * +lnk_symbol_table_search_node_hash(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope_flags, String8 name, U64 hash) +{ + while (scope_flags) { + LNK_SymbolScopeIndex scope_idx = ctz64(scope_flags); + scope_flags &= scope_flags - 1; + U64 
bucket_idx = hash % symtab->bucket_count[scope_idx]; + LNK_SymbolNode *node = lnk_symbol_table_search_bucket(symtab, scope_idx, bucket_idx, name, hash); + if (node) return node; + } + return 0; +} + +internal LNK_SymbolNode * +lnk_symbol_table_search_node(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope_flags, String8 name) +{ + U64 hash = lnk_symbol_table_hash(name); + return lnk_symbol_table_search_node_hash(symtab, scope_flags, name, hash); +} + +internal LNK_Symbol * +lnk_symbol_table_search(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope_flags, String8 name) +{ + LNK_SymbolNode *node = lnk_symbol_table_search_node(symtab, scope_flags, name); + return node ? node->data : 0; +} + +internal LNK_Symbol * +lnk_symbol_table_searchf(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope_flags, char *fmt, ...) +{ + Temp scratch = scratch_begin(0, 0); + + va_list args; + va_start(args, fmt); + String8 name = push_str8fv(scratch.arena, fmt, args); + va_end(args); + + LNK_Symbol *symbol = lnk_symbol_table_search(symtab, scope_flags, name); + scratch_end(scratch); + return symbol; +} + +internal void +lnk_symbol_table_remove(LNK_SymbolTable *symtab, LNK_SymbolScopeIndex scope, String8 name) +{ + U64 hash = lnk_symbol_table_hash(name); + U64 ibucket = hash % symtab->bucket_count[scope]; + for (;;) { + LNK_SymbolNode *node = lnk_symbol_table_search_bucket(symtab, scope, ibucket, name, hash); + if (!node) { + break; + } + LNK_SymbolList *bucket = &symtab->buckets[scope][ibucket]; + DLLRemove(bucket->first, bucket->last, node); + bucket->count -= 1; + } +} + +internal LNK_SymbolList * +lnk_symbol_table_bucket_from_hash(LNK_SymbolTable *symtab, LNK_SymbolScopeIndex scope_idx, U64 hash) +{ + U64 bucket_idx = hash % symtab->bucket_count[scope_idx]; + LNK_SymbolList *bucket = &symtab->buckets[scope_idx][bucket_idx]; + return bucket; +} + +internal void +lnk_symbol_table_push_(LNK_SymbolTable *symtab, LNK_SymbolScopeIndex scope_idx, LNK_SymbolNode *node, U64 hash) +{ + 
LNK_SymbolList *bucket = lnk_symbol_table_bucket_from_hash(symtab, scope_idx, hash); + node->hash = hash; + lnk_symbol_list_push_node(bucket, node); +} + +internal void +lnk_symbol_table_push_node_hash(LNK_SymbolTable *symtab, LNK_SymbolNode *node, U64 hash) +{ + switch (node->data->type) { + case LNK_Symbol_Null: break; + + case LNK_Symbol_DefinedExtern: { + lnk_symbol_table_push_(symtab, LNK_SymbolScopeIndex_Defined, node, hash); + } break; + case LNK_Symbol_DefinedInternal: { + lnk_symbol_table_push_(symtab, LNK_SymbolScopeIndex_Internal, node, hash); + } break; + case LNK_Symbol_Weak: { + LNK_SymbolNode *is_strong_defn_present = lnk_symbol_table_search_node(symtab, LNK_SymbolScopeFlag_Defined, node->data->name); + if (is_strong_defn_present) { + break; + } + + LNK_SymbolNode *is_weak_present = lnk_symbol_table_search_node(symtab, LNK_SymbolScopeFlag_Weak, node->data->name); + if (is_weak_present) { + B32 is_fallback_same = str8_match(is_weak_present->data->u.weak.fallback_symbol->name, node->data->u.weak.fallback_symbol->name, 0); + if (!is_fallback_same) { + lnk_error(LNK_Error_MultiplyDefinedSymbol, "Weak symbol %S conflict detected, symbol defined in:", node->data->name); + lnk_supplement_error("%S", node->data->debug); + lnk_supplement_error("%S", is_weak_present->data->debug); + } + break; + } + + lnk_symbol_table_push_(symtab, LNK_SymbolScopeIndex_Weak, node, hash); + } break; + case LNK_Symbol_Lazy: { + lnk_symbol_table_push_(symtab, LNK_SymbolScopeIndex_Lib, node, hash); + } break; + + // symbols not supported + case LNK_Symbol_Undefined: + case LNK_Symbol_DefinedStatic: { + InvalidPath; + } break; + } +} + +internal void +lnk_symbol_table_push_node(LNK_SymbolTable *symtab, LNK_SymbolNode *node) +{ + U64 hash = lnk_symbol_table_hash(node->data->name); + lnk_symbol_table_push_node_hash(symtab, node, hash); +} + +internal LNK_SymbolNode * +lnk_symbol_table_push(LNK_SymbolTable *symtab, LNK_Symbol *symbol) +{ + LNK_SymbolNode *node = 
push_array(symtab->arena, LNK_SymbolNode, 1); + node->data = symbol; + lnk_symbol_table_push_node(symtab, node); + return node; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_lazy_symbol_inserter) +{ + LNK_LazySymbolInserter *task = raw_task; + LNK_SymbolTable *symtab = task->symtab; + Rng1U64 range = task->range_arr[task_id]; + for (U64 bucket_idx = range.min; bucket_idx < range.max; bucket_idx += 1) { + LNK_SymbolList *bucket = &task->bucket_arr[bucket_idx]; + for (LNK_SymbolNode *curr = bucket->first, *next; curr != 0; curr = next) { + next = curr->next; + lnk_symbol_table_push_(symtab, LNK_SymbolScopeIndex_Lib, curr, curr->hash); + } + } +} + +internal void +lnk_symbol_table_push_lazy_arr(TP_Context *tp, LNK_SymbolTable *symtab, LNK_Symbol *arr, U64 count) +{ + Temp scratch = scratch_begin(0,0); + + ProfBegin("Push Symbol Nodes"); + LNK_SymbolNode *node_arr = push_array_no_zero(symtab->arena, LNK_SymbolNode, count); + for (U64 symbol_idx = 0; symbol_idx < count; symbol_idx += 1) { + LNK_SymbolNode *node = &node_arr[symbol_idx]; + node->prev = node->next = 0; + node->data = &arr[symbol_idx]; + } + ProfEnd(); + + ProfBegin("Hash Symbol Names"); + lnk_symbol_node_array_hash(tp, node_arr, count); + ProfEnd(); + + ProfBegin("Populate Buckets"); + LNK_SymbolList *bucket_arr = push_array(scratch.arena, LNK_SymbolList, symtab->bucket_count[LNK_SymbolScopeIndex_Lib]); + for (U64 symbol_idx = 0; symbol_idx < count; symbol_idx += 1) { + LNK_SymbolNode *symbol_node = &node_arr[symbol_idx]; + U64 bucket_idx = symbol_node->hash % symtab->bucket_count[LNK_SymbolScopeIndex_Lib]; + lnk_symbol_list_push_node(&bucket_arr[bucket_idx], symbol_node); + } + ProfEnd(); + + ProfBegin("Update Symbol Table"); + LNK_LazySymbolInserter symbol_inserter; + symbol_inserter.symtab = symtab; + symbol_inserter.bucket_arr = bucket_arr; + symbol_inserter.range_arr = tp_divide_work(scratch.arena, symtab->bucket_count[LNK_SymbolScopeIndex_Lib], tp->worker_count); + tp_for_parallel(tp, 0, 
tp->worker_count, lnk_lazy_symbol_inserter, &symbol_inserter); + ProfEnd(); + + scratch_end(scratch); +} + +internal void +lnk_symbol_table_push_list(LNK_SymbolTable *symtab, LNK_SymbolList *list) +{ + ProfBeginFunction(); + MemoryZeroStruct(list); + ProfEnd(); +} + +internal LNK_Symbol * +lnk_resolve_symbol(LNK_SymbolTable *symtab, LNK_Symbol *resolve_symbol) +{ + LNK_Symbol *symbol = resolve_symbol; + B32 run_resolver; + do { + run_resolver = 0; + switch (symbol->type) { + case LNK_Symbol_Null: break; + case LNK_Symbol_Undefined: { + LNK_UndefinedSymbol *undef_symbol = &symbol->u.undefined; + LNK_Symbol *def = lnk_symbol_table_search(symtab, undef_symbol->scope_flags, symbol->name); + if (def) { + symbol = def; + run_resolver = 1; + } + } break; + case LNK_Symbol_Weak: { + LNK_WeakSymbol *weak = &symbol->u.weak; + LNK_Symbol *def = lnk_symbol_table_search(symtab, weak->scope_flags, symbol->name); + if (def) { + Assert(LNK_Symbol_IsDefined(def->type)); + symbol = def; + } else { + symbol = symbol->u.weak.fallback_symbol; + } + run_resolver = 1; + } break; + case LNK_Symbol_DefinedStatic: + case LNK_Symbol_DefinedExtern: + case LNK_Symbol_DefinedInternal: { + /* resolved */ + } break; + default: NotImplemented; + } + } while (run_resolver); + return symbol; +} + +internal LNK_SymbolList +lnk_pop_comdat_chain(LNK_SymbolList *bucket, LNK_SymbolNode **cursor) +{ + LNK_SymbolList chain_list = {0}; + + LNK_SymbolNode *leader_node = *cursor; + *cursor = (*cursor)->next; + + lnk_symbol_list_remove(bucket, leader_node); + lnk_symbol_list_push_node(&chain_list, leader_node); + + while (*cursor) { + LNK_SymbolNode *next = (*cursor)->next; + + // symbols with identical names are stored in order + if (!str8_match(leader_node->data->name, (*cursor)->data->name, 0)) { + break; + } + + // move node to chain list + lnk_symbol_list_remove(bucket, *cursor); + lnk_symbol_list_push_node(&chain_list, *cursor); + + // advance + *cursor = next; + } + + return chain_list; +} + +internal 
LNK_SymbolNode * +lnk_fold_comdat_chain(LNK_SymbolList chain_list) +{ + LNK_SymbolNode *lead_node = chain_list.first; + + if (LNK_Symbol_IsDefined(lead_node->data->type)) { + LNK_Symbol *lead = lead_node->data; + if (lead->u.defined.value_type != LNK_DefinedSymbolValue_Chunk && chain_list.count > 1) { + lnk_error(LNK_Error_MultiplyDefinedSymbol, "Unable to perfrom COMDAT fold on symbol %S, symbol must reference a section, defined in %S", + lead->name, lead->debug); + return 0; + } + } + + for (LNK_SymbolNode *curr_node = lead_node->next; curr_node != 0; curr_node = curr_node->next) { + Assert(LNK_Symbol_IsDefined(lead_node->data->type)); + Assert(LNK_Symbol_IsDefined(curr_node->data->type)); + + LNK_DefinedSymbol *lead_defined = &lead_node->data->u.defined; + LNK_DefinedSymbol *curr_defined = &curr_node->data->u.defined; + + if (curr_defined->value_type != LNK_DefinedSymbolValue_Chunk) { + lnk_error(LNK_Error_MultiplyDefinedSymbol, "Unable to perfrom COMDAT fold on symbol %S, symbol must reference a section, defined in %S", + curr_node->data->name, curr_node->data->debug); + return 0; + } + + // There is no mentioning of this rule in PE spec, but according to comment from lld-link in 'handleComdatSelection': + // "cl.exe picks "any" for vftabels when building with /GR- and "largest" when building /GR.". However, + // chromium links '__src_ucrt_dll_is_in_use' from MSVCRT which is not a vftable but still requires selection override. 
+ if ((curr_defined->u.selection == COFF_ComdatSelectType_ANY && lead_defined->u.selection == COFF_ComdatSelectType_LARGEST) || + (curr_defined->u.selection == COFF_ComdatSelectType_LARGEST && lead_defined->u.selection == COFF_ComdatSelectType_ANY)) { + lead_defined->u.selection = COFF_ComdatSelectType_LARGEST; + curr_defined->u.selection = COFF_ComdatSelectType_LARGEST; + } + + // COMDATs must have same selection rule + if (lead_defined->u.selection != curr_defined->u.selection) { + String8 curr_selection_str = coff_string_from_comdat_select_type(curr_defined->u.selection); + String8 lead_selection_str = coff_string_from_comdat_select_type(lead_defined->u.selection); + lnk_error(LNK_Warning_UnresolvedComdat, + "COMDAT selection conflict detected in symbol %S defined in %S (%S), leader selection %S from %S", + curr_node->data->name, curr_node->data->debug, curr_selection_str, lead_selection_str, lead_node->data->debug); + return 0; + } + + switch (curr_defined->u.selection) { + case COFF_ComdatSelectType_NULL: + case COFF_ComdatSelectType_ANY: { + // both COMDATs are valid but to get smaller exe pick smallest + LNK_Chunk *lead_chunk = lead_defined->u.chunk; + LNK_Chunk *curr_chunk = curr_defined->u.chunk; + U64 lead_chunk_size = lnk_chunk_get_size(lead_chunk); + U64 curr_chunk_size = lnk_chunk_get_size(curr_chunk); + if (curr_chunk_size < lead_chunk_size) { + lead_node = curr_node; + } + } break; + case COFF_ComdatSelectType_NODUPLICATES: { + lnk_error(LNK_Error_MultiplyDefinedSymbol, "%S: error: multiply defined symbol %S in %S.", curr_node->data->debug, curr_node->data->name, lead_node->data->debug); + } break; + case COFF_ComdatSelectType_SAME_SIZE: { + LNK_Chunk *lead_chunk = lead_defined->u.chunk; + LNK_Chunk *curr_chunk = curr_defined->u.chunk; + U64 lead_chunk_size = lnk_chunk_get_size(lead_chunk); + U64 curr_chunk_size = lnk_chunk_get_size(curr_chunk); + B32 is_same_size = (lead_chunk_size == curr_chunk_size); + if (!is_same_size) { + 
lnk_error(LNK_Error_MultiplyDefinedSymbol, "%S: error: multiply defined symbol %S in %S.", curr_node->data->debug, curr_node->data->name, lead_node->data->debug); + } + } break; + case COFF_ComdatSelectType_EXACT_MATCH: { + B32 is_exact_match = (lead_defined->u.check_sum == curr_defined->u.check_sum); + if (!is_exact_match) { + lnk_error(LNK_Error_MultiplyDefinedSymbol, "%S: error: multiply defined symbol %S in %S.", curr_node->data->debug, curr_node->data->name, lead_node->data->debug); + } + } break; + case COFF_ComdatSelectType_LARGEST: { + LNK_Chunk *lead_chunk = lead_defined->u.chunk; + LNK_Chunk *curr_chunk = curr_defined->u.chunk; + U64 lead_chunk_size = lnk_chunk_get_size(lead_chunk); + U64 curr_chunk_size = lnk_chunk_get_size(curr_chunk); + if (lead_chunk_size > curr_chunk_size) { + lead_node = curr_node; + } + } break; + case COFF_ComdatSelectType_ASSOCIATIVE: { + // ignore + } break; + } + } + + // rewire chunks so they point to COMDAT leader + for (LNK_SymbolNode *curr_node = chain_list.first; curr_node != 0; curr_node = curr_node->next) { + if (curr_node == lead_node) { + continue; + } + + LNK_DefinedSymbol *curr_defined = &curr_node->data->u.defined; + LNK_Chunk *curr_chunk = curr_defined->u.chunk; + + // copy offset because after folding COMDATS we might end + // up with larger sized chunk and, for instance, a vftable + // might have a function pointer preceeding lead symbol + curr_defined->u.chunk = lead_node->data->u.defined.u.chunk; + curr_defined->u.chunk_offset = lead_node->data->u.defined.u.chunk_offset; + + // discard chunk from output + curr_chunk->is_discarded = 1; + + // static symbols that are not part of obj's symbol table might point to discarded chunk + curr_chunk->ref = lead_node->data->u.defined.u.chunk->ref; + } + + return lead_node; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_comdat_folder) +{ + LNK_ComdatFolder *task = raw_task; + LNK_SymbolTable *symtab = task->symtab; + Rng1U64 range = task->range_arr[task_id]; + for (U64 
bucket_idx = range.min; bucket_idx < range.max; ++bucket_idx) { + LNK_SymbolList *bucket = &symtab->buckets[LNK_SymbolScopeIndex_Defined][bucket_idx]; + LNK_SymbolList leader_list = {0}; + LNK_SymbolNode *curr = bucket->first; + while (curr) { + LNK_SymbolList chain_list = lnk_pop_comdat_chain(bucket, &curr); + LNK_SymbolNode *leader_node = lnk_fold_comdat_chain(chain_list); + if (leader_node) { + lnk_symbol_list_push_node(&leader_list, leader_node); + } + } + Assert(bucket->count == 0); + *bucket = leader_list; + } +} + +internal void +lnk_fold_comdat_chunks(TP_Context *tp, LNK_SymbolTable *symtab) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0, 0); + + LNK_ComdatFolder folder = {0}; + folder.symtab = symtab; + folder.range_arr = tp_divide_work(scratch.arena, symtab->bucket_count[LNK_SymbolScopeIndex_Defined], tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_comdat_folder, &folder); + + scratch_end(scratch); + ProfEnd(); +} diff --git a/src/linker/lnk_symbol_table.h b/src/linker/lnk_symbol_table.h new file mode 100644 index 00000000..4c68996c --- /dev/null +++ b/src/linker/lnk_symbol_table.h @@ -0,0 +1,253 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef enum +{ + LNK_SymbolScopeIndex_Defined, + LNK_SymbolScopeIndex_Internal, // symbols defined by linker + LNK_SymbolScopeIndex_Weak, + LNK_SymbolScopeIndex_Lib, + LNK_SymbolScopeIndex_Count +} LNK_SymbolScopeIndex; + +enum +{ + LNK_SymbolScopeFlag_Defined = 1, + LNK_SymbolScopeFlag_Internal = 2, + LNK_SymbolScopeFlag_Weak = 4, + LNK_SymbolScopeFlag_Lib = 8, + + LNK_SymbolScopeFlag_Main = LNK_SymbolScopeFlag_Defined | LNK_SymbolScopeFlag_Weak, + LNK_SymbolScopeFlag_All = LNK_SymbolScopeFlag_Defined | LNK_SymbolScopeFlag_Weak | LNK_SymbolScopeFlag_Lib | LNK_SymbolScopeFlag_Internal +}; +typedef U64 LNK_SymbolScopeFlags; + +typedef enum +{ + LNK_DefinedSymbolVisibility_Static, + 
// Payload of a defined symbol (the `defined` member of the union in LNK_Symbol).
typedef struct LNK_DefinedSymbol
{
  LNK_DefinedSymbolFlags flags;          // LNK_DefinedSymbolFlag_* bits (IsFunc, IsThunk)
  LNK_DefinedSymbolValueType value_type; // tells which member of `u` is meaningful
  union {
    // used when value_type is LNK_DefinedSymbolValue_Chunk
    struct {
      LNK_Chunk *chunk;                // chunk the symbol refers to
      U64 chunk_offset;                // offset paired with `chunk`; presumably the symbol's byte offset inside it (confirm at the call sites)
      U32 check_sum;                   // compared between duplicates under COFF_ComdatSelectType_EXACT_MATCH
      COFF_ComdatSelectType selection; // COMDAT selection rule consulted when same-named symbols are folded
    };
    // used when value_type is LNK_DefinedSymbolValue_VA
    U64 va;
  } u;
} LNK_DefinedSymbol;
// Hash table of symbols with one separately sized bucket array per scope
// (Defined, Internal, Weak, Lib). A symbol lands in bucket
// `hash % bucket_count[scope]` of its scope; each bucket is a linked list.
typedef struct LNK_SymbolTable
{
  Arena *arena;                                        // backs the table itself, its bucket arrays and nodes made by lnk_symbol_table_push; released as a whole by lnk_symbol_table_release
  U64 bucket_count[LNK_SymbolScopeIndex_Count];        // number of buckets per scope
  LNK_SymbolList *buckets[LNK_SymbolScopeIndex_Count]; // bucket array per scope, bucket_count[scope] entries each
} LNK_SymbolTable;
// Helpers for filling LNK_Symbol::debug, the string printed next to a symbol
// in linker error messages. The field only exists when LNK_DEBUG_SYMBOLS is
// non-zero, so with it disabled both macros expand to nothing.
#if LNK_DEBUG_SYMBOLS
// formats the debug string on arena `a` and stores it on symbol `s`
#define lnk_symbol_set_debugf(a, s, fmt, ...) do { (s)->debug = push_str8f((a), fmt, __VA_ARGS__); } while (0)
// stores an already built string on symbol `s` (the string is not copied)
#define lnk_symbol_set_debug(s, str) do { (s)->debug = str; } while (0)
#else
#define lnk_symbol_set_debugf(...)
#define lnk_symbol_set_debug(...)
#endif
lnk_symbol_table_search_node_hash(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope_flags, String8 name, U64 hash); +internal LNK_SymbolNode * lnk_symbol_table_search_node(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope, String8 name); +internal LNK_Symbol * lnk_symbol_table_search(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope_flags, String8 name); +internal LNK_Symbol * lnk_symbol_table_searchf(LNK_SymbolTable *symtab, LNK_SymbolScopeFlags scope_flags, char *fmt, ...); +internal void lnk_symbol_table_push_node_hash(LNK_SymbolTable *symtab, LNK_SymbolNode *node, U64 hash); +internal void lnk_symbol_table_push_node(LNK_SymbolTable *symtab, LNK_SymbolNode *node); +internal LNK_SymbolNode * lnk_symbol_table_push(LNK_SymbolTable *symtab, LNK_Symbol *symbol); +internal void lnk_symbol_table_push_lazy_arr(TP_Context *tp, LNK_SymbolTable *symtab, LNK_Symbol *arr, U64 count); +internal void lnk_symbol_table_remove(LNK_SymbolTable *symtab, LNK_SymbolScopeIndex scope, String8 name); +internal void lnk_symbol_table_replace(LNK_SymbolTable *symtab, LNK_SymbolScopeIndex iscope, LNK_Symbol *symbol); + +internal LNK_Symbol * lnk_resolve_symbol(LNK_SymbolTable *symtab, LNK_Symbol *resolve_symbol); + +internal LNK_SymbolList lnk_pop_comdat_chain(LNK_SymbolList *bucket, LNK_SymbolNode **cursor); +internal LNK_SymbolNode * lnk_fold_comdat_chain(LNK_SymbolList chain_list); +internal void lnk_fold_comdat_chunks(TP_Context *tp, LNK_SymbolTable *symtab); + diff --git a/src/linker/lnk_timer.c b/src/linker/lnk_timer.c new file mode 100644 index 00000000..61872279 --- /dev/null +++ b/src/linker/lnk_timer.c @@ -0,0 +1,31 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +global LNK_Timer g_timers[LNK_Timer_Count]; + +internal void +lnk_timer_begin(LNK_TimerType timer) +{ + g_timers[timer].begin = os_now_microseconds(); +} + +internal void +lnk_timer_end(LNK_TimerType timer) +{ + g_timers[timer].end = 
os_now_microseconds(); +} + +internal String8 +lnk_string_from_timer_type(LNK_TimerType type) +{ + switch (type) { + case LNK_Timer_Image: return str8_lit("Image"); + case LNK_Timer_Pdb: return str8_lit("PDB"); + case LNK_Timer_Rdi: return str8_lit("RDI"); + case LNK_Timer_Lib: return str8_lit("Lib"); + case LNK_Timer_Debug: return str8_lit("Debug"); + default: InvalidPath; + } + return str8_zero(); +} + diff --git a/src/linker/lnk_timer.h b/src/linker/lnk_timer.h new file mode 100644 index 00000000..41b81053 --- /dev/null +++ b/src/linker/lnk_timer.h @@ -0,0 +1,24 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +typedef enum LNK_TimerType +{ + LNK_Timer_Image, + LNK_Timer_Pdb, + LNK_Timer_Rdi, + LNK_Timer_Lib, + LNK_Timer_Debug, + LNK_Timer_Count +} LNK_TimerType; + +typedef struct LNK_Timer +{ + U64 begin; + U64 end; +} LNK_Timer; + +internal void lnk_timer_begin(LNK_TimerType timer); +internal void lnk_timer_end(LNK_TimerType timer); + diff --git a/src/linker/os_ext/core/os_core.c b/src/linker/os_ext/core/os_core.c new file mode 100644 index 00000000..a685a63d --- /dev/null +++ b/src/linker/os_ext/core/os_core.c @@ -0,0 +1,188 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal +THREAD_POOL_TASK_FUNC(os_data_size_from_file_path_task) +{ + OS_DataSizeFromFilePathTask *task = raw_task; + + String8 path = task->path_arr.v[task_id]; + OS_Handle handle = os_file_open(OS_AccessFlag_Read|OS_AccessFlag_ShareRead, path); + FileProperties props = os_properties_from_file(handle); + + task->handle_arr[task_id] = handle; + task->size_arr[task_id] = props.size; +} + +internal +THREAD_POOL_TASK_FUNC(os_data_from_file_path_task) +{ + OS_DataFromFilePathTask *task = raw_task; + + OS_Handle handle = task->handle_arr[task_id]; + U64 size = task->size_arr[task_id]; + U8 *buffer = task->buffer + task->off_arr[task_id]; + + 
U64 read_size = os_file_read(handle, rng_1u64(0, size), buffer); + Assert(read_size == size); + + task->data_arr.v[task_id] = str8(buffer, read_size); + + os_file_close(handle); +} + +internal String8Array +os_data_from_file_path_parallel(TP_Context *tp, Arena *arena, String8Array path_arr) +{ + Temp scratch = scratch_begin(&arena,1); + + OS_Handle *handle_arr = push_array_no_zero(scratch.arena, OS_Handle, path_arr.count); + U64 *size_arr = push_array_no_zero(scratch.arena, U64, path_arr.count); + U64 *off_arr = push_array_no_zero(scratch.arena, U64, path_arr.count); + + // open handles and get sizes + OS_DataSizeFromFilePathTask sizer; + sizer.path_arr = path_arr; + sizer.size_arr = size_arr; + sizer.handle_arr = handle_arr; + tp_for_parallel(tp, 0, path_arr.count, os_data_size_from_file_path_task, &sizer); + + // compute file buffer size + U64 total_data_size = sum_array_u64(path_arr.count, sizer.size_arr); + + // assign offsets into file buffer + MemoryCopyTyped(off_arr, sizer.size_arr, path_arr.count); + counts_to_offsets_array_u64(path_arr.count, off_arr); + + // read files and close handles + OS_DataFromFilePathTask reader; + reader.data_arr = str8_array_reserve(arena, path_arr.count); + reader.handle_arr = handle_arr; + reader.size_arr = size_arr;; + reader.off_arr = off_arr; + reader.buffer = push_array_no_zero(arena, U8, total_data_size); + tp_for_parallel(tp, 0, path_arr.count, os_data_from_file_path_task, &reader); + + String8Array result = {0}; + result.count = path_arr.count; + result.v = reader.data_arr.v; + + scratch_end(scratch); + return result; +} + +internal String8List +os_file_search(Arena *arena, String8List dir_list, String8 file_path) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + String8List match_list; MemoryZeroStruct(&match_list); + + if (os_file_path_exists(file_path)) { + String8 str = push_str8_copy(arena, file_path); + str8_list_push(arena, &match_list, str); + } + + PathStyle file_path_style = 
path_style_from_str8(file_path); + B32 is_relative = file_path_style != PathStyle_WindowsAbsolute && + file_path_style != PathStyle_UnixAbsolute; + + if (is_relative) { + for (String8Node *i = dir_list.first; i != 0; i = i->next) { + String8List path_list = {0}; + str8_list_push(scratch.arena, &path_list, i->string); + str8_list_push(scratch.arena, &path_list, file_path); + String8 path = str8_path_list_join_by_style(scratch.arena, &path_list, PathStyle_SystemAbsolute); + B32 file_exists = os_file_path_exists(path); + if (file_exists) { + B32 is_unique = 1; + OS_FileID file_id = os_id_from_file_path(path); + for (String8Node *k = match_list.first; k != 0; k = k->next) { + OS_FileID test_id = os_id_from_file_path(k->string); + int cmp = os_file_id_compare(test_id, file_id) != 0; + if (cmp == 0) { + is_unique = 0; + break; + } + } + if (is_unique) { + String8 str = push_str8_copy(arena, path); + str8_list_push(arena, &match_list, str); + } + } + } + } + + scratch_end(scratch); + ProfEnd(); + return match_list; +} + +static struct +{ + String8 string; + OperatingSystem os; +} g_os_map[] = { + { str8_lit_comp("windows"), OperatingSystem_Windows, }, + { str8_lit_comp("linux"), OperatingSystem_Linux, }, + { str8_lit_comp("mac"), OperatingSystem_Mac, }, +}; + +internal OperatingSystem +operating_system_from_string(String8 string) +{ + for (U64 i = 0; i < ArrayCount(g_os_map); ++i) { + if (str8_match(g_os_map[i].string, string, StringMatchFlag_CaseInsensitive)) { + return g_os_map[i].os; + } + } + return OperatingSystem_Null; +} + +internal B32 +os_try_guid_from_string(String8 string, OS_Guid *guid_out) +{ + Temp scratch = scratch_begin(0,0); + B32 is_parsed = 0; + String8List list = str8_split_by_string_chars(scratch.arena, string, str8_lit("-"), StringSplitFlag_KeepEmpties); + if (list.node_count == 5) { + String8 data1_str = list.first->string; + String8 data2_str = list.first->next->string; + String8 data3_str = list.first->next->next->string; + String8 data4_hi_str = 
list.first->next->next->next->string; + String8 data4_lo_str = list.first->next->next->next->next->string; + if (str8_is_integer(data1_str, 16) && + str8_is_integer(data2_str, 16) && + str8_is_integer(data3_str, 16) && + str8_is_integer(data4_hi_str, 16) && + str8_is_integer(data4_lo_str, 16)) { + U64 data1 = u64_from_str8(data1_str, 16); + U64 data2 = u64_from_str8(data2_str, 16); + U64 data3 = u64_from_str8(data3_str, 16); + U64 data4_hi = u64_from_str8(data4_hi_str, 16); + U64 data4_lo = u64_from_str8(data4_lo_str, 16); + if (data1 <= max_U32 && + data2 <= max_U16 && + data3 <= max_U16 && + data4_hi <= max_U16 && + data4_lo <= 0xffffffffffff) { + guid_out->data1 = (U32)data1; + guid_out->data2 = (U16)data2; + guid_out->data3 = (U16)data3; + U64 data4 = (data4_hi << 48) | data4_lo; + MemoryCopy(&guid_out->data4[0], &data4, sizeof(data4)); + is_parsed = 1; + } + } + } + scratch_end(scratch); + return is_parsed; +} + +internal OS_Guid +os_guid_from_string(String8 string) +{ + OS_Guid guid = {0}; + os_try_guid_from_string(string, &guid); + return guid; +} diff --git a/src/linker/os_ext/core/os_core.h b/src/linker/os_ext/core/os_core.h new file mode 100644 index 00000000..0ba75989 --- /dev/null +++ b/src/linker/os_ext/core/os_core.h @@ -0,0 +1,39 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +#if OS_WINDOWS +# include "os_core_win32.c" +#else +# error "undefined OS" +#endif + +typedef struct +{ + String8Array path_arr; + OS_Handle *handle_arr; + U64 *size_arr; +} OS_DataSizeFromFilePathTask; + +typedef struct +{ + String8Array data_arr; + OS_Handle *handle_arr; + U64 *size_arr; + U64 *off_arr; + U8 *buffer; +} OS_DataFromFilePathTask; + +internal String8Array os_data_from_file_path_parallel(TP_Context *tp, Arena *arena, String8Array path_arr); +internal String8List os_file_search(Arena *arena, String8List dir_list, String8 file_path); +internal B32 os_folder_path_exists(String8 
path); + +internal OperatingSystem operating_system_from_string(String8 string); + +internal B32 os_set_large_pages(B32 toggle); + +internal U32 os_get_process_start_time_unix(void); + +internal B32 os_try_guid_from_string(String8 string, OS_Guid *guid_out); +internal OS_Guid os_guid_from_string(String8 string); diff --git a/src/linker/os_ext/core/os_core_win32.c b/src/linker/os_ext/core/os_core_win32.c new file mode 100644 index 00000000..d85af683 --- /dev/null +++ b/src/linker/os_ext/core/os_core_win32.c @@ -0,0 +1,168 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal U32 +w32_unix_time_from_file_time(FILETIME file_time) +{ + U64 win32_time = ((U64)file_time.dwHighDateTime << 32) | file_time.dwLowDateTime; + U64 unix_time64 = ((win32_time - 0x19DB1DED53E8000ULL) / 10000000); + + Assert(unix_time64 <= max_U32); + U32 unix_time32 = (U32)unix_time64; + + return unix_time32; +} + +internal B32 +os_w32_has_path_volume_prefix(String8 path) +{ + if (path.size >= 2) { + U8 *ptr = path.str; + U8 *opl = path.str + path.size; + UnicodeDecode a = utf8_decode(ptr, (U64)(opl-ptr)); + ptr += a.inc; + UnicodeDecode b = utf8_decode(ptr, (U64)(opl-ptr)); + return a.codepoint < max_U8 && char_is_alpha(a.codepoint) && b.codepoint == ':'; + } + return 0; +} + +internal B32 +os_w32_has_device_prefix(String8 path) +{ + if (path.size >= 3) { + U8 *ptr = path.str; + U8 *opl = path.str + path.size; + UnicodeDecode a = utf8_decode(ptr, (U64)(opl-ptr)); + ptr += a.inc; + UnicodeDecode b = utf8_decode(ptr, (U64)(opl-ptr)); + ptr += b.inc; + UnicodeDecode c = utf8_decode(ptr, (U64)(opl-ptr)); + return a.codepoint == '\\' && b.codepoint == '\\' && (c.codepoint == '?' 
|| c.codepoint == '.'); + } + return 0; +} + +internal B32 +os_w32_has_unc_prefix(String8 path) +{ + if (path.size >= 2) { + U8 *ptr = path.str; + U8 *opl = path.str + path.size; + UnicodeDecode a = utf8_decode(ptr, (U64)(opl-ptr)); + ptr += a.inc; + UnicodeDecode b = utf8_decode(ptr, (U64)(opl-ptr)); + return a.codepoint == '\\' && b.codepoint == '\\'; + } + return 0; +} + +internal B32 +os_w32_has_root_drive_prefix(String8 path) +{ + if (path.size >= 1) { + UnicodeDecode a = utf8_decode(path.str, path.size); + return a.codepoint == '\\'; + } + return 0; +} + +internal B32 +os_w32_is_path_relative_current_directory(String8 path) +{ + if (os_w32_has_path_volume_prefix(path)) { + return 0; + } + if (os_w32_has_device_prefix(path)) { + return 0; + } + if (os_w32_has_unc_prefix(path)) { + return 0; + } + if (os_w32_has_root_drive_prefix(path)) { + return 0; + } + return 1; +} + +internal String8 +os_make_full_path(Arena *arena, String8 path) +{ + String8 full_path; + if (os_w32_is_path_relative_current_directory(path)) { + Temp scratch = scratch_begin(&arena, 1); + String8 current_dir = os_get_current_path(scratch.arena); + String8List list = {0}; + str8_list_push(scratch.arena, &list, current_dir); + str8_list_push(scratch.arena, &list, path); + String8 temp_full_path = str8_list_join(scratch.arena, &list, &(StringJoin){ .sep = str8_lit_comp("\\") }); + String8List split_full_path = str8_split_path(scratch.arena, temp_full_path); + str8_path_list_resolve_dots_in_place(&split_full_path, PathStyle_WindowsAbsolute); + full_path = str8_list_join(arena, &split_full_path, &(StringJoin){ .sep = str8_lit_comp("\\") }); + scratch_end(scratch); + } else { + full_path = push_str8_copy(arena, path); + } + return full_path; +} + +internal B32 +os_folder_path_exists(String8 path) +{ + Temp scratch = scratch_begin(0,0); + + String8 actual_path = path; + if (os_w32_is_path_relative_current_directory(path)) { + String8 current = os_get_current_path(scratch.arena); + String8List list 
= {0};
+    str8_list_push(scratch.arena, &list, current);
+    str8_list_push(scratch.arena, &list, path);
+    StringJoin join = { .sep = str8_lit_comp("\\") };
+    actual_path =str8_list_join(scratch.arena, &list, &join);
+  }
+
+  String16 path16 = str16_from_8(scratch.arena, actual_path);
+  DWORD attributes = GetFileAttributesW((WCHAR *)path16.str);
+  B32 exists = (attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY);
+  scratch_end(scratch);
+  return exists;
+}
+
+// Adjusts SE_LOCK_MEMORY_NAME on the current process token.
+//
+// toggle != 0 requests SE_PRIVILEGE_ENABLED, toggle == 0 requests
+// SE_PRIVILEGE_REMOVED. Returns 1 only when the token was opened, the
+// privilege was looked up and the adjustment was fully applied; 0 otherwise.
+//
+// NOTE(review): SE_PRIVILEGE_REMOVED takes the privilege out of the token
+// instead of disabling it, so a later call with toggle != 0 presumably cannot
+// turn it back on - confirm that this is the intended meaning of "toggle".
+internal B32
+os_set_large_pages(B32 toggle)
+{
+  B32 is_ok = 0;
+
+  HANDLE token;
+  if(OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token))
+  {
+    LUID luid;
+    if(LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid))
+    {
+      TOKEN_PRIVILEGES priv;
+      priv.PrivilegeCount = 1;
+      priv.Privileges[0].Luid = luid;
+      priv.Privileges[0].Attributes = toggle ? SE_PRIVILEGE_ENABLED : SE_PRIVILEGE_REMOVED;
+
+      // AdjustTokenPrivileges returns a nonzero BOOL on success, and it also
+      // succeeds when not every requested privilege could be assigned;
+      // GetLastError() tells ERROR_SUCCESS apart from ERROR_NOT_ALL_ASSIGNED.
+      if (AdjustTokenPrivileges(token, 0, &priv, sizeof(priv), 0, 0) && GetLastError() == ERROR_SUCCESS) {
+        is_ok = 1;
+      }
+    }
+    CloseHandle(token);
+  }
+  return is_ok;
+}
+
+// Returns the creation time of the current process as reported by
+// GetProcessTimes, converted with w32_unix_time_from_file_time.
+// Returns 0 when GetProcessTimes fails.
+internal U32
+os_get_process_start_time_unix(void)
+{
+  HANDLE handle = GetCurrentProcess();
+  FILETIME start_time = {0};
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  if (GetProcessTimes(handle, &start_time, &exit_time, &kernel_time, &user_time)) {
+    return w32_unix_time_from_file_time(start_time);
+  }
+  return 0;
+}
+
diff --git a/src/linker/os_ext/os_inc.c b/src/linker/os_ext/os_inc.c
new file mode 100644
index 00000000..db710e2f
--- /dev/null
+++ b/src/linker/os_ext/os_inc.c
@@ -0,0 +1,14 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#include "core/os_core.c"
+
+#if OS_WINDOWS
+//# include "core/win32/os_core_win32.c"
+#elif OS_LINUX
+//# include "core/linux/os_core_linux.c"
+#else
+# error no OS layer setup
+#endif
+
+
diff --git a/src/linker/os_ext/os_inc.h b/src/linker/os_ext/os_inc.h
new file mode 100644
index
00000000..48b61692 --- /dev/null +++ b/src/linker/os_ext/os_inc.h @@ -0,0 +1,15 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +#include "core/os_core.h" + +#if OS_WINDOWS +//# include "core/win32/os_core_win32.h" +#elif OS_LINUX +//# include "core/linux/os_core_linux.h" +#else +# error no OS layer setup +#endif + diff --git a/src/linker/path_ext/path.c b/src/linker/path_ext/path.c new file mode 100644 index 00000000..ac41560d --- /dev/null +++ b/src/linker/path_ext/path.c @@ -0,0 +1,84 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal String8 +make_file_name_with_ext(Arena *arena, String8 file_name, String8 ext) +{ + String8 file_name_no_ext = str8_chop_last_dot(file_name); + String8 result = push_str8f(arena, "%S.%S", file_name_no_ext, ext); + return result; +} + +internal String8 +make_file_path_with_ext(Arena *arena, String8 file_name, String8 ext) +{ + Temp scratch = scratch_begin(&arena, 1); + + String8 curr = os_get_current_path(scratch.arena); + String8 name = make_file_name_with_ext(scratch.arena, str8_skip_last_slash(file_name), ext); + + String8List list = {0}; + str8_list_push(scratch.arena, &list, curr); + str8_list_push(scratch.arena, &list, name); + String8 result = str8_path_list_join_by_style(arena, &list, PathStyle_SystemAbsolute); + + scratch_end(scratch); + return result; +} + +internal String8 +path_char_from_style(PathStyle style) +{ + String8 result = str8_zero(); + switch (style) { + case PathStyle_Null: break; + case PathStyle_Relative: break; + case PathStyle_WindowsAbsolute: result = str8_lit("\\"); break; + case PathStyle_UnixAbsolute: result = str8_lit("/"); break; + } + return result; +} + +internal String8 +path_convert_slashes(Arena *arena, String8 path, PathStyle path_style) +{ + Temp scratch = scratch_begin(&arena, 1); + String8List list = str8_split_path(scratch.arena, 
path); + StringJoin join = {0}; + join.sep = path_char_from_style(path_style); + String8 result = str8_list_join(arena, &list, &join); + scratch_end(scratch); + return result; +} + +internal String8 +path_canon_from_regular_path(Arena *arena, String8 path) +{ + Temp scratch = scratch_begin(&arena, 1); + String8 result; + result = lower_from_str8(scratch.arena, path); + result = path_convert_slashes(arena, result, PathStyle_UnixAbsolute); + scratch_end(scratch); + return result; +} + +struct { + String8 string; + PathStyle path_style; +} g_path_style_map[] = { + { str8_lit_comp("windows"), PathStyle_WindowsAbsolute }, + { str8_lit_comp("unix"), PathStyle_UnixAbsolute }, + { str8_lit_comp("system"), PathStyle_SystemAbsolute }, +}; + +internal PathStyle +path_style_from_string(String8 string) +{ + for (U64 i = 0; i < ArrayCount(g_path_style_map); ++i) { + if (str8_match(g_path_style_map[i].string, string, StringMatchFlag_CaseInsensitive)) { + return g_path_style_map[i].path_style; + } + } + return PathStyle_Null; +} + diff --git a/src/linker/path_ext/path.h b/src/linker/path_ext/path.h new file mode 100644 index 00000000..383d1011 --- /dev/null +++ b/src/linker/path_ext/path.h @@ -0,0 +1,11 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +internal String8 make_file_name_with_ext(Arena *arena, String8 file_name, String8 ext); +internal String8 make_file_path_with_ext(Arena *arena, String8 file_name, String8 ext); +internal String8 path_convert_slashes(Arena *arena, String8 path, PathStyle path_style); +internal String8 path_canon_from_regular_path(Arena *arena, String8 path); +internal PathStyle path_style_from_string(String8 string); + diff --git a/src/linker/pdb_ext/msf_builder.c b/src/linker/pdb_ext/msf_builder.c new file mode 100644 index 00000000..11afbf80 --- /dev/null +++ b/src/linker/pdb_ext/msf_builder.c @@ -0,0 +1,2223 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed 
under the MIT license (https://opensource.org/license/mit/)
+
+// Returns the number of bytes backing one page data node: the value of
+// msf_get_fpm_interval_correct(page_size) multiplied by page_size.
+internal U64
+msf_get_data_node_size(MSF_UInt page_size)
+{
+  U64 interval = msf_get_fpm_interval_correct(page_size);
+  U64 bytes_per_interval = interval * (U64)page_size;
+  return bytes_per_interval;
+}
+
+// Appends `count` nodes to `list`. Each node gets its own buffer of
+// msf_get_data_node_size(page_size) bytes pushed on `arena`.
+internal void
+msf_page_data_list_push(Arena *arena, MSF_PageDataList *list, MSF_UInt page_size, MSF_UInt count)
+{
+  U64 data_size = msf_get_data_node_size(page_size);
+  for (MSF_UInt i = 0; i < count; i += 1) {
+    // TODO: clearing memory to zero here is expensive,
+    // with 4KiB pages we have to zero-out 128 MiB
+    // memory block
+    //
+    // we can make API for stream allocation to let user
+    // choose between zeroed and dirty allocations
+    //
+    U8 *data = push_array(arena, U8, data_size);
+
+    // init node
+    MSF_PageDataNode *node = push_array_no_zero(arena, MSF_PageDataNode, 1);
+    node->prev = 0;
+    node->next = 0;
+    node->data = data;
+
+    // push node to list
+    DLLPushBack(list->first, list->last, node);
+    list->count += 1;
+  }
+}
+
+// Detaches up to `count` nodes from the tail of `list` and returns them as a
+// new list. Nodes are taken last-first, so the returned list holds them in
+// reverse of their order in `list`.
+internal MSF_PageDataList
+msf_page_data_list_pop(MSF_PageDataList *list, MSF_UInt count)
+{
+  MSF_PageDataList result = {0};
+
+  MSF_UInt to_remove = Min(count, list->count);
+  for (MSF_UInt i = 0; i < to_remove; i += 1) {
+    MSF_PageDataNode *node = list->last;
+    DLLRemove(list->first, list->last, node);
+
+    node->prev = 0;
+    node->next = 0;
+
+    DLLPushBack(result.first, result.last, node);
+    result.count += 1;
+  }
+  list->count -= to_remove;
+
+  return result;
+}
+
+// Thin wrapper over DLLConcatInPlace(list, to_concat).
+internal void
+msf_page_data_list_concat_in_place(MSF_PageDataList *list, MSF_PageDataList *to_concat)
+{
+  DLLConcatInPlace(list, to_concat);
+}
+
+// Appends nodes to `list` that expose `data` as page data, one node per
+// msf_get_data_node_size(page_size) bytes.
+//
+// Every node except the last points straight into `data` (no copy). The last
+// node also points into `data` when data.size is a multiple of the node size;
+// otherwise the remaining bytes are copied into a node-sized buffer pushed on
+// `arena`, and the bytes of that buffer past the copied tail are left
+// uninitialized. Empty `data` appends nothing.
+internal void
+msf_set_page_data_list(Arena *arena, MSF_PageDataList *list, MSF_UInt page_size, String8 data)
+{
+  // with no bytes to map node_count would be 0, and an unsigned
+  // "node_count - 1" loop bound wraps around
+  if (data.size == 0) {
+    return;
+  }
+
+  ProfBeginFunction();
+
+  U64 node_size = msf_get_data_node_size(page_size);
+  U64 node_count = CeilIntegerDiv(data.size, node_size);
+
+  // all nodes but the last point into the caller's buffer
+  U64 node_idx;
+  for (node_idx = 0; node_idx + 1 < node_count; node_idx += 1) {
+    MSF_PageDataNode *node = push_array(arena, MSF_PageDataNode, 1);
+    node->data = data.str + node_idx * node_size;
+    // doubly-linked push, same as msf_page_data_list_push, so `prev` is
+    // valid when msf_page_data_list_pop unlinks nodes from the tail
+    DLLPushBack(list->first, list->last, node);
+    list->count += 1;
+  }
+
+  ProfBegin("Last Page Handle");
+  B32 is_last_node_size_aligned = (data.size % node_size) == 0;
+  U8 *last_node_data = 0;
+  if (is_last_node_size_aligned) {
+    last_node_data = data.str + node_idx * node_size;
+  } else {
+    // partial tail: copy it into a node-sized buffer
+    U64 last_node_size = data.size % node_size;
+    last_node_data = push_array_no_zero(arena, U8, node_size);
+    MemoryCopy(last_node_data, data.str + node_idx * node_size, last_node_size);
+  }
+  ProfEnd();
+
+  MSF_PageDataNode *last_node = push_array(arena, MSF_PageDataNode, 1);
+  last_node->data = last_node_data;
+  DLLPushBack(list->first, list->last, last_node);
+  list->count += 1;
+
+  ProfEnd();
+}
+
+// Returns a page_size-byte view of page `pn`: walks `list` to the node that
+// contains byte offset pn * page_size and offsets into that node's buffer.
+// The walk does not check for the end of the list, so `pn` has to lie within
+// the data held by `list`.
+internal String8
+msf_data_from_pn(MSF_PageDataList list, MSF_UInt page_size, MSF_PageNumber pn)
+{
+  U64 node_size = msf_get_data_node_size(page_size);
+  U64 page_offset = (U64)pn * (U64)page_size;
+  U64 data_node_idx = page_offset / node_size;
+  MSF_PageDataNode *node = list.first;
+  for (U64 i = 0; i < data_node_idx; i += 1) {
+    node = node->next;
+  }
+  U64 node_offset = page_offset % node_size;
+  U8 *ptr = node->data + node_offset;
+  String8 data = str8(ptr, page_size);
+  return data;
+}
+
+////////////////////////////////
+
+// Pushes a new MSF_StreamNode on `arena`, appends it to `list` and returns it.
+internal MSF_StreamNode *
+msf_stream_list_push(Arena *arena, MSF_StreamList *list)
+{
+  MSF_StreamNode *n = push_array(arena, MSF_StreamNode, 1);
+  DLLPushBack(list->first, list->last, n);
+  list->count += 1;
+  return n;
+}
+
+// Unlinks `node` from `list`; asserts that the list is not empty.
+internal void
+msf_stream_list_remove(MSF_StreamList *list, MSF_StreamNode *node)
+{
+  Assert(list->count > 0);
+  DLLRemove(list->first, list->last, node);
+  list->count -= 1;
+}
+
+////////////////////////////////
+
+// Appends an already allocated `node` to `list`.
+internal void
+msf_page_list_push_node(MSF_PageList *list, MSF_PageNode *node)
+{
+  DLLPushBack(list->first, list->last, node);
+  list->count += 1;
+}
+
+internal MSF_PageNode *
+msf_page_list_push(Arena *arena,
MSF_PageList *list)
+{
+  MSF_PageNode *node = push_array(arena, MSF_PageNode, 1);
+  msf_page_list_push_node(list, node);
+  return node;
+}
+
+// Unlinks and returns the last node of `list`, or NULL when list->count is 0.
+internal MSF_PageNode *
+msf_page_list_pop_last(MSF_PageList *list)
+{
+  MSF_PageNode *node = NULL;
+  if (list->count) {
+    node = list->last;
+    DLLRemove(list->first, list->last, node);
+    list->count -= 1;
+  }
+  return node;
+}
+
+// Thin wrapper over DLLConcatInPlace(list, to_concat).
+internal void
+msf_page_list_concat_in_place(MSF_PageList *list, MSF_PageList *to_concat)
+{
+  DLLConcatInPlace(list, to_concat);
+}
+
+// Copies the page number of every node of `list`, in list order, into an
+// array of list.count entries pushed on `arena`.
+internal MSF_PageNumber *
+msf_page_list_to_arr(Arena *arena, MSF_PageList list)
+{
+  MSF_PageNumber *arr = push_array(arena, MSF_PageNumber, list.count);
+  MSF_UInt i = 0;
+  for (MSF_PageNode *node = list.first; node != 0; node = node->next, i += 1) {
+    arr[i] = node->pn;
+  }
+  return arr;
+}
+
+// Returns the node at position `index` of `page_list`. Indices above
+// count/2 are reached by walking backwards from the last node, all others by
+// walking forwards from the first. Returns 0 when a walk runs off the list.
+//
+// NOTE(review): for index >= page_list.count (count > 0) the backward loop
+// does not execute and page_list.last is returned instead of 0 - confirm
+// whether callers rely on that before tightening it.
+internal MSF_PageNode *
+msf_page_from_index(MSF_PageList page_list, MSF_UInt index)
+{
+  MSF_PageNode *page;
+
+  B32 scan_from_last_node = index > page_list.count/2;
+  if (scan_from_last_node) {
+    page = page_list.last;
+    if (page_list.count > 0) {
+      // step back from the last node (position count - 1) down to `index`
+      for (MSF_UInt i = page_list.count - 1; i > index; i -= 1) {
+        page = page->prev;
+        if (!page) {
+          return 0;
+        }
+      }
+    }
+  } else {
+    page = page_list.first;
+    for (MSF_UInt i = 0; i < index; i += 1) {
+      page = page->next;
+      if (!page) {
+        return 0;
+      }
+    }
+  }
+  return page;
+}
+
+// Appends one node to `list` for each of the `pn_count` page numbers in
+// `pn_arr`, storing the page number in the node. `page_data_list` and
+// `page_size` are only used to compute the upper bound for the range check.
+internal void
+msf_page_list_push_extant_page_arr(Arena *arena, MSF_PageList *list,
+                                   MSF_PageDataList page_data_list, MSF_UInt page_size,
+                                   MSF_PageNumber *pn_arr, MSF_UInt pn_count)
+{
+  U64 node_size = msf_get_data_node_size(page_size);
+  U64 data_max = page_data_list.count * node_size;
+  for (MSF_PageNumber *pn_ptr = pn_arr, *pn_opl = pn_ptr + pn_count; pn_ptr < pn_opl; pn_ptr += 1) {
+    // is page number valid?
+    // evaluate the byte range in 64 bits so the product cannot wrap in the
+    // narrower page-number type before it is compared against data_max
+    Assert((U64)*pn_ptr * (U64)page_size + (U64)page_size <= data_max);
+
+    // init page node
+    MSF_PageNode *page_node = msf_page_list_push(arena, list);
+    page_node->pn = *pn_ptr;
+  }
+}
+
+// Single-page convenience wrapper over msf_page_list_push_extant_page_arr.
+internal void
+msf_page_list_push_extant_page(Arena *arena, MSF_PageList *list, MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber pn)
+{
+  msf_page_list_push_extant_page_arr(arena, list, page_data_list, page_size, &pn, 1);
+}
+
+#if LNK_PARANOID
+// Reads the FPM bit of every page in `page_list` and compares it against
+// `test_state`. The assert on a mismatch is commented out, so this currently
+// has no observable effect.
+internal void
+msf_check_fpm_bits_for_page_list(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber active_fpm, MSF_PageList page_list, MSF_UInt test_state)
+{
+  for (MSF_PageNode *page_node = page_list.first; page_node != 0; page_node = page_node->next) {
+    MSF_UInt state = msf_get_fpm_page_bit_state(page_data_list, page_size, active_fpm, page_node->pn);
+    if (state != test_state) {
+      //Assert(!"state bit doesn't match");
+    }
+  }
+}
+#endif
+
+////////////////////////////////
+
+// Returns the number of pages of `page_size` bytes needed to hold `data_size`
+// bytes (rounded up). The 64-bit quotient is narrowed through safe_cast_u32,
+// the same way msf_get_page_count_cap narrows its result, instead of by an
+// implicit conversion.
+// NOTE(review): assumes MSF_UInt is 32 bits wide, as the safe_cast_u32
+// results returned from sibling functions suggest - confirm.
+internal MSF_UInt
+msf_count_pages(MSF_UInt page_size, U64 data_size)
+{
+  U64 page_count = CeilIntegerDiv(data_size, (U64)page_size);
+  return safe_cast_u32(page_count);
+}
+
+// Returns how many pages fit into the data nodes currently held by
+// `page_data_list`.
+internal MSF_PageNumber
+msf_get_page_count_cap(MSF_PageDataList page_data_list, MSF_UInt page_size)
+{
+  U64 node_size = msf_get_data_node_size(page_size);
+  U64 file_size = page_data_list.count * node_size;
+  U64 count = CeilIntegerDiv(file_size, (U64)page_size);
+  return safe_cast_u32(count);
+}
+
+////////////////////////////////
+
+// Returns the bytes of page `fpm_pn` viewed as an array of U32 words. The
+// array points into the page data, it is not a copy.
+internal U32Array
+msf_fpm_data_from_pn(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber fpm_pn)
+{
+  String8 raw_fpm = msf_data_from_pn(page_data_list, page_size, fpm_pn);
+  U32Array fpm_data;
+  fpm_data.count = raw_fpm.size / sizeof(fpm_data.v[0]);
+  fpm_data.v = (U32*)raw_fpm.str;
+  return fpm_data;
+}
+
+// Number of pages one FPM page can describe: one bit per page, i.e.
+// page_size * MSF_BITS_PER_CHAR.
+internal MSF_UInt
+msf_get_fpm_interval_correct(MSF_UInt page_size)
+{
+  return page_size * MSF_BITS_PER_CHAR;
+}
+
+// Spacing, in pages, at which FPM pages are actually placed. It is page_size
+// rather than page_size * MSF_BITS_PER_CHAR; msf_grow's comment attributes
+// this to the interval used by the MS implementation.
+internal MSF_UInt
+msf_get_fpm_interval_wrong(MSF_UInt page_size)
+{
+  return page_size;
+}
+
+internal
MSF_UInt +msf_get_fpm_idx_from_pn(MSF_UInt page_size, MSF_PageNumber pn) +{ + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(page_size); + MSF_UInt fpm_idx = pn / fpm_interval_correct; + return fpm_idx; +} + +internal MSF_UInt +msf_get_fpm_page_count(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_UInt fpm_interval) +{ + U64 node_size = msf_get_data_node_size(page_size); + U64 file_size = (U64)page_data_list.count * node_size; + U64 file_page_count = CeilIntegerDiv(file_size, page_size); + U64 fpm_page_count = CeilIntegerDiv(file_page_count, (U64)fpm_interval); + return safe_cast_u32(fpm_page_count); +} + +internal MSF_PageNumberArray +msf_get_fpm_page_arr(Arena *arena, MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_UInt active_fpm) +{ + Assert(active_fpm == MSF_FPM0 || active_fpm == MSF_FPM1); + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(page_size); + MSF_UInt fpm_interval_wrong = msf_get_fpm_interval_wrong(page_size); + MSF_UInt page_count = msf_get_page_count_cap(page_data_list, page_size); + MSF_PageNumberArray arr; + arr.count = CeilIntegerDiv(page_count, fpm_interval_correct); + arr.v = push_array(arena, MSF_PageNumber, arr.count); + for (MSF_UInt interval_idx = 0; interval_idx < arr.count; interval_idx += 1) { + arr.v[interval_idx] = active_fpm + interval_idx * fpm_interval_wrong; + } + return arr; +} + +internal MSF_PageNumber +msf_get_fpm_from_page_number(MSF_UInt page_size, MSF_PageNumber active_fpm, MSF_PageNumber pn) +{ + Assert(active_fpm == 1 || active_fpm == 2); + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(page_size); + MSF_UInt fpm_interval_wrong = msf_get_fpm_interval_wrong(page_size); + MSF_PageNumber fpm_pn = active_fpm; + fpm_pn += (pn / fpm_interval_correct) * fpm_interval_wrong; + return fpm_pn; +} + +internal MSF_UInt +msf_get_fpm_page_bit_state(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber active_fpm, MSF_PageNumber pn) +{ + // fetch FPM + 
MSF_PageNumber fpm_pn = msf_get_fpm_from_page_number(page_size, active_fpm, pn); + U32Array fpm_data = msf_fpm_data_from_pn(page_data_list, page_size, fpm_pn); + + // get page bit + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(page_size); + MSF_UInt page_bit_idx = pn % fpm_interval_correct; + MSF_UInt state = bit_array_get_bit32(fpm_data, page_bit_idx); + + return state; +} + +internal void +msf_set_fpm_bit_(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber active_fpm, MSF_PageNumber pn, B32 state) +{ + // fetch FPM + MSF_PageNumber fpm_pn = msf_get_fpm_from_page_number(page_size, active_fpm, pn); + U32Array fpm_data = msf_fpm_data_from_pn(page_data_list, page_size, fpm_pn); + + // set page bit + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(page_size); + MSF_UInt page_bit_idx = pn % fpm_interval_correct; + bit_array_set_bit32(fpm_data, page_bit_idx, state); +} + +internal void +msf_set_fpm_bit(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber active_fpm, MSF_PageNumber pn, B32 state) +{ + msf_set_fpm_bit_(page_data_list, page_size, active_fpm, pn, state); +} + +internal B32 +msf_grow(MSF_Context *msf, MSF_PageNumber new_page_count) +{ + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(msf->page_size); + MSF_UInt fpm_interval_wrong = msf_get_fpm_interval_wrong(msf->page_size); + + // check alloc limit + U64 new_page_count64 = AlignPow2((U64)new_page_count, (U64)fpm_interval_correct); + B32 is_overflowed = new_page_count64 > MSF_PN_MAX; + if (is_overflowed) { + return 0; + } + + // check can only grow MSF here + new_page_count = safe_cast_u32(new_page_count64); + if (new_page_count < msf->page_count) { + return 0; + } + + // compute number of FPM pages to allocate + // + // we allocate 8 times more FPMs because in MS impl they use wrong interval: + // https://github.com/microsoft/microsoft-pdb/blob/master/PDB/msf/msf.cpp#L509 + // + MSF_PageNumber prev_fpm_page_cap_wrong = 
msf_get_fpm_page_count(msf->page_data_list, msf->page_size, fpm_interval_wrong); + MSF_PageNumber curr_fpm_page_cap_wrong = CeilIntegerDiv(new_page_count, fpm_interval_wrong); + MSF_PageNumber alloc_count_wrong = curr_fpm_page_cap_wrong - prev_fpm_page_cap_wrong; + MSF_PageNumber next_pn_wrong = prev_fpm_page_cap_wrong * fpm_interval_wrong; + MSF_PageNumber end_pn_wrong = next_pn_wrong + alloc_count_wrong * fpm_interval_wrong; + + // compute correct number of FPM pages to grow + MSF_PageNumber prev_fpm_page_cap_correct = msf_get_fpm_page_count(msf->page_data_list, msf->page_size, fpm_interval_correct); + MSF_PageNumber curr_fpm_page_cap_correct = CeilIntegerDiv(new_page_count, fpm_interval_correct); + MSF_PageNumber alloc_count_correct = curr_fpm_page_cap_correct - prev_fpm_page_cap_correct; + MSF_PageNumber next_pn_correct = prev_fpm_page_cap_correct * fpm_interval_correct; + MSF_PageNumber end_pn_correct = next_pn_correct + alloc_count_correct * fpm_interval_correct; + + MSF_PageNumber to_alloc = alloc_count_correct; + + // are there unused data nodes? 
+ if (msf->page_data_pool.count) { + MSF_PageNumber pool_alloc_count = Min(msf->page_data_pool.count, alloc_count_correct); + MSF_PageDataList page_data_list = msf_page_data_list_pop(&msf->page_data_pool, pool_alloc_count); + msf_page_data_list_concat_in_place(&msf->page_data_list, &page_data_list); + to_alloc -= pool_alloc_count; + } + + // push enough data nodes to encompass allocated FPMs + msf_page_data_list_push(msf->arena, &msf->page_data_list, msf->page_size, to_alloc); + + // set FPM bits to free + for (MSF_PageNumber pn = next_pn_wrong; pn < end_pn_wrong; pn += fpm_interval_wrong) { + MSF_PageNumber fpm0_pn = pn + MSF_FPM0; + MSF_PageNumber fpm1_pn = pn + MSF_FPM1; + String8 fpm0_data = msf_data_from_pn(msf->page_data_list, msf->page_size, fpm0_pn); + String8 fpm1_data = msf_data_from_pn(msf->page_data_list, msf->page_size, fpm1_pn); + MemorySet(fpm0_data.str, 0xFF, msf->page_size); + MemorySet(fpm1_data.str, 0xFF, msf->page_size); + } + + // set correct FPM bits + for (MSF_PageNumber pn = next_pn_correct; pn < end_pn_correct; pn += fpm_interval_correct) { + MSF_PageNumber fpm0_pn = pn + MSF_FPM0; + MSF_PageNumber fpm1_pn = pn + MSF_FPM1; + msf_set_fpm_bit(msf->page_data_list, msf->page_size, MSF_FPM0, fpm0_pn, MSF_PAGE_STATE_ALLOC); + msf_set_fpm_bit(msf->page_data_list, msf->page_size, MSF_FPM0, fpm1_pn, MSF_PAGE_STATE_ALLOC); + msf_set_fpm_bit(msf->page_data_list, msf->page_size, MSF_FPM1, fpm0_pn, MSF_PAGE_STATE_ALLOC); + msf_set_fpm_bit(msf->page_data_list, msf->page_size, MSF_FPM1, fpm1_pn, MSF_PAGE_STATE_ALLOC); + } + + // update context + msf->page_count += alloc_count_wrong * 2; + + return 1; +} + +#if 0 +internal B32 +msf_shrink(MSF_Context *msf, MSF_PageNumber new_page_count) +{ + MSF_UInt fpm_interval_wrong = msf_get_fpm_interval_wrong(msf->page_size); + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(msf->page_size); + + U64 new_page_count64 = AlignPow2((U64)new_page_count, (U64)fpm_interval_correct); + new_page_count = 
safe_cast_u32(new_page_count64); + Assert(new_page_count < msf->page_count); + + // compute number of FPM pages to deallocate + MSF_PageNumber prev_fpm_page_count_wrong = msf_get_fpm_page_count(msf->page_data_list, msf->page_size, fpm_interval_wrong); + MSF_PageNumber curr_fpm_page_count_wrong = CeilIntegerDiv(new_page_count, fpm_interval_wrong); + MSF_PageNumber dealloc_count_wrong = prev_fpm_page_count_wrong - curr_fpm_page_count_wrong; + + // compute next FPM page number + MSF_PageNumber next_pn = prev_fpm_page_count_wrong * fpm_interval_wrong; + MSF_PageNumber end_pn = next_pn - dealloc_count_wrong * fpm_interval_wrong; + + // pop data nodes + MSF_PageNumber prev_fpm_page_count_correct = msf_get_fpm_page_count(msf->page_data_list, msf->page_size, fpm_interval_correct); + MSF_PageNumber curr_fpm_page_count_correct = CeilIntegerDiv(new_page_count, fpm_interval_correct); + MSF_PageNumber dealloc_count_correct = prev_fpm_page_count_correct - curr_fpm_page_count_correct; + MSF_PageDataList free_page_data_list = msf_page_data_list_pop(&msf->page_data_list, dealloc_count_correct); + msf_page_data_list_concat_in_place(&msf->page_data_pool, &free_page_data_list); + + for (MSF_PageNumber pn = next_pn; pn > end_pn; pn -= fpm_interval_wrong) { + MSF_PageNumber fpm0_pn = pn + MSF_FPM0; + MSF_PageNumber fpm1_pn = pn + MSF_FPM1; + + // free FPM pages + msf_set_fpm_bit(msf->page_data_list, msf->page_size, 1, fpm0_pn, MSF_PAGE_STATE_FREE); + msf_set_fpm_bit(msf->page_data_list, msf->page_size, 1, fpm1_pn, MSF_PAGE_STATE_FREE); + msf_set_fpm_bit(msf->page_data_list, msf->page_size, 2, fpm0_pn, MSF_PAGE_STATE_FREE); + msf_set_fpm_bit(msf->page_data_list, msf->page_size, 2, fpm1_pn, MSF_PAGE_STATE_FREE); + } + + // update context + msf->page_count -= dealloc_count_wrong * 2; + + return true; +} +#endif + +internal MSF_PageNumber * +msf_alloc_pn_arr(Arena *arena, MSF_Context *msf, MSF_UInt alloc_count) +{ + // make sure FPM has enough space for new page numbers + // + // we grow 
FPM at correct intervals here because we pre-alloc unused FPM pages ahead of time + MSF_UInt curr_page_cap = msf_get_page_count_cap(msf->page_data_list, msf->page_size); + MSF_UInt new_page_count = msf->page_count + alloc_count; + if (new_page_count > curr_page_cap) { + B32 is_fpm_alloced = msf_grow(msf, new_page_count); + if (!is_fpm_alloced) { + return 0; + } + } + + Temp scratch = scratch_begin(&arena, 1); + + // reserve memory for page numbers + MSF_PageNumber *pn_arr = push_array(arena, MSF_PageNumber, alloc_count); + + MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(msf->page_size); + MSF_UInt fpm_interval_wrong = msf_get_fpm_interval_wrong(msf->page_size); + + // get first FPM page + MSF_PageNumberArray fpm_pn_arr = msf_get_fpm_page_arr(scratch.arena, msf->page_data_list, msf->page_size, msf->active_fpm); + + for (MSF_UInt alloc_idx = 0; alloc_idx < alloc_count; ) { + // get FPM bits + MSF_UInt fpm_idx = msf->fpm_rover / fpm_interval_correct; + Assert(fpm_idx < fpm_pn_arr.count); + MSF_PageNumber fpm_pn = fpm_pn_arr.v[fpm_idx]; + U32Array fpm_data = msf_fpm_data_from_pn(msf->page_data_list, msf->page_size, fpm_pn); + + // scan FPM for free bit + MSF_UInt fpm_rover_page_relative = msf->fpm_rover % fpm_interval_correct; + U32 bit_idx = bit_array_scan_left_to_right32(fpm_data, fpm_rover_page_relative, fpm_interval_correct, MSF_PAGE_STATE_FREE); + + B32 is_full = (bit_idx >= fpm_interval_correct); + if (is_full) { + msf->fpm_rover = (fpm_idx + 1) * fpm_interval_correct; + continue; + } + + // compute page number + MSF_PageNumber pn = bit_idx + (fpm_idx * fpm_interval_correct); + + // make sure unused FPMs aren't allocated for regular streams, + // we used to mark with free bits unused FPMs but in VS2022 + // update they started to check for these bits and VS began + // to error out with "PDB format is not supported" message + B32 is_pn_valid = (pn % fpm_interval_wrong) != MSF_FPM0 && + (pn % fpm_interval_wrong) != MSF_FPM1; + if (is_pn_valid) { + // 
update FPM + bit_array_set_bit32(fpm_data, bit_idx, MSF_PAGE_STATE_ALLOC); + + // store page number + pn_arr[alloc_idx++] = pn; + } + + // advance FPM rover + msf->fpm_rover = pn + 1; + } + + // update context + msf->page_count += alloc_count; + + scratch_end(scratch); + return pn_arr; +} + +internal void +msf_free_pn_arr(MSF_Context *msf, MSF_PageNumber *pn_arr, MSF_UInt pn_count) +{ + // set FPM bits + for (MSF_UInt i = 0; i < pn_count; i += 1) { + MSF_PageNumber pn = pn_arr[i]; + msf_set_fpm_bit(msf->page_data_list, msf->page_size, msf->active_fpm, pn, MSF_PAGE_STATE_FREE); + + // update FPM cursor + msf->fpm_rover = Min(msf->fpm_rover, pn); + } + + // update context + Assert(msf->page_count >= pn_count); + msf->page_count -= pn_count; +} + +internal MSF_PageList +msf_alloc_pages(MSF_Context *msf, MSF_UInt alloc_count) +{ + Temp scratch = scratch_begin(0, 0); + + MSF_PageList alloc_list = {0}; + MSF_PageNumber *pn_arr = msf_alloc_pn_arr(scratch.arena, msf, alloc_count); + if (pn_arr) { + for (MSF_UInt page_idx = 0; page_idx < alloc_count; page_idx += 1) { + // get page node + MSF_PageNode *page_node = 0; + if (msf->page_pool.count) { + page_node = msf_page_list_pop_last(&msf->page_pool); + msf_page_list_push_node(&alloc_list, page_node); + } else { + page_node = msf_page_list_push(msf->arena, &alloc_list); + } + + // copy page number + page_node->pn = pn_arr[page_idx]; + } + } + + scratch_end(scratch); + return alloc_list; +} + +internal void +msf_free_pages(MSF_Context *msf, MSF_PageList *page_list) +{ + Temp scratch = scratch_begin(0, 0); + + // free page numbers + MSF_PageNumber *pn_arr = msf_page_list_to_arr(scratch.arena, *page_list); + msf_free_pn_arr(msf, pn_arr, page_list->count); + + // push free nodes + msf_page_list_concat_in_place(&msf->page_pool, page_list); + + scratch_end(scratch); +} + +internal MSF_PageNumber +msf_find_max_pn_(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumberArray fpm_pn_arr) +{ + MSF_PageNumber max_pn = 0; + + 
MSF_UInt fpm_interval_correct = msf_get_fpm_interval_correct(page_size); + MSF_UInt fpm_interval_wrong = msf_get_fpm_interval_wrong(page_size); + MSF_UInt fpm_page_count = fpm_interval_correct / fpm_interval_wrong; + for (MSF_Int fpm_pn_idx = (MSF_Int)fpm_pn_arr.count - 1; fpm_pn_idx >= 0; fpm_pn_idx -= 1) { + MSF_PageNumber fpm_pn = fpm_pn_arr.v[fpm_pn_idx]; + U32Array fpm_data = msf_fpm_data_from_pn(page_data_list, page_size, fpm_pn); + + // we have to work around the fact that FPM bits are always alloced + // and also there is a trail of unused FPM groups too + U32 bit_idx = max_U32; + for (MSF_Int i = fpm_page_count - 1; i >= 0; i -= 1) { + U32 fpm_lo = i * fpm_interval_wrong + 3; // skip first page bit and FPM group bits + U32 fpm_hi = i * fpm_interval_wrong + fpm_interval_wrong; + bit_idx = bit_array_scan_right_to_left32(fpm_data, fpm_lo, fpm_hi, MSF_PAGE_STATE_ALLOC); + if (bit_idx <= fpm_interval_correct) { + break; + } + } + + // check first page bit + if (bit_idx >= fpm_interval_correct) { + bit_idx = bit_array_scan_left_to_right32(fpm_data, 0, 1, MSF_PAGE_STATE_ALLOC); + if (bit_idx >= fpm_interval_correct) { + continue; + } + } + + // compute max page number + MSF_PageNumber pn = bit_idx + (MSF_UInt)fpm_pn_idx * fpm_interval_correct; + max_pn = Max(max_pn, pn); + + break; + } + + return max_pn; +} + +internal MSF_PageNumber +msf_find_max_pn(MSF_PageDataList page_data_list, MSF_UInt page_size) +{ + Temp scratch = scratch_begin(0, 0); + MSF_PageNumberArray fpm0_pn_arr = msf_get_fpm_page_arr(scratch.arena, page_data_list, page_size, MSF_FPM0); + MSF_PageNumberArray fpm1_pn_arr = msf_get_fpm_page_arr(scratch.arena, page_data_list, page_size, MSF_FPM1); + MSF_PageNumber fpm0_max = msf_find_max_pn_(page_data_list, page_size, fpm0_pn_arr); + MSF_PageNumber fpm1_max = msf_find_max_pn_(page_data_list, page_size, fpm1_pn_arr); + MSF_PageNumber max_pn = Max(fpm0_max, fpm1_max); + scratch_end(scratch); + return max_pn; +} + +//////////////////////////////// + 
+internal B32 +msf_write__(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNode **page_ptr, MSF_UInt *pos_ptr, void *buffer, MSF_UInt buffer_size) +{ + MSF_PageNode *start_page = *page_ptr; + MSF_UInt start_pos = *pos_ptr; + + MSF_UInt buffer_pos = 0; + while (*page_ptr) { + MSF_UInt page_offset = *pos_ptr % page_size; + + // compute copy size + MSF_UInt buffer_bytes_left = buffer_size - buffer_pos; + MSF_UInt page_bytes_left = page_size - page_offset; + MSF_UInt copy_size = Min(buffer_bytes_left, page_bytes_left); + + // fetch page bytes + MSF_PageNumber page_number = (*page_ptr)->pn; + String8 page_bytes = msf_data_from_pn(page_data_list, page_size, page_number); + + // copy bytes to buffer + U8 *buffer_copy_ptr = (U8*)buffer + buffer_pos; + U8 *page_bytes_ptr = page_bytes.str + page_offset; + MemoryCopy(page_bytes_ptr, buffer_copy_ptr, copy_size); + + // advance + buffer_pos += copy_size; + *pos_ptr += copy_size; + + // have we used all bytes in this page? + if (page_bytes_left <= copy_size) { + *page_ptr = (*page_ptr)->next; + } + + // have we copied all bytes? 
+ if (buffer_bytes_left <= copy_size) { + break; + } + } + + B32 is_write_ok = (buffer_pos == buffer_size); + + // not enough bytes to perform write - restore positions + if (!is_write_ok) { + *page_ptr = start_page; + *pos_ptr = start_pos; + } + + return is_write_ok; +} + +internal MSF_UInt +msf_read__(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNode **page_ptr, MSF_UInt *pos_ptr, void *buffer, MSF_UInt buffer_size) +{ + MSF_UInt buffer_pos = 0; + while (*page_ptr) { + MSF_UInt page_offset = *pos_ptr % page_size; + + // compute copy size + MSF_UInt buffer_bytes_left = buffer_size - buffer_pos; + MSF_UInt page_bytes_left = page_size - page_offset; + MSF_UInt copy_size = Min(buffer_bytes_left, page_bytes_left); + + // fetch page bytes + MSF_PageNumber page_number = (*page_ptr)->pn; + String8 page_bytes = msf_data_from_pn(page_data_list, page_size, page_number); + + // copy bytes to buffer + U8 *buffer_ptr = (U8*)buffer + buffer_pos; + U8 *page_bytes_ptr = page_bytes.str + page_offset; + MemoryCopy(buffer_ptr, page_bytes_ptr, copy_size); + + // advance + buffer_pos += copy_size; + *pos_ptr += copy_size; + + // no more bytes left in this page + if (page_bytes_left <= copy_size) { + *page_ptr = (*page_ptr)->next; + } + + // have we copied all bytes? 
+ if (buffer_bytes_left <= copy_size) { + break; + } + } + + MSF_UInt bytes_read = buffer_pos; + //Assert(bytes_read == buffer_size); + + return bytes_read; +} + +internal B32 +msf_write(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageList page_list, MSF_UInt offset, void *buffer, MSF_UInt buffer_size) +{ + MSF_UInt page_idx = offset / page_size; + MSF_PageNode *page = msf_page_from_index(page_list, page_idx); + B32 is_write_ok = msf_write__(page_data_list, page_size, &page, &offset, buffer, buffer_size); + return is_write_ok; +} + +internal MSF_UInt +msf_read(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageList page_list, MSF_UInt offset, void *buffer, MSF_UInt buffer_size) +{ + MSF_UInt page_idx = offset / page_size; + MSF_PageNode *page = msf_page_from_index(page_list, page_idx); + MSF_UInt bytes_read = msf_read__(page_data_list, page_size, &page, &offset, buffer, buffer_size); + return bytes_read; +} + +//////////////////////////////// + +internal MSF_StreamNode * +msf_stream_alloc_(Arena *arena, MSF_StreamList *list) +{ + Assert(list->count < MSF_STREAM_NUMBER_MAX); + MSF_UInt sn = list->count; + MSF_StreamNode *stream_node = msf_stream_list_push(arena, list); + MSF_Stream *stream = &stream_node->data; + stream->sn = safe_cast_u16(sn); + return stream_node; +} + +internal MSF_StreamNumber +msf_stream_alloc_ex(MSF_Context *msf, MSF_UInt size) +{ + MSF_StreamNode *node = msf_stream_alloc_(msf->arena, &msf->st); + MSF_Stream *stream = &node->data; + msf_stream_resize_ex(msf, stream, size); + return stream->sn; +} + +internal MSF_StreamNumber +msf_stream_alloc(MSF_Context *msf) +{ + return msf_stream_alloc_ex(msf, 0); +} + +internal B32 +msf_stream_resize_ex(MSF_Context *msf, MSF_Stream *stream, MSF_UInt size) +{ + MSF_UInt new_page_count = msf_count_pages(msf->page_size, size); + MSF_UInt cur_page_count = stream->page_list.count; + + if (new_page_count > cur_page_count) { + MSF_UInt alloc_count = new_page_count - cur_page_count; + 
MSF_PageList page_list = msf_alloc_pages(msf, alloc_count); + msf_page_list_concat_in_place(&stream->page_list, &page_list); + } else { + MSF_PageList free_page_list = {0}; + for (MSF_UInt i = cur_page_count; i > new_page_count; i -= 1) { + MSF_PageNode *page_node = msf_page_list_pop_last(&stream->page_list); + msf_page_list_push_node(&free_page_list, page_node); + } + msf_free_pages(msf, &free_page_list); + } + + // update stream + stream->size = Min(stream->size, stream->page_list.count * msf->page_size); + stream->pos = Min(stream->pos, stream->size); + stream->pos_page = 0; + + return 1; +} + +internal B32 +msf_stream_resize(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt new_size) +{ + MSF_Stream *stream = msf_find_stream(msf, sn); + B32 is_resized = 0; + if (stream) { + is_resized = msf_stream_resize_ex(msf, stream, new_size); + } + return is_resized; +} + +internal B32 +msf_stream_free(MSF_Context *msf, MSF_StreamNumber sn) +{ + B32 is_free_ok = 0; + MSF_StreamNode *stream_node = msf_find_stream_node(msf, sn); + if (stream_node) { + msf_stream_list_remove(&msf->st, stream_node); + msf_stream_resize_ex(msf, &stream_node->data, 0); + stream_node->data.size = MSF_DELETED_STREAM_STAMP; + is_free_ok = 1; + } + return is_free_ok; +} + +internal void +msf_stream_set_size(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt size) +{ + MSF_Stream *stream = msf_find_stream(msf, sn); + if (stream) { + stream->size = Min(size, stream->page_list.count * msf->page_size); + } else { + Assert(!"invalid stream number"); + } +} + +internal MSF_UInt +msf_stream_get_size(MSF_Context *msf, MSF_StreamNumber sn) +{ + MSF_UInt size = MSF_UINT_MAX; + MSF_Stream *stream = msf_find_stream(msf, sn); + if (stream) { + size = stream->size; + } + return size; +} + +internal MSF_UInt +msf_stream_get_cap__(MSF_Context *msf, MSF_Stream *stream) +{ + return stream->page_list.count * msf->page_size; +} + +internal MSF_UInt +msf_stream_get_cap(MSF_Context *msf, MSF_StreamNumber sn) +{ + MSF_Stream 
*stream = msf_find_stream(msf, sn); + MSF_UInt cap = 0; + if (stream) { + cap = msf_stream_get_cap__(msf, stream); + } + return cap; +} + +internal MSF_UInt +msf_stream_get_pos__(MSF_Context *msf, MSF_Stream *stream) +{ + return stream->pos; +} + +internal MSF_UInt +msf_stream_get_pos(MSF_Context *msf, MSF_StreamNumber sn) +{ + MSF_Stream *stream = msf_find_stream(msf, sn); + MSF_UInt pos = MSF_UINT_MAX; + if (stream) { + pos = msf_stream_get_pos__(msf, stream); + } + return pos; +} + +internal B32 +msf_stream_seek__(MSF_Context *msf, MSF_Stream *stream, MSF_UInt new_pos) +{ (void)msf; + stream->pos = Min(new_pos, stream->size); + stream->pos_page = 0; + return 1; +} + +internal B32 +msf_stream_seek(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt new_pos) +{ + B32 is_seek_ok = 0; + MSF_Stream *stream = msf_find_stream(msf, sn); + if (stream) { + is_seek_ok = msf_stream_seek__(msf, stream, new_pos); + } else { + Assert(!"failed to stream seek"); + } + return is_seek_ok; +} + +internal B32 +msf_stream_seek_start(MSF_Context *msf, MSF_StreamNumber sn) +{ + return msf_stream_seek(msf, sn, 0); +} + +internal B32 +msf_stream_seek_end(MSF_Context *msf, MSF_StreamNumber sn) +{ + MSF_UInt end = msf_stream_get_size(msf, sn); + return msf_stream_seek(msf, sn, end); +} + + +internal B32 +msf_stream_write__(MSF_Context *msf, MSF_Stream *stream, void *buffer, MSF_UInt buffer_size) +{ + B32 is_write_ok = 0; + + // are we writing over limit? 
+ Assert((U64)stream->pos + (U64)buffer_size <= (U64)MSF_UINT_MAX); + + // make sure we have enough space to write buffer + MSF_UInt stream_cap = msf_stream_get_cap__(msf, stream); + MSF_UInt stream_pos_opl = stream->pos + buffer_size; + B32 grow_stream = stream_pos_opl > stream_cap; + if (grow_stream) { + B32 is_resize_ok = msf_stream_resize_ex(msf, stream, stream_pos_opl); + if (!is_resize_ok) { + goto exit; + } + } + + if (buffer) { + // lookup page for current stream position + if (!stream->pos_page) { + MSF_UInt page_idx = stream->pos / msf->page_size; + stream->pos_page = msf_page_from_index(stream->page_list, page_idx); + } + + // make write + is_write_ok = msf_write__(msf->page_data_list, msf->page_size, &stream->pos_page, &stream->pos, buffer, buffer_size); + } else { + stream->pos += buffer_size; + stream->pos_page = 0; + is_write_ok = 1; + } + + // update stream size + stream->size = Max(stream->size, stream->pos); + +exit:; + Assert(is_write_ok); + return is_write_ok; +} + +internal MSF_UInt +msf_stream_reserve__(MSF_Context *msf, MSF_Stream *stream, MSF_UInt res) +{ + Temp scratch = scratch_begin(0,0); + +#if PROFILE_TELEMETRY + String8 size_string = str8_from_memory_size2(scratch.arena, res); + ProfBeginDynamic("MSF Reserve %.*s", str8_varg(size_string)); +#endif + + B32 is_ok = 1; + + MSF_UInt cap = msf_stream_get_cap__(msf, stream); + MSF_UInt pos = msf_stream_get_pos__(msf, stream); + MSF_UInt cur = cap - pos; + + if (cur < res) { + is_ok = msf_stream_write__(msf, stream, 0, res); + AssertAlways(is_ok); + + is_ok = msf_stream_seek__(msf, stream, pos); + AssertAlways(is_ok); + } + + scratch_end(scratch); + ProfEnd(); + return is_ok; +} + +internal B32 +msf_stream_reserve(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt res) +{ + MSF_Stream *stream = msf_find_stream(msf, sn); + B32 is_res_ok = 0; + if (stream) { + is_res_ok = msf_stream_reserve__(msf, stream, res); + } + return is_res_ok; +} + +internal B32 +msf_stream_write(MSF_Context *msf, 
MSF_StreamNumber sn, void *buffer, MSF_UInt buffer_size) +{ + B32 is_write_ok = 0; + MSF_Stream *stream = msf_find_stream(msf, sn); + if (stream) { + is_write_ok = msf_stream_write__(msf, stream, buffer, buffer_size); + } + return is_write_ok; +} + +internal B32 +msf_stream_write_string(MSF_Context *msf, MSF_StreamNumber sn, String8 string) +{ + return msf_stream_write(msf, sn, string.str, string.size); +} + +internal B32 +msf_stream_write_list(MSF_Context *msf, MSF_StreamNumber sn, String8List list) +{ + B32 is_write_ok = 0; + MSF_Stream *stream = msf_find_stream(msf, sn); + if (stream) { + for (String8Node *node = list.first; node != 0; node = node->next) { + is_write_ok = msf_stream_write__(msf, stream, node->string.str, node->string.size); + if (!is_write_ok) { + break; + } + } + } + return is_write_ok; +} + +internal B32 +msf_stream_write_uint(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt value) +{ + return msf_stream_write_struct(msf, sn, &value); +} + +internal B32 +msf_stream_write_cstr(MSF_Context *msf, MSF_StreamNumber sn, String8 string) +{ + B32 is_string_written = msf_stream_write_string(msf, sn, string); + B32 is_null_written = msf_stream_write(msf, sn, 0, 1); + return is_string_written && is_null_written; +} + +internal B32 +msf_stream_write_u8(MSF_Context *msf, MSF_StreamNumber sn, U8 value) +{ + return msf_stream_write(msf, sn, &value, sizeof(value)); +} + +internal B32 +msf_stream_write_u16(MSF_Context *msf, MSF_StreamNumber sn, U16 value) +{ + return msf_stream_write(msf, sn, &value, sizeof(value)); +} + +internal B32 +msf_stream_write_u32(MSF_Context *msf, MSF_StreamNumber sn, U32 value) +{ + return msf_stream_write(msf, sn, &value, sizeof(value)); +} + +internal B32 +msf_stream_write_u64(MSF_Context *msf, MSF_StreamNumber sn, U64 value) +{ + return msf_stream_write(msf, sn, &value, sizeof(value)); +} + +internal B32 +msf_stream_write_s8(MSF_Context *msf, MSF_StreamNumber sn, S8 value) +{ + return msf_stream_write(msf, sn, &value, 
sizeof(value)); +} + +internal B32 +msf_stream_write_s16(MSF_Context *msf, MSF_StreamNumber sn, S16 value) +{ + return msf_stream_write(msf, sn, &value, sizeof(value)); +} + +internal B32 +msf_stream_write_s32(MSF_Context *msf, MSF_StreamNumber sn, S32 value) +{ + return msf_stream_write(msf, sn, &value, sizeof(value)); +} + +internal B32 +msf_stream_write_s64(MSF_Context *msf, MSF_StreamNumber sn, S64 value) +{ + return msf_stream_write(msf, sn, &value, sizeof(value)); +} + +internal +THREAD_POOL_TASK_FUNC(msf_write_task) +{ + ProfBeginFunction(); + MSF_WriteTask *task = raw_task; + + Rng1U64 range = task->range_arr[task_id]; + String8 data = str8_substr(task->data, range); + MSF_UInt data_pos = range.min + task->stream_pos; + + MSF_UInt page_idx = data_pos / task->page_size; + MSF_PageNode *page = msf_page_from_index(task->page_list, page_idx); + + if (!msf_write__(task->page_data_list, task->page_size, &page, &data_pos, data.str, data.size)) { + InvalidPath; + } + ProfEnd(); +} + +internal B32 +msf_stream_write_parallel(TP_Context *tp, MSF_Context *msf, MSF_StreamNumber sn, void *buffer, MSF_UInt buffer_size) +{ + Temp scratch = scratch_begin(0,0); + +#if PROFILE_TELEMETRY + String8 buffer_size_string = str8_from_memory_size2(scratch.arena, buffer_size); + ProfBeginDynamic("MSF Write Parallel [%.*s]", str8_varg(buffer_size_string)); +#endif + + MSF_Stream *stream = msf_find_stream(msf, sn); + + B32 is_write_ok = msf_stream_reserve__(msf, stream, buffer_size); + + if (is_write_ok) { + Temp scratch = scratch_begin(0,0); + + U64 expected_pos = stream->pos + buffer_size; + + U64 pre_size = Min(AlignPadPow2(stream->pos, msf->page_size), buffer_size); + U64 mid_size = AlignDownPow2(buffer_size - pre_size, msf->page_size); + U64 end_size = buffer_size - (pre_size + mid_size); + + U8 *pre_ptr = (U8*)buffer; + U8 *mid_ptr = (U8*)buffer + pre_size; + U8 *end_ptr = (U8*)buffer + pre_size + mid_size; + + ProfBegin("Write Buffer Pre"); + B32 is_pre_written = 
msf_stream_write__(msf, stream, pre_ptr, pre_size); + AssertAlways(is_pre_written); + ProfEnd(); + + // write buffer mid + if (mid_size > 0) { + Assert(stream->pos % msf->page_size == 0); + Assert(mid_size % msf->page_size == 0); + + MSF_WriteTask task; + task.page_size = msf->page_size; + task.page_data_list = msf->page_data_list; + task.page_list = stream->page_list; + task.stream_pos = stream->pos; + task.data = str8(mid_ptr, mid_size); + task.range_arr = tp_divide_work(scratch.arena, mid_size, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, msf_write_task, &task); + + // we rely on low-level msf_write__ to copy bytes which doesn't advance stream pos + U64 after_mid = stream->pos + mid_size; + B32 is_seek_ok = msf_stream_seek__(msf, stream, after_mid); + AssertAlways(is_seek_ok); + } + + ProfBegin("Write Buffer End"); + B32 is_end_ok = msf_stream_write__(msf, stream, end_ptr, end_size); + AssertAlways(is_end_ok); + ProfEnd(); + + // did we write bytes correctly? + AssertAlways(stream->pos == expected_pos); + + scratch_end(scratch); + } + + scratch_end(scratch); + ProfEnd(); + return is_write_ok; +} + +internal B32 +msf_stream_write_string_parallel(TP_Context *tp, MSF_Context *msf, MSF_StreamNumber sn, String8 string) +{ + return msf_stream_write_parallel(tp, msf, sn, string.str, string.size); +} + +//////////////////////////////// + +internal MSF_UInt +msf_stream_read__(MSF_Context *msf, MSF_Stream *stream, void *buffer, MSF_UInt buffer_size) +{ + // are we reading over limit? 
+ Assert((U64)stream->pos + (U64)buffer_size <= (U64)MSF_UINT_MAX); + + // lookup page for current stream position + if (!stream->pos_page) { + MSF_UInt pos_page_idx = stream->pos / msf->page_size; + stream->pos_page = msf_page_from_index(stream->page_list, pos_page_idx); + } + + MSF_UInt bytes_read = msf_read__(msf->page_data_list, msf->page_size, &stream->pos_page, &stream->pos, buffer, buffer_size); + return bytes_read; +} + +internal MSF_UInt +msf_stream_read(MSF_Context *msf, MSF_StreamNumber sn, void *buffer, MSF_UInt buffer_size) +{ + MSF_Stream *stream = msf_find_stream(msf, sn); + if (stream) { + return msf_stream_read__(msf, stream, buffer, buffer_size); + } + return 0; +} + +internal S8 +msf_stream_read_s8(MSF_Context *msf, MSF_StreamNumber sn) +{ + S8 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal S16 +msf_stream_read_s16(MSF_Context *msf, MSF_StreamNumber sn) +{ + S16 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal S32 +msf_stream_read_s32(MSF_Context *msf, MSF_StreamNumber sn) +{ + S32 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal S64 +msf_stream_read_s64(MSF_Context *msf, MSF_StreamNumber sn) +{ + S64 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal U8 +msf_stream_read_u8(MSF_Context *msf, MSF_StreamNumber sn) +{ + U8 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal U16 +msf_stream_read_u16(MSF_Context *msf, MSF_StreamNumber sn) +{ + U16 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal U32 +msf_stream_read_u32(MSF_Context *msf, MSF_StreamNumber sn) +{ + U32 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal U64 +msf_stream_read_u64(MSF_Context *msf, MSF_StreamNumber sn) +{ + U64 result = 0; + msf_stream_read_struct(msf, sn, &result); + return result; +} + +internal String8 
+msf_stream_read_block(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, U64 block_size) +{ + U8 *block_buffer = push_array(arena, U8, block_size); + MSF_UInt block_read = msf_stream_read(msf, sn, block_buffer, block_size); + Assert((U64)block_read == block_size); + String8 block = str8(block_buffer, block_size); + return block; +} + +internal String8 +msf_stream_read_string(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn) +{ + MSF_UInt start_pos = msf_stream_get_pos(msf, sn); + U64 size = 0; + for (;; size += 1) { + U8 cp = msf_stream_read_u8(msf, sn); + if (cp == 0) { + break; + } + } + + msf_stream_seek(msf, sn, start_pos); + String8 string = msf_stream_read_block(arena, msf, sn, size); + msf_stream_seek(msf, sn, start_pos + size + 1); // skip null + + return string; +} + +internal void +msf_stream_align(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt align) +{ + MSF_UInt pos = msf_stream_get_pos(msf, sn); + MSF_UInt pos_aligned = AlignPow2(pos, align); + msf_stream_seek(msf, sn, pos_aligned); +} + +//////////////////////////////// + +internal MSF_Context * +msf_alloc__(MSF_UInt page_size, MSF_PageNumber active_fpm) +{ + ProfBeginFunction(); + Assert(active_fpm == MSF_FPM0 || active_fpm == MSF_FPM1); + Assert(IsPow2(page_size)); + + Arena *arena = arena_alloc(); + + MSF_Context *msf = push_array(arena, MSF_Context, 1); + msf->arena = arena; + msf->page_size = page_size; + msf->active_fpm = active_fpm; + + ProfEnd(); + return msf; +} + +internal MSF_Context * +msf_alloc(MSF_UInt page_size, MSF_UInt active_fpm) +{ + MSF_Context *msf = msf_alloc__(page_size, active_fpm); + + // reserve first page for header + msf->header_page_list = msf_alloc_pages(msf, 1); + Assert(msf->header_page_list.count > 0); + Assert(msf->header_page_list.first->pn == 0); + + // reserve root page close to start of the file so we don't have to seek too far (not required) + msf->root_page_list = msf_alloc_pages(msf, 1); + Assert(msf->root_page_list.count == 1); + 
Assert(msf->root_page_list.first->pn == 3); + + return msf; +} + +internal MSF_StreamNode * +msf_find_stream_node(MSF_Context *msf, MSF_StreamNumber sn) +{ + MSF_StreamNode *node; + for (node = msf->st.first; node != 0; node = node->next) { + if (node->data.sn == sn) { + break; + } + } + return node; +} + +internal MSF_Stream * +msf_find_stream(MSF_Context *msf, MSF_StreamNumber sn) +{ + MSF_StreamNode *node = msf_find_stream_node(msf, sn); + MSF_Stream *data = 0; + if (node) { + data = &node->data; + } + return data; +} + +internal MSF_Error +msf_open_header(Arena *arena, MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageList *page_list) +{ + ProfBeginFunction(); + msf_page_list_push_extant_page(arena, page_list, page_data_list, page_size, 0); + ProfEnd(); + return MSF_Error_OK; +} + +internal MSF_Error +msf_open_root(Arena *arena, MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber root_pn, MSF_UInt stream_table_size, MSF_PageList *page_list) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + MSF_PageNumber st_page_count = msf_count_pages(page_size, stream_table_size); + MSF_UInt st_pn_size = sizeof(MSF_PageNumber) * st_page_count; + MSF_PageNumber root_pn_count = msf_count_pages(page_size, st_pn_size); + MSF_PageNumber *root_pn_arr = push_array(scratch.arena, MSF_PageNumber, root_pn_count); + for (MSF_UInt i = 0; i < root_pn_count; i += 1) { + root_pn_arr[i] = root_pn + i; + } + msf_page_list_push_extant_page_arr(arena, page_list, page_data_list, page_size, root_pn_arr, root_pn_count); + scratch_end(scratch); + ProfEnd(); + return MSF_Error_OK; +} + +internal MSF_Error +msf_open_stream_table_page_list(Arena *arena, MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageList root_page_list, MSF_UInt stream_table_size, MSF_PageList *page_list) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + MSF_Error error = MSF_Error_OK; + MSF_UInt st_pn_count = msf_count_pages(page_size, 
stream_table_size);
+  MSF_UInt st_pn_size = st_pn_count * sizeof(MSF_PageNumber);
+  MSF_PageNumber *st_pn_arr = push_array(scratch.arena, MSF_PageNumber, st_pn_count);
+  MSF_UInt st_pn_read_size = msf_read(page_data_list, page_size, root_page_list, 0, st_pn_arr, st_pn_size);
+  if (st_pn_read_size == st_pn_size) {
+    msf_page_list_push_extant_page_arr(arena, page_list, page_data_list, page_size, st_pn_arr, st_pn_count);
+  } else {
+    error = MSF_OpenError_UNABLE_TO_READ_STREAM_TABLE_PAGE_NUMBERS;
+  }
+  scratch_end(scratch);
+  ProfEnd();
+  return error;
+}
+
+// Parses the stream table stored on 'st_page_list' and appends one node per
+// stream slot to 'stream_list'.
+//
+// Layout that is read: stream count, then 'stream_count' stream sizes, then
+// the page numbers of every present stream back to back. A slot whose size is
+// MSF_DELETED_STREAM_STAMP gets a node with no pages.
+//
+// Returns MSF_Error_OK, or an MSF_OpenError_* code when the table cannot be
+// read or is truncated. The final count check compares 'stream_list->count'
+// against the parsed stream count, so the list is expected to be empty on entry.
+internal MSF_Error
+msf_open_stream_table(Arena *arena, MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageList st_page_list, MSF_UInt stream_table_size, MSF_StreamList *stream_list)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena, 1);
+  MSF_Error error = MSF_Error_OK;
+
+  // read out entire stream table
+  U8 *st_buffer = push_array(scratch.arena, U8, stream_table_size);
+  MSF_UInt st_read_size = msf_read(page_data_list, page_size, st_page_list, 0, st_buffer, stream_table_size);
+  if (st_read_size != stream_table_size) {
+    error = MSF_OpenError_INVALID_STREAM_TABLE;
+    goto exit;
+  }
+
+  // setup buffer reader
+  String8 st_data = str8(st_buffer, st_read_size);
+  U64 st_cursor = 0;
+
+  MSF_UInt stream_count = 0;
+  st_cursor += str8_deserial_read_struct(st_data, st_cursor, &stream_count);
+
+  // stream count is a 32-bit but stream number is 16-bit?!
+  if (stream_count > MSF_STREAM_NUMBER_MAX) {
+    error = MSF_OpenError_STREAM_COUNT_OVERFLOW;
+    goto exit;
+  }
+
+  // is there enough bytes to read streams sizes?
+  U64 size_arr_end = st_cursor + (U64)stream_count * sizeof(MSF_UInt);
+  if (size_arr_end > st_data.size) {
+    error = MSF_OpenError_UNABLE_TO_READ_STREAM_SIZES;
+    goto exit;
+  }
+
+  // make pointer to stream sizes array
+  MSF_UInt *stream_size_arr = (MSF_UInt*)(st_buffer + st_cursor);
+  st_cursor += sizeof(stream_size_arr[0]) * stream_count;
+
+  U64 arena_pos_before_stream_allocations = arena_pos(arena);
+
+  // open streams
+  for (MSF_UInt stream_idx = 0; stream_idx < stream_count; stream_idx += 1) {
+    MSF_UInt stream_size = stream_size_arr[stream_idx];
+    B32 is_present = stream_size != MSF_DELETED_STREAM_STAMP;
+    if (is_present) {
+      MSF_PageNumber pn_count = msf_count_pages(page_size, stream_size);
+
+      // is there enough bytes in buffer to build stream page list?
+      // the end offset is kept in 64 bits: storing it in a 32-bit MSF_UInt
+      // would truncate it and let an out-of-range page list pass this check
+      U64 st_pn_end = st_cursor + (U64)pn_count * sizeof(MSF_PageNumber);
+      if (st_pn_end > stream_table_size) {
+        break;
+      }
+
+      // setup page number array
+      MSF_PageNumber *pn_arr = (MSF_PageNumber*)(st_buffer + st_cursor);
+      st_cursor += sizeof(pn_arr[0]) * pn_count;
+
+      // build stream page list
+      MSF_PageList page_list = {0};
+      msf_page_list_push_extant_page_arr(arena, &page_list, page_data_list, page_size, pn_arr, pn_count);
+
+      // alloc stream with opened pages
+      MSF_StreamNode *stream_node = msf_stream_alloc_(arena, stream_list);
+      stream_node->data.size = stream_size;
+      stream_node->data.page_list = page_list;
+    }
+    // stream was deleted but slot was kept to be reused in subsequent allocations
+    else {
+      MSF_StreamNode *stream_node = msf_stream_alloc_(arena, stream_list);
+      stream_node->data.size = stream_size;
+    }
+  }
+
+  // the loop above bails out early on a truncated table, which shows up here
+  // as a count mismatch
+  // NOTE(review): the arena is rewound but 'stream_list' is not reset, so the
+  // caller presumably must discard the list after this error -- confirm
+  if (stream_list->count != stream_count) {
+    arena_pop_to(arena, arena_pos_before_stream_allocations);
+    error = MSF_OpenError_INVALID_STREAM_TABLE;
+    goto exit;
+  }
+
+exit:;
+  scratch_end(scratch);
+  ProfEnd();
+  return error;
+}
+
+// Validates an in-memory MSF 7.0 image and builds a context for it.
+//
+// data:    the whole MSF file; the header is read in place from data.str
+// msf_out: receives the new context; it is written only on success
+//
+// Returns MSF_Error_OK on success. On failure any context allocated here is
+// released before returning the error code.
+internal MSF_Error
+msf_open(String8 data, MSF_Context **msf_out)
+{
+  ProfBeginFunction();
+
+  MSF_Error error = MSF_Error_OK;
+  MSF_Context *msf = 0;
+  MSF_PageDataList page_data_list = {0};
+
+  // are there enough bytes for header?
+  if (sizeof(MSF_Header70) > data.size) {
+    error = MSF_OpenError_NOT_ENOUGH_BYTES_TO_READ_HEADER;
+    goto exit;
+  }
+
+  // is this MSF 7.0?
+  MSF_Header70 *header = (MSF_Header70*)data.str;
+  if (MemoryCompare(header->magic, msf_msf70_magic, sizeof(msf_msf70_magic)) != 0) {
+    error = MSF_OpenError_INVALID_MAGIC;
+    goto exit;
+  }
+
+  // validate page size
+  if (!IsPow2(header->page_size)) {
+    error = MSF_OpenError_PAGE_SIZE_IS_NOT_POW2;
+    goto exit;
+  }
+
+  // validate page count
+  MSF_UInt file_page_count = msf_count_pages(header->page_size, data.size);
+  if (file_page_count != header->page_count) {
+    error = MSF_OpenError_PAGE_COUNT_DOESNT_MATCH_DATA_SIZE;
+    goto exit;
+  }
+
+  // validate page size range
+  if (header->page_size < MSF_MIN_PAGE_SIZE) {
+    error = MSF_OpenError_INVALID_PAGE_SIZE;
+    goto exit;
+  }
+  if (header->page_size > MSF_MAX_PAGE_SIZE) {
+    error = MSF_OpenError_INVALID_PAGE_SIZE;
+    goto exit;
+  }
+
+  // is there enough bytes to initialize PDB?
+  // summed in 64 bits: both fields come straight from the file and a large
+  // stream_table_size would otherwise wrap around and pass the check
+  U64 check_size = (U64)header->page_size*3 + (U64)header->stream_table_size;
+  if (check_size > data.size) {
+    error = MSF_OpenError_NOT_ENOUGH_PAGES_TO_INIT;
+    goto exit;
+  }
+
+  // validate active FPM
+  if (header->active_fpm != MSF_FPM0 && header->active_fpm != MSF_FPM1) {
+    error = MSF_OpenError_INVALID_ACTIVE_FPM;
+    goto exit;
+  }
+
+  // is there enough bytes to initialize root stream?
+  // (64-bit sum for the same wrap-around reason as above)
+  U64 root_pn_offset = OffsetOf(MSF_Header70, root_pn);
+  if (root_pn_offset + (U64)header->stream_table_size > data.size) {
+    error = MSF_OpenError_INVALID_ROOT_STREAM_PAGE_NUMBER;
+    goto exit;
+  }
+
+  // validate root directory
+  MSF_UInt root_directory_page_count = msf_count_pages(header->page_size, header->stream_table_size);
+  MSF_UInt root_directory_max_page_count = header->page_size / sizeof(MSF_UInt);
+  if (root_directory_page_count > root_directory_max_page_count) {
+    error = MSF_Error_STREAM_TABLE_HAS_TOO_MANY_PAGES;
+    goto exit;
+  }
+
+  // allocate MSF context and don't reserve special pages
+  msf = msf_alloc__(header->page_size, header->active_fpm);
+
+  // divide data into fixed size nodes (with 4KB page each node is 128MB)
+  msf_set_page_data_list(msf->arena, &page_data_list, header->page_size, data);
+
+  // single-pass block: 'break' leaves it with 'error' set
+  do {
+    MSF_PageList header_page_list = {0};
+    error = msf_open_header(msf->arena, page_data_list, header->page_size, &header_page_list);
+    if (error != MSF_Error_OK) {
+      break;
+    }
+
+    MSF_PageList root_page_list = {0};
+    error = msf_open_root(msf->arena, page_data_list, header->page_size, header->root_pn, header->stream_table_size, &root_page_list);
+    if (error != MSF_Error_OK) {
+      break;
+    }
+
+    MSF_PageList st_page_list = {0};
+    error = msf_open_stream_table_page_list(msf->arena, page_data_list, header->page_size, root_page_list, header->stream_table_size, &st_page_list);
+    if (error != MSF_Error_OK) {
+      break;
+    }
+
+    MSF_StreamList stream_list = {0};
+    error = msf_open_stream_table(msf->arena, page_data_list, header->page_size, st_page_list, header->stream_table_size, &stream_list);
+    if (error != MSF_Error_OK) {
+      break;
+    }
+
+    Assert(msf->page_size == header->page_size);
+    Assert(msf->active_fpm == header->active_fpm);
+    msf->page_count = header->page_count;
+    msf->page_data_list = page_data_list;
+    msf->header_page_list = header_page_list;
+    msf->root_page_list = root_page_list;
+    msf->st_page_list = st_page_list;
+    msf->st = stream_list;
+
+    *msf_out = msf;
+
+#if LNK_PARANOID
+    msf_check_fpm_bits_for_page_list(page_data_list, msf->page_size, msf->active_fpm, header_page_list, MSF_PAGE_STATE_ALLOC);
+    msf_check_fpm_bits_for_page_list(page_data_list, msf->page_size, msf->active_fpm, root_page_list, MSF_PAGE_STATE_ALLOC);
+    msf_check_fpm_bits_for_page_list(page_data_list, msf->page_size, msf->active_fpm, st_page_list, MSF_PAGE_STATE_ALLOC);
+    for (MSF_StreamNode *stream_node = stream_list.first; stream_node != 0; stream_node = stream_node->next) {
+      msf_check_fpm_bits_for_page_list(page_data_list, msf->page_size, msf->active_fpm, stream_node->data.page_list, MSF_PAGE_STATE_ALLOC);
+    }
+#endif
+  } while(0);
+
+exit:;
+  if (error != MSF_Error_OK) {
+    if (msf) {
+      msf_release(&msf);
+    }
+  }
+
+  ProfEnd();
+  return error;
+}
+
+// Releases the context's arena and nulls the caller's pointer.
+// '*msf_ptr' is dereferenced unconditionally, so it must not be null.
+internal void
+msf_release(MSF_Context **msf_ptr)
+{
+  arena_release((*msf_ptr)->arena);
+  *msf_ptr = 0;
+}
+
+// Serializes the stream table 'st' into three chunks allocated on 'arena':
+// stream count, stream sizes (indexed by stream number), and the page numbers
+// of all streams in list order. 'page_count' is the capacity of the page
+// number array; going past it is only caught by an Assert.
+internal String8List
+msf_build_stream_table_data(Arena *arena, MSF_StreamList *st, MSF_UInt page_size, MSF_UInt page_count)
+{
+  ProfBeginFunction();
+
+  MSF_UInt *stream_count_ptr = push_array(arena, MSF_UInt, 1);
+  *stream_count_ptr = st->count;
+
+  MSF_UInt *stream_size_arr = push_array(arena, MSF_UInt, st->count);
+  MSF_UInt stream_page_count = 0;
+
+  MSF_PageNumber *stream_pages_arr = push_array(arena, MSF_PageNumber, page_count);
+
+  for (MSF_StreamNode *stream_node = st->first; stream_node != 0; stream_node = stream_node->next) {
+    MSF_Stream *stream = &stream_node->data;
+
+    // is page list correct?
+    MSF_UInt expected_stream_page_count = msf_count_pages(page_size, stream->size);
+    if (expected_stream_page_count > stream->page_list.count) {
+      Assert(!"invalid page list ");
+    }
+
+    // store stream sizes
+    stream_size_arr[stream->sn] = stream->size;
+
+    // store stream pages
+    for (MSF_PageNode *page_node = stream->page_list.first; page_node != 0; page_node = page_node->next) {
+      // first three pages are reserved for header, FPM0, and FPM1
+      Assert(page_node->pn > 2);
+
+      // it's not necessarily a bug to use interval FPM pages,
+      // but for sake of correctness make sure there is no stream
+      // aside from FPM that uses these pages
+      //
+      // also, actual FPM pages should be asserted on: pn % (msf->page_size * MSF_BITS_PER_CHAR)
+      Assert((page_node->pn % page_size) != 1);
+      Assert((page_node->pn % page_size) != 2);
+
+      // is there a stream with too many page nodes?
+      Assert(stream_page_count < page_count);
+
+      // is this page number allocated?
+      //Assert(msf_get_fpm_page_bit_state(msf, page_node->pn) == MSF_PAGE_STATE_ALLOC);
+
+      stream_pages_arr[stream_page_count] = page_node->pn;
+      stream_page_count += 1;
+    }
+  }
+
+  // on disk stream table:
+  //  MSF_UInt stream_count;
+  //  MSF_UInt stream_size[stream_count];
+  //  MSF_PageNumber pages[stream_count][*];
+  String8List st_data_list = {0};
+  str8_list_push(arena, &st_data_list, str8((U8*)stream_count_ptr, sizeof(*stream_count_ptr)));
+  str8_list_push(arena, &st_data_list, str8((U8*)stream_size_arr, sizeof(*stream_size_arr) * (*stream_count_ptr)));
+  str8_list_push(arena, &st_data_list, str8((U8*)stream_pages_arr, sizeof(*stream_pages_arr) * stream_page_count));
+
+  ProfEnd();
+  return st_data_list;
+}
+
+// Serializes the stream table, reallocates the pages that hold it and writes
+// it out. '*stream_table_size_out' receives the serialized size and is written
+// only when every chunk was written successfully.
+internal MSF_Error
+msf_build_stream_table(MSF_Context *msf, MSF_UInt *stream_table_size_out)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  MSF_Error error = MSF_Error_OK;
+
+  String8List st_data_list = msf_build_stream_table_data(scratch.arena, &msf->st, msf->page_size, msf->page_count);
+
+  MSF_UInt st_page_count = msf_count_pages(msf->page_size, st_data_list.total_size);
+  msf_free_pages(msf, &msf->st_page_list); // TODO: page reuse
+  msf->st_page_list = msf_alloc_pages(msf, st_page_count);
+
+  MSF_UInt cursor = 0;
+  for (String8Node *node = st_data_list.first; node != 0; node = node->next) {
+    B32 is_data_written = msf_write(msf->page_data_list, msf->page_size, msf->st_page_list, cursor, node->string.str, node->string.size);
+    if (!is_data_written) {
+      error = MSF_BuildError_UNABLE_TO_WRITE_STREAM_TABLE;
+      goto exit;
+    }
+    cursor += node->string.size;
+  }
+
+  *stream_table_size_out = st_data_list.total_size;
+
+ exit:;
+  scratch_end(scratch);
+  ProfEnd();
+  return error;
+}
+
+// Writes the root directory: the array of page numbers of the stream table
+// pages. Fails with MSF_Error_STREAM_TABLE_HAS_TOO_MANY_PAGES when those page
+// numbers do not fit into a single page.
+internal MSF_Error
+msf_build_root_directory(MSF_Context *msf)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  MSF_Error error = MSF_Error_OK;
+
+  // MS impl doesn't handle root directory with page count above 1.
+  MSF_UInt max_page_count_in_root_directory = msf->page_size / sizeof(MSF_PageNumber);
+  if (msf->st_page_list.count > max_page_count_in_root_directory) {
+    error = MSF_Error_STREAM_TABLE_HAS_TOO_MANY_PAGES;
+    goto exit;
+  }
+
+  // collect stream table page numbers
+  MSF_PageNumber *pn_arr = push_array(scratch.arena, MSF_PageNumber, msf->st_page_list.count);
+  MSF_UInt pn_count = 0;
+  for (MSF_PageNode *page = msf->st_page_list.first; page != 0; page = page->next) {
+    pn_arr[pn_count++] = page->pn;
+  }
+
+  MSF_UInt root_page_count = msf_count_pages(msf->page_size, pn_count * sizeof(pn_arr[0]));
+  Assert(root_page_count == 1);
+
+  msf_free_pages(msf, &msf->root_page_list); // TODO: page reuse
+  msf->root_page_list = msf_alloc_pages(msf, root_page_count);
+  B32 is_root_written = msf_write(msf->page_data_list, msf->page_size, msf->root_page_list, 0, pn_arr, sizeof(pn_arr[0]) * pn_count);
+  if (!is_root_written) {
+    error = MSF_BuildError_UNABLE_TO_WRITE_ROOT_DIRECTORY;
+    goto exit;
+  }
+
+exit:;
+  scratch_end(scratch);
+  ProfEnd();
+  return error;
+}
+
+// Fills in an MSF 7.0 header from the context and writes it to the header
+// page list. 'stream_table_size' is the serialized stream table size in bytes.
+// The header stores the first root directory page number, so the root
+// directory has to be built first; an empty root page list is reported as
+// MSF_BuildError_UNABLE_TO_WRITE_HEADER instead of being dereferenced.
+internal MSF_Error
+msf_build_header(MSF_Context *msf, MSF_UInt stream_table_size)
+{
+  ProfBeginFunction();
+  MSF_Error error = MSF_Error_OK;
+
+  // zero-init so no indeterminate bytes can reach the output file
+  MSF_Header70 header = {0};
+
+  if (msf->root_page_list.first == 0) {
+    error = MSF_BuildError_UNABLE_TO_WRITE_HEADER;
+    goto exit;
+  }
+
+  MemoryCopy(&header.magic[0], &msf_msf70_magic[0], sizeof(msf_msf70_magic));
+  header.page_size = msf->page_size;
+  header.active_fpm = msf->active_fpm;
+  header.page_count = msf->page_count;
+  header.stream_table_size = stream_table_size;
+  header.unknown = 0;
+  header.root_pn = msf->root_page_list.first->pn;
+
+  B32 is_header_written = msf_write(msf->page_data_list, msf->page_size, msf->header_page_list, 0, &header, sizeof(header));
+  if (!is_header_written) {
+    error = MSF_BuildError_UNABLE_TO_WRITE_HEADER;
+    goto exit;
+  }
+
+ exit:;
+  ProfEnd();
+  return error;
+}
+
+// Commits the context's metadata in dependency order: stream table, then the
+// root directory that points at it, then the header that points at the root.
+// Stops at and returns the first error.
+internal MSF_Error
+msf_build(MSF_Context *msf)
+{
+  ProfBeginFunction();
+
+  MSF_Error err = MSF_Error_OK;
+  do {
+    MSF_UInt stream_table_size;
+    err = msf_build_stream_table(msf, &stream_table_size);
+    if (err != MSF_Error_OK) {
+      break;
+    }
+
+    err = msf_build_root_directory(msf);
+    if (err != MSF_Error_OK) {
+      break;
+    }
+
+    err = msf_build_header(msf, stream_table_size);
+    if (err != MSF_Error_OK) {
+      break;
+    }
+  } while (0);
+
+  ProfEnd();
+  return err;
+}
+
+// Returns one string per page data node, referencing the node's memory (no
+// copy). Sizes are clamped so that they add up to msf_get_save_size(); nodes
+// past that size yield zero-length strings.
+internal String8List
+msf_get_page_data_nodes(Arena *arena, MSF_Context *msf)
+{
+  String8List list; MemoryZeroStruct(&list);
+
+  U64 total_size = msf_get_save_size(msf);
+  U64 bytes_left = total_size;
+  U64 node_size = msf_get_data_node_size(msf->page_size);
+
+  for (MSF_PageDataNode *data_node = msf->page_data_list.first; data_node != 0; data_node = data_node->next) {
+    // compute byte count for the node
+    U64 to_copy = Min(bytes_left, node_size);
+    bytes_left -= to_copy;
+
+    String8 data = str8(data_node->data, to_copy);
+    str8_list_push(arena, &list, data);
+  }
+  return list;
+}
+
+// Size in bytes of the saved file: page count times page size (64-bit).
+internal U64
+msf_get_save_size(MSF_Context *msf)
+{
+#if 0
+  MSF_PageNumber max_pn = msf_find_max_pn(msf->page_data_list, msf->page_size);
+  U64 size = ((U64)max_pn + 1) * (U64)msf->page_size;
+  Assert(msf_count_pages(size, msf->page_size) == msf->page_count);
+#else
+  U64 size = (U64)msf->page_count * msf->page_size;
+#endif
+  return size;
+}
+
+// Copies page data nodes, in list order, into 'buffer' until 'buffer_size'
+// bytes were written or the nodes run out. Returns true only when the buffer
+// was filled exactly.
+internal B32
+msf_save(MSF_Context *msf, void *buffer, U64 buffer_size)
+{
+  ProfBeginFunction();
+
+  U64 node_size = msf_get_data_node_size(msf->page_size);
+  U64 cursor = 0;
+
+  for (MSF_PageDataNode *node = msf->page_data_list.first; node != 0; node = node->next) {
+    // compute byte count for the copy
+    U64 bytes_in_buffer = buffer_size - cursor;
+    U64 to_copy = Min(bytes_in_buffer, node_size);
+
+    // copy MSF bytes to output buffer
+    U8 *dst = (U8 *)buffer + cursor;
+    U8 *src = node->data;
+    MemoryCopy(dst, src, to_copy);
+
+    // advance cursor
+    cursor += to_copy;
+
+    // is output buffer full?
+    if (to_copy == 0) {
+      break;
+    }
+  }
+
+  B32 is_save_ok = (cursor == buffer_size);
+  Assert(is_save_ok);
+
+  ProfEnd();
+  return is_save_ok;
+}
+
+// Builds the MSF metadata and saves the whole file into a buffer pushed on
+// 'arena'. '*data_out' is written only when both the build and the save
+// succeed.
+// NOTE(review): when msf_save() fails the buffer is popped but MSF_Error_OK
+// is still returned and '*data_out' is left untouched -- callers should
+// pre-initialize it; confirm whether a dedicated error code is wanted.
+internal MSF_Error
+msf_save_arena(Arena *arena, MSF_Context *msf, String8 *data_out)
+{
+  ProfBeginFunction();
+  MSF_Error err = msf_build(msf);
+  if (err == MSF_Error_OK) {
+    U64 buffer_size = msf_get_save_size(msf);
+    U8 *buffer = push_array(arena, U8, buffer_size);
+    B32 is_saved = msf_save(msf, buffer, buffer_size);
+    if (is_saved) {
+      *data_out = str8(buffer, buffer_size);
+    } else {
+      arena_pop(arena, buffer_size);
+    }
+  }
+  ProfEnd();
+  return err;
+}
+
+// Maps an error code to a static, human-readable message. MSF_Error_OK and
+// codes that are not listed map to the empty string.
+internal char *
+msf_error_to_string(MSF_Error code)
+{
+  char *str = "";
+  switch (code) {
+  case MSF_Error_OK: break;
+
+  case MSF_Error_STREAM_TABLE_HAS_TOO_MANY_PAGES: str = "stream table exceeds page limit"; break;
+
+  case MSF_OpenError_NOT_ENOUGH_BYTES_TO_READ_HEADER: str = "input does not have enough bytes to read header"; break;
+  case MSF_OpenError_INVALID_MAGIC: str = "magic value does not match"; break;
+  case MSF_OpenError_PAGE_SIZE_IS_NOT_POW2: str = "page size is not power of two"; break;
+  case MSF_OpenError_INVALID_PAGE_SIZE: str = "invalid page size"; break;
+  case MSF_OpenError_NOT_ENOUGH_PAGES_TO_INIT: str = "not enough pages to initialize MSF"; break;
+  case MSF_OpenError_INVALID_ROOT_STREAM_PAGE_NUMBER: str = "invalid root stream page number"; break;
+  case MSF_OpenError_UNABLE_TO_READ_STREAM_TABLE_PAGE_NUMBERS: str = "unable to read stream table's page numbers"; break;
+  case MSF_OpenError_STREAM_COUNT_OVERFLOW: str = "stream count is overflown"; break;
+  case MSF_OpenError_UNABLE_TO_READ_STREAM_SIZES: str = "unable to read streams sizes"; break;
+  case MSF_OpenError_INVALID_STREAM_TABLE: str = "invalid stream table"; break;
+  case MSF_OpenError_INVALID_ACTIVE_FPM: str = "invalid active FPM"; break;
+  case MSF_OpenError_PAGE_COUNT_DOESNT_MATCH_DATA_SIZE: str = "page count from MSF header does not match data page count"; break;
+
+  case MSF_BuildError_UNABLE_TO_WRITE_STREAM_TABLE: str = "unable to write stream table"; break;
+  case MSF_BuildError_UNABLE_TO_WRITE_STREAM_TABLE_PAGE_NUMBER_DIRECTORY: str = "unable to write stream table page number directory"; break;
+  case MSF_BuildError_UNABLE_TO_WRITE_ROOT_DIRECTORY: str = "unable to write root directory"; break;
+  case MSF_BuildError_UNABLE_TO_WRITE_HEADER: str = "unable to write header"; break;
+  }
+
+  return str;
+}
+
+////////////////////////////////
+
+/*
+  Multi-Stream-Format is a database type of format for storing debug info
+  but in principle can store anything you want. MSF divides file
+  into fixed-sized pages (default page size is 4KiB) and puts them
+  together into streams. A stream is made up from a non-contiguous
+  number of pages and supports following operations: alloc, free, open, write, read.
+  Current MSF 7.0 allows creating up to 64K of streams, where each stream can potentially
+  contain 2GiB of data (assuming default page size).
+
+  Free Page Map assigns a bit to each page to indicate page alloc state. 0 = allocated and 1 = free.
+  FPM is allocated at fixed intervals of 'page_size * MSF_BITS_PER_CHAR'.
At the beginning of each interval + two pages are reserved for status bits. The 'active_fpm' field in the MSF header tells which FPM page + is in use. On commit time MSF alternates between two pages, this way they support atomic read and write. + + FPM Bug: + Let's say you have a MSF file with page size 0x1000 bytes, you can represent 0x1000 * 8 = 0x8000 pages + or 0x8000 * 0x1000 = 128MiB. And when file exceeds this size a new FPM group should be allocated + at page numbers 0x8001 and 0x8002. However, in MS impl there is a bug where they don't multiply + interval by 8 and each FPM group is allocated at intervals of page size or 0x1000, so each FPM group is placed + at page numbers 0x1001, 0x1002, 0x2001, 0x2002, and so on. This way MSF files end up allocating 8 times more pages. + + Also, MS impl marks unused pages as allocated thus leaving them empty but LLVM repurposes them + for regular allocations and things work out fine because of the fact that MS computes correct + number of FPM pages when they save and load and the trailing pages aren't being touched: + https://github.com/microsoft/microsoft-pdb/blob/master/PDB/msf/msf.cpp#L2512 + + Root directory is a single page stored as a page number in 'MSF_Header70.root_pn'. + The directory contains an array of page numbers needed to read the stream table. This is a late + addition introduced in version 7.0 that lets us have bigger stream tables. However, there is a limit: + if the stream table exceeds the root directory, the MSF becomes invalid. MS impl isn't + clear what should happen in this case, so we tried to contiguously allocate root pages + but VS and LLVM error out. In practice you can double page size to work around the limit. 
+ +TODO: explain stream table + + */ + +#if 0 + +internal void +msf_bytedump_stream(char *file_name, MSF_Context *msf, MSF_StreamNumber sn, U64 start, U64 byte_count) +{ + Temp scratch = scratch_begin(0, 0); + U64 pos = msf_stream_get_pos(msf, sn); + msf_stream_seek(msf, sn, start); + U64 buffer_size = byte_count; + U8 *buffer = push_array(scratch.arena, U8, buffer_size); + MSF_UInt read_size = msf_stream_read(msf, sn, buffer, buffer_size); + os_write_file(str8_cstring(file_name), str8(buffer, read_size)); + msf_stream_seek(msf, sn, pos); + scratch_end(scratch); +} + +internal void +msf_hexdump_stream(FILE *file, MSF_Context *msf, MSF_StreamNumber sn, U64 start, U64 byte_count, U64 stride) +{ + Temp scratch = scratch_begin(0, 0); + U8 *row_buffer = push_array(scratch.arena, U8, stride); + U64 stream_size = msf_stream_get_size(msf, sn); + U64 cursor = start; + U64 end = Min(start + byte_count, stream_size); + while (cursor < stream_size) { + MSF_UInt read_size = msf_stream_read(msf, sn, row_buffer, stride); + + // print offset + fprintf(file, "%04llX", cursor); + + // print bytes + fprintf(file, " "); + for (U64 i = 0; i < read_size; i += 1) { + if (i > 0) { + fprintf(file, " "); + } + fprintf(file, "%02X", row_buffer[i]); + } + + // print ascii + fprintf(file, " "); + for (U64 i = 0; i < read_size; i += 1) { + U8 print_char = row_buffer[i]; + if (0x20 > print_char || print_char > 0x7E) { + print_char = '.'; + } + fprintf(file, "%c", print_char); + } + + // row is done + fprintf(file, "\n"); + + cursor += stride; + } + + scratch_end(scratch); +} + +internal void +msf_hexdump_stream_to_file(char *name, MSF_Context *msf, MSF_StreamNumber sn, U64 start, U64 byte_count, U64 stride) +{ + FILE *f = fopen(name, "w"); + msf_hexdump_stream(f, msf, sn, start, byte_count, stride); + fclose(f); +} + +#endif + +#if 0 +internal void +test_msf_open_save(void) +{ + Temp scratch = scratch_begin(0, 0); + + U32 item0 = 123; + U32 item1 = 321; + + MSF_StreamNumber stream; + String8 
data; + { + MSF_Context *msf = msf_alloc(MSF_DEFAULT_PAGE_SIZE, MSF_DEFAULT_FPM); + stream = msf_stream_alloc(msf); + msf_stream_write_u32(msf, stream, item0); + msf_stream_write_u32(msf, stream, item1); + data = msf_save_arena(scratch.arena, msf); + msf_release(&msf); + } + + String8 data1; + { + MSF_Context *msf = 0; + MSF_Error err = msf_open(data, &msf); + Assert(err == MSF_Error_OK); + U32 read0 = msf_stream_read_u32(msf, stream); + Assert(read0 == item0); + U32 read1 = msf_stream_read_u32(msf, stream); + Assert(read1 == item1); + data1 = msf_save_arena(scratch.arena, msf); + msf_release(&msf); + } + + { + MSF_Context *msf = 0; + MSF_Error err = msf_open(data, &msf); + Assert(err == MSF_Error_OK); + U32 read0 = msf_stream_read_u32(msf, stream); + Assert(read0 == item0); + U32 read1 = msf_stream_read_u32(msf, stream); + Assert(read1 == item1); + msf_release(&msf); + } + + scratch_end(scratch); +} + +internal void +test_size_limit(void) +{ + Temp scratch = scratch_begin(0, 0); + + MSF_Context *msf = msf_alloc(8192, MSF_DEFAULT_FPM); + Assert(msf); + + U64 c = (1 * 1024 * 1024 * 1024) / msf->page_size; + U64 stream_count = 8; + + U64 data_size = msf->page_size; + U8 *data = push_array(scratch.arena, U8, data_size); + + for (U64 stream_idx = 0; stream_idx < stream_count; stream_idx += 1) { + MSF_StreamNumber stream = msf_stream_alloc(msf); + Assert(stream != MSF_INVALID_STREAM_NUMBER); + + MemorySet(&data[0], 1 + stream_idx, data_size); + + msf_stream_resize(msf, stream, c * msf->page_size); + + for (U64 i = 0; i < c; i += 1) { + B32 is_written = msf_stream_write(msf, stream, data, data_size); + Assert(is_written); + } + } + + //msf_grow(msf, MSF_PN_MAX); + + msf_stream_free(msf, 7); + msf_stream_free(msf, 6); + msf_stream_free(msf, 5); + + stream_count -= 3; + + String8 msf_data = msf_save_arena(scratch.arena, msf); + Assert(msf_data.size > 0); + msf_release(&msf); + + //os_write_file(str8_lit("test.msf"), msf_data); + + MSF_Error err = msf_open(msf_data, &msf); 
+ Assert(err == MSF_Error_OK); + +#if 1 + U8 *buffer = push_array(scratch.arena, U8, data_size); + for (U64 stream_idx = 0; stream_idx < stream_count; stream_idx += 1) { + MSF_StreamNumber sn = (MSF_StreamNumber)stream_idx; + + MemorySet(&data[0], 1 + stream_idx, data_size); + + for (U64 i = 0; i < c; i += 1) { + MSF_UInt read_size = msf_stream_read(msf, sn, buffer, data_size); + Assert(read_size == data_size); + + int cmp = MemoryCompare(buffer, data, data_size); + Assert(cmp == 0); + } + } +#endif + + msf_release(&msf); + scratch_end(scratch); +} + +internal void +test_msf(void) +{ + test_size_limit(); + test_msf_open_save(); +} +#endif + diff --git a/src/linker/pdb_ext/msf_builder.h b/src/linker/pdb_ext/msf_builder.h new file mode 100644 index 00000000..40aba132 --- /dev/null +++ b/src/linker/pdb_ext/msf_builder.h @@ -0,0 +1,202 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +#define MSF_PAGE_STATE_FREE 1 +#define MSF_PAGE_STATE_ALLOC 0 + +#define MSF_FPM0 1 +#define MSF_FPM1 2 + +#define MSF_DEFAULT_PAGE_SIZE 4096 +#define MSF_DEFAULT_FPM MSF_FPM0 + +typedef struct MSF_PageNumberArray +{ + U64 count; + MSF_PageNumber *v; +} MSF_PageNumberArray; + +typedef struct MSF_PageNode +{ + struct MSF_PageNode *next; + struct MSF_PageNode *prev; + MSF_PageNumber pn; +} MSF_PageNode; + +typedef struct MSF_PageList +{ + MSF_PageNode *first; + MSF_PageNode *last; + MSF_UInt count; +} MSF_PageList; + +typedef struct MSF_Stream +{ + MSF_StreamNumber sn; + MSF_UInt size; + MSF_UInt pos; + MSF_PageNode *pos_page; + MSF_PageList page_list; +} MSF_Stream; + +typedef struct MSF_StreamNode +{ + struct MSF_StreamNode *next; + struct MSF_StreamNode *prev; + MSF_Stream data; +} MSF_StreamNode; + +typedef struct MSF_StreamList +{ + MSF_UInt count; + MSF_StreamNode *first; + MSF_StreamNode *last; +} MSF_StreamList; + +typedef struct MSF_PageDataNode +{ + struct MSF_PageDataNode *next; + struct 
MSF_PageDataNode *prev; + U8 *data; +} MSF_PageDataNode; + +typedef struct MSF_PageDataList +{ + MSF_PageDataNode *first; + MSF_PageDataNode *last; + MSF_UInt count; +} MSF_PageDataList; + +typedef struct MSF_Context +{ + Arena *arena; + MSF_UInt page_size; + MSF_UInt active_fpm; + MSF_UInt fpm_rover; + MSF_PageNumber page_count; + MSF_PageDataList page_data_list; + MSF_PageDataList page_data_pool; + MSF_PageList header_page_list; + MSF_PageList root_page_list; + MSF_PageList st_page_list; + MSF_PageList page_pool; + MSF_StreamList st; +} MSF_Context; + +typedef enum MSF_Error +{ + MSF_Error_OK, + + // if you get this error this means stream table was divided into too many + // pages, and to fix this you need to bump up the page size + MSF_Error_STREAM_TABLE_HAS_TOO_MANY_PAGES, + + MSF_OpenError_NOT_ENOUGH_BYTES_TO_READ_HEADER, + MSF_OpenError_INVALID_MAGIC, + MSF_OpenError_PAGE_SIZE_IS_NOT_POW2, + MSF_OpenError_INVALID_PAGE_SIZE, + MSF_OpenError_NOT_ENOUGH_PAGES_TO_INIT, + MSF_OpenError_INVALID_ROOT_STREAM_PAGE_NUMBER, + MSF_OpenError_UNABLE_TO_READ_STREAM_TABLE_PAGE_NUMBERS, + MSF_OpenError_STREAM_COUNT_OVERFLOW, + MSF_OpenError_UNABLE_TO_READ_STREAM_SIZES, + MSF_OpenError_INVALID_STREAM_TABLE, + MSF_OpenError_INVALID_ACTIVE_FPM, + MSF_OpenError_PAGE_COUNT_DOESNT_MATCH_DATA_SIZE, + + MSF_BuildError_UNABLE_TO_WRITE_STREAM_TABLE, + MSF_BuildError_UNABLE_TO_WRITE_STREAM_TABLE_PAGE_NUMBER_DIRECTORY, + MSF_BuildError_UNABLE_TO_WRITE_ROOT_DIRECTORY, + MSF_BuildError_UNABLE_TO_WRITE_HEADER, +} MSF_Error; + +//////////////////////////////// + +typedef struct +{ + MSF_UInt page_size; + MSF_PageDataList page_data_list; + MSF_PageList page_list; + MSF_UInt stream_pos; + String8 data; + Rng1U64 *range_arr; +} MSF_WriteTask; + +//////////////////////////////// + +internal MSF_Context * msf_alloc(MSF_UInt page_size, MSF_UInt active_fpm); +internal MSF_Error msf_open(String8 data, MSF_Context **msf_out); +internal void msf_release(MSF_Context **msf_ptr); +internal MSF_Error 
msf_build(MSF_Context *msf); +internal U64 msf_get_save_size(MSF_Context *msf); +internal String8List msf_get_page_data_nodes(Arena *arena, MSF_Context *msf); +internal B32 msf_save(MSF_Context *msf, void *buffer, U64 buffer_size); +internal MSF_Error msf_save_arena(Arena *arena, MSF_Context *msf, String8 *data_out); +internal MSF_StreamNode * msf_find_stream_node(MSF_Context *msf, MSF_StreamNumber sn); +internal MSF_Stream * msf_find_stream(MSF_Context *msf, MSF_StreamNumber sn); +internal B32 msf_grow(MSF_Context *msf, MSF_PageNumber page_count); +internal MSF_PageNumber * msf_alloc_pn_arr(Arena *arena, MSF_Context *msf, MSF_UInt alloc_count); +internal void msf_free_pn_arr(MSF_Context *msf, MSF_PageNumber *pn_arr, MSF_UInt pn_count); +internal MSF_PageList msf_alloc_pages(MSF_Context *msf, MSF_UInt alloc_count); +internal void msf_free_pages(MSF_Context *msf, MSF_PageList *page_list); + +internal MSF_StreamNumber msf_stream_alloc_ex(MSF_Context *msf, MSF_UInt size); +internal MSF_StreamNumber msf_stream_alloc(MSF_Context *msf); +internal B32 msf_stream_free(MSF_Context *msf, MSF_StreamNumber sn); +internal B32 msf_stream_resize(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt new_size); +internal B32 msf_stream_resize_ex(MSF_Context *msf, MSF_Stream *stream, MSF_UInt size); +internal void msf_stream_set_size(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt size); +internal MSF_UInt msf_stream_get_size(MSF_Context *msf, MSF_StreamNumber sn); +internal MSF_UInt msf_stream_get_cap(MSF_Context *msf, MSF_StreamNumber); +internal MSF_UInt msf_stream_get_pos(MSF_Context *msf, MSF_StreamNumber sn); +internal void msf_stream_align(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt align); +internal B32 msf_stream_reserve(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt size); +internal B32 msf_stream_seek(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt new_pos); +internal B32 msf_stream_seek_start(MSF_Context *msf, MSF_StreamNumber sn); +internal B32 
msf_stream_seek_end(MSF_Context *msf, MSF_StreamNumber sn); + +internal MSF_UInt msf_stream_read(MSF_Context *msf, MSF_StreamNumber sn, void *dst, MSF_UInt dst_len); +internal String8 msf_stream_read_block(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, U64 block_size); +internal String8 msf_stream_read_string(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn); +internal S8 msf_stream_read_s8(MSF_Context *msf, MSF_StreamNumber sn); +internal S16 msf_stream_read_s16(MSF_Context *msf, MSF_StreamNumber sn); +internal S32 msf_stream_read_s32(MSF_Context *msf, MSF_StreamNumber sn); +internal S64 msf_stream_read_s64(MSF_Context *msf, MSF_StreamNumber sn); +internal U8 msf_stream_read_u8(MSF_Context *msf, MSF_StreamNumber sn); +internal U16 msf_stream_read_u16(MSF_Context *msf, MSF_StreamNumber sn); +internal U32 msf_stream_read_u32(MSF_Context *msf, MSF_StreamNumber sn); +internal U64 msf_stream_read_u64(MSF_Context *msf, MSF_StreamNumber sn); +#define msf_stream_read_array(msf, sn, ptr, count) msf_stream_read(msf, sn, ptr, sizeof(*ptr) * (count)) +#define msf_stream_read_struct(msf, sn, ptr) msf_stream_read_array(msf, sn, ptr, 1) + +internal B32 msf_stream_write(MSF_Context *msf, MSF_StreamNumber sn, void *buffer, MSF_UInt buffer_size); +internal B32 msf_stream_write_string(MSF_Context *msf, MSF_StreamNumber sn, String8 string); +internal B32 msf_stream_write_list(MSF_Context *msf, MSF_StreamNumber sn, String8List list); +internal B32 msf_stream_write_uint(MSF_Context *msf, MSF_StreamNumber sn, MSF_UInt value); +internal B32 msf_stream_write_cstr(MSF_Context *msf, MSF_StreamNumber sn, String8 string); +internal B32 msf_stream_write_u8(MSF_Context *msf, MSF_StreamNumber sn, U8 value); +internal B32 msf_stream_write_u16(MSF_Context *msf, MSF_StreamNumber sn, U16 value); +internal B32 msf_stream_write_u32(MSF_Context *msf, MSF_StreamNumber sn, U32 value); +internal B32 msf_stream_write_u64(MSF_Context *msf, MSF_StreamNumber sn, U64 value); +internal B32 
msf_stream_write_s8(MSF_Context *msf, MSF_StreamNumber sn, S8 value); +internal B32 msf_stream_write_s16(MSF_Context *msf, MSF_StreamNumber sn, S16 value); +internal B32 msf_stream_write_s32(MSF_Context *msf, MSF_StreamNumber sn, S32 value); +internal B32 msf_stream_write_s64(MSF_Context *msf, MSF_StreamNumber sn, S64 value); +internal B32 msf_stream_write_parallel(TP_Context *tp, MSF_Context *msf, MSF_StreamNumber sn, void *buffer, MSF_UInt buffer_size); +#define msf_stream_write_array(m, s, v, c) msf_stream_write(m, s, (void*)(v), sizeof(*(v)) * (c)) +#define msf_stream_write_struct(m, s, v ) msf_stream_write_array(m, s, v, 1) + +internal MSF_UInt msf_count_pages(MSF_UInt page_size, U64 data_size); +internal MSF_PageNumber msf_get_page_count_cap(MSF_PageDataList page_data_list, MSF_UInt page_size); +internal MSF_UInt msf_get_fpm_interval_correct(MSF_UInt page_size); +internal MSF_UInt msf_get_fpm_interval_wrong(MSF_UInt page_size); +internal MSF_UInt msf_get_fpm_idx_from_pn(MSF_UInt page_size, MSF_PageNumber pn); +internal MSF_UInt msf_get_fpm_page_bit_state(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber active_fpm, MSF_PageNumber pn); +internal void msf_set_fpm_bit(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageNumber active_fpm, MSF_PageNumber pn, B32 state); +internal B32 msf_write(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageList page_list, MSF_UInt offset, void *buffer, MSF_UInt buffer_size); +internal MSF_UInt msf_read(MSF_PageDataList page_data_list, MSF_UInt page_size, MSF_PageList page_list, MSF_UInt offset, void *buffer, MSF_UInt buffer_size); + +internal char * msf_error_to_string(MSF_Error code); + diff --git a/src/linker/pdb_ext/pdb.c b/src/linker/pdb_ext/pdb.c new file mode 100644 index 00000000..f0b3447c --- /dev/null +++ b/src/linker/pdb_ext/pdb.c @@ -0,0 +1,34 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal U32 
+pdb_hash_udt(CV_UDTInfo udt_info, String8 data) +{ + B32 is_fwdref = !!(udt_info.props & CV_TypeProp_FwdRef); + B32 is_scoped = !!(udt_info.props & CV_TypeProp_Scoped); + B32 has_unique_name = !!(udt_info.props & CV_TypeProp_HasUniqueName); + B32 is_anon = has_unique_name && cv_is_udt_name_anon(udt_info.name); + + U32 hash = 0; + // dbi/tpi.cpp:1918 + if (!is_fwdref && !is_scoped && !is_anon) { + hash = pdb_hash_v1(udt_info.name); + } + // dbi/tpi.cpp:1937 + else if (!is_fwdref && has_unique_name && is_scoped && !is_anon) { + hash = pdb_hash_v1(udt_info.unique_name); + } + // dbi/tpi.cpp 1338 + else { + hash = pdb_hash_v1(data); + } + + return hash; +} + +internal U32 +pdb_crc32_from_string(String8 string) +{ + return ~update_crc32(~0, string.str, string.size); +} + diff --git a/src/linker/pdb_ext/pdb.h b/src/linker/pdb_ext/pdb.h new file mode 100644 index 00000000..3101a935 --- /dev/null +++ b/src/linker/pdb_ext/pdb.h @@ -0,0 +1,7 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +internal U32 pdb_hash_udt(CV_UDTInfo udt_info, String8 data); +internal U32 pdb_crc32_from_string(String8 string); diff --git a/src/linker/pdb_ext/pdb_builder.c b/src/linker/pdb_ext/pdb_builder.c new file mode 100644 index 00000000..77e77376 --- /dev/null +++ b/src/linker/pdb_ext/pdb_builder.c @@ -0,0 +1,3709 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +//////////////////////////////// + +internal U64 +pdb_hash_table_compute_load_factor(U64 count) +{ + // PDB/include/map.h:cdrLoadMax() + U64 load_factor = count * 2/3 + 1; + return load_factor; +} + +internal void +pdb_hash_table_alloc(PDB_HashTable *ht, U32 max) +{ + ProfBeginFunction(); + ht->arena = arena_alloc(); + ht->bucket_arr = push_array(ht->arena, PDB_HashTableBucket, max); + ht->present_bits = bit_array_init32(ht->arena, max); + ht->deleted_bits = 
bit_array_init32(ht->arena, max);
+  ht->max   = max;
+  ht->count = 0;
+  bit_array_set_bit_range32(ht->deleted_bits, rng_1u64(0, max), 1);
+  ProfEnd();
+}
+
+internal void
+pdb_hash_table_release(PDB_HashTable *ht)
+{
+  ProfBeginFunction();
+  arena_release(ht->arena);
+  MemoryZeroStruct(ht);
+  ProfEnd();
+}
+
+// Parses a serialized PDB hash table out of `data` and stores the result in `ht`.
+//
+// Layout consumed here: when `has_local_data` is set, a U32 size followed by that many
+// bytes of local data; then U32 count, U32 max, the present bit vector, the deleted bit
+// vector, and one packed key/value pair per present bucket (decoded by `unpack_func`,
+// which receives `unpack_ud` and returns non-zero on failure).
+//
+// Returns:
+//  - PDB_HashTableParseError_OK: `ht` now owns a freshly allocated arena holding the table.
+//  - PDB_HashTableParseError_OUT_OF_BYTES: a fixed-size header field did not fit in `data`.
+//  - PDB_HashTableParseError_CORRUPTED: header fields are inconsistent or a bucket failed
+//    to unpack.
+// `ht` is not written on failure. `read_bytes_out` may be null; on success it receives the
+// number of bytes consumed from `data`.
+internal PDB_HashTableParseError
+pdb_hash_table_from_data(PDB_HashTable           *ht,
+                         String8                  data,
+                         B32                      has_local_data,
+                         PDB_HashTableUnpackFunc *unpack_func,
+                         void                    *unpack_ud,
+                         U64                     *read_bytes_out)
+{
+  ProfBeginFunction();
+  PDB_HashTableParseError error = PDB_HashTableParseError_OK;
+
+  U64 cursor = 0;
+
+  U32      local_data_size = 0;
+  String8  local_data      = str8(0,0);
+  U32      count           = 0;
+  U32      max             = 0;
+  U32Array present_bits    = {0};
+  U32Array deleted_bits    = {0};
+
+  do {
+    error = PDB_HashTableParseError_OUT_OF_BYTES;
+
+    if (has_local_data) {
+      if (cursor + sizeof(local_data_size) > data.size) {
+        break;
+      }
+      cursor += str8_deserial_read_struct(data, cursor, &local_data_size);
+      if (cursor + local_data_size > data.size) {
+        break;
+      }
+      cursor += str8_deserial_read_block(data, cursor, local_data_size, &local_data);
+    }
+
+    if (cursor + sizeof(count) > data.size) {
+      break;
+    }
+    cursor += str8_deserial_read_struct(data, cursor, &count);
+    if (cursor + sizeof(max) > data.size) {
+      break;
+    }
+    cursor += str8_deserial_read_struct(data, cursor, &max);
+    // NOTE(review): the bit vector reads are not bounds checked here; presumably
+    // pdb_read_bit_vector_string clamps to data.size -- confirm.
+    cursor += pdb_read_bit_vector_string(data, cursor, &present_bits);
+    cursor += pdb_read_bit_vector_string(data, cursor, &deleted_bits);
+
+    error = PDB_HashTableParseError_OK;
+  } while(0);
+
+  if (error == PDB_HashTableParseError_OK) {
+    U64 load_factor       = pdb_hash_table_compute_load_factor(max);
+    B32 is_count_ok       = count < max;
+    B32 is_load_factor_ok = count < load_factor;
+    if (is_count_ok && is_load_factor_ok) {
+      Arena               *arena            = arena_alloc();
+      PDB_HashTableBucket *bucket_arr       = push_array_no_zero(arena, PDB_HashTableBucket, max);
+      U32Array             present_bits_new = bit_array_init32(arena, max);
+      U32Array             deleted_bits_new = bit_array_init32(arena, max);
+
+      // The serialized vectors are sized in U32 words, so they must be compared against
+      // the word capacity of the arrays we just allocated (not against a bit count),
+      // otherwise the copies below can run past the end of the new arrays.
+      B32 is_present_bits_ok = present_bits.count <= present_bits_new.count;
+      B32 is_deleted_bits_ok = deleted_bits.count <= deleted_bits_new.count;
+
+      if (is_present_bits_ok && is_deleted_bits_ok) {
+        MemoryCopyTyped(&present_bits_new.v[0], &present_bits.v[0], present_bits.count);
+        MemoryCopyTyped(&deleted_bits_new.v[0], &deleted_bits.v[0], deleted_bits.count);
+
+        // unpack buckets
+        U64 read_count = 0;
+        for (U64 bucket_idx = 0; bucket_idx < max; bucket_idx += 1) {
+          if (bit_array_is_bit_set(present_bits_new, bucket_idx)) {
+            // a bucket cannot be both live and a tombstone
+            if (bit_array_is_bit_set(deleted_bits_new, bucket_idx)) {
+              error = PDB_HashTableParseError_CORRUPTED;
+              break;
+            }
+            // more present bits than the header's element count
+            if (read_count >= count) {
+              error = PDB_HashTableParseError_CORRUPTED;
+              break;
+            }
+
+            String8 key;
+            String8 value;
+            B32 has_unpack_failed = unpack_func(unpack_ud, local_data, data, &cursor, &key, &value);
+            if (has_unpack_failed) {
+              error = PDB_HashTableParseError_CORRUPTED;
+              break;
+            }
+
+            bucket_arr[bucket_idx].key   = key;
+            bucket_arr[bucket_idx].value = value;
+
+            read_count += 1;
+          }
+        }
+      } else {
+        error = PDB_HashTableParseError_CORRUPTED;
+      }
+
+      if (error == PDB_HashTableParseError_OK) {
+        ht->arena        = arena;
+        ht->bucket_arr   = bucket_arr;
+        ht->present_bits = present_bits_new;
+        ht->deleted_bits = deleted_bits_new;
+        ht->count        = count;
+        ht->max          = max;
+
+        if (read_bytes_out) {
+          // TBH data format should tell parser upfront size of the hash table
+          *read_bytes_out = cursor;
+        }
+      } else {
+        arena_release(arena);
+      }
+    } else {
+      error = PDB_HashTableParseError_CORRUPTED;
+    }
+  }
+
+  ProfEnd();
+  return error;
+}
+
+internal String8
+pdb_data_from_hash_table(Arena *arena,
+                         PDB_HashTable *ht,
+                         B32 has_local_data,
+                         PDB_HashTablePackFunc *pack_func,
+                         void *pack_ud)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena, 1);
+
+  String8Array key_arr = {0};
+  String8Array value_arr = {0};
+  pdb_hash_table_get_present_keys_and_values(scratch.arena, ht, &key_arr, &value_arr);
+
+  String8List local_data_srl = {0};
+  String8List
key_value_srl = {0}; + str8_serial_begin(scratch.arena, &local_data_srl); + str8_serial_begin(scratch.arena, &key_value_srl); + + for (U64 i = 0; i < ht->count; i += 1) { + String8 key = key_arr.v[i]; + String8 value = value_arr.v[i]; + pack_func(scratch.arena, &local_data_srl, &key_value_srl, key, value, pack_ud); + } + + // serialize hash table + String8List srl = {0}; + str8_serial_begin(scratch.arena, &srl); + if (has_local_data) { + U32 local_data_size32 = safe_cast_u32(local_data_srl.total_size); + str8_serial_push_u32(scratch.arena, &srl, local_data_size32); + str8_list_concat_in_place(&srl, &local_data_srl); + } + str8_serial_push_u32(scratch.arena, &srl, ht->count); + str8_serial_push_u32(scratch.arena, &srl, ht->max); + str8_serial_push_u32(scratch.arena, &srl, ht->present_bits.count); + str8_serial_push_array(scratch.arena, &srl, &ht->present_bits.v[0], ht->present_bits.count); + str8_serial_push_u32(scratch.arena, &srl, ht->deleted_bits.count); + str8_serial_push_array(scratch.arena, &srl, &ht->deleted_bits.v[0], ht->deleted_bits.count); + str8_list_concat_in_place(&srl, &key_value_srl); + String8 data = str8_serial_end(arena, &srl); + + scratch_end(scratch); + ProfEnd(); + return data; +} + +internal void +pdb_hash_table_grow(PDB_HashTable *ht, U64 new_capacity) +{ + ProfBeginFunction(); + PDB_HashTable new_ht; + pdb_hash_table_alloc(&new_ht, new_capacity); + for (U32 i = 0; i < ht->max; ++i) { + if (bit_array_is_bit_set(ht->present_bits, i)) { + PDB_HashTableBucket *bucket = &ht->bucket_arr[i]; + B32 is_set = pdb_hash_table_try_set(&new_ht, bucket->key, bucket->value); + Assert(is_set); + } + } + pdb_hash_table_release(ht); + *ht = new_ht; + ProfEnd(); +} + +internal U32 +pdb_hash_table_hash(String8 key) +{ + return (U16)pdb_hash_v1(key); +} + +internal B32 +pdb_hash_table_try_set(PDB_HashTable *ht, String8 key, String8 value) +{ + ProfBeginFunction(); + B32 is_set = 0; + U32 best_ibucket = pdb_hash_table_hash(key) % ht->max; + U32 ibucket = 
best_ibucket;
+  do {
+    B32 is_present = pdb_hash_table_is_present(ht, ibucket);
+    if ( ! is_present) {
+      PDB_HashTableBucket *bucket = &ht->bucket_arr[ibucket];
+      bucket->key   = push_str8_copy(ht->arena, key);
+      bucket->value = push_str8_copy(ht->arena, value);
+
+      bit_array_set_bit32(ht->present_bits, ibucket, 1);
+      bit_array_set_bit32(ht->deleted_bits, ibucket, 0);
+
+      ht->count += 1;
+      is_set = 1;
+      break;
+    }
+    ibucket = (ibucket + 1) % ht->max;
+  } while (ibucket != best_ibucket);
+  ProfEnd();
+  return is_set;
+}
+
+internal void
+pdb_hash_table_set(PDB_HashTable *ht, String8 key, String8 value)
+{
+  ProfBeginFunction();
+
+  // should resize?
+  U64 load_factor = pdb_hash_table_compute_load_factor(ht->max);
+  if (ht->count + 1 >= load_factor) {
+    pdb_hash_table_grow(ht, ht->max * 2);
+  }
+
+  // set new item
+  B32 is_set = pdb_hash_table_try_set(ht, key, value);
+  AssertAlways(is_set);
+
+  ProfEnd();
+}
+
+internal B32
+pdb_hash_table_get(PDB_HashTable *ht, String8 key, String8 *value_out)
+{
+  ProfBeginFunction();
+  B32 is_get_ok = 0;
+  U32 best_ibucket = pdb_hash_table_hash(key) % ht->max;
+  U32 ibucket = best_ibucket;
+  do {
+    B32 is_present = pdb_hash_table_is_present(ht, ibucket);
+    if (is_present) {
+      PDB_HashTableBucket *bucket = &ht->bucket_arr[ibucket];
+      B32 is_match = str8_match(bucket->key, key, 0);
+      if (is_match) {
+        *value_out = bucket->value;
+        is_get_ok = 1;
+        break;
+      }
+    } else {
+      break;
+    }
+    ibucket = (ibucket + 1) % ht->max;
+  } while (ibucket != best_ibucket);
+  ProfEnd();
+  return is_get_ok;
+}
+
+// Removes `key` from the table if it is stored: clears the bucket's present bit, sets its
+// deleted bit and decrements the element count. Does nothing when the key is not found.
+//
+// Probing starts at the key's home bucket and stops at the first non-present bucket or
+// after one full lap around the table.
+//
+// NOTE(review): both this function and pdb_hash_table_get stop probing at a non-present
+// bucket without consulting the deleted bit, so removing a key may hide later keys of the
+// same probe chain -- confirm whether tombstones are meant to be skipped.
+internal void
+pdb_hash_table_delete(PDB_HashTable *ht, String8 key)
+{
+  ProfBeginFunction();
+  U32 best_ibucket = pdb_hash_table_hash(key) % ht->max;
+  U32 ibucket = best_ibucket;
+  do {
+    B32 is_present = pdb_hash_table_is_present(ht, ibucket);
+    if (!is_present) {
+      break;
+    }
+    PDB_HashTableBucket *bucket = &ht->bucket_arr[ibucket];
+    // Compare whole strings, exactly like pdb_hash_table_get. Comparing only key.size
+    // bytes would treat `key` as a match for any stored key it is a prefix of, and would
+    // read past the end of stored keys shorter than `key`.
+    if (str8_match(bucket->key, key, 0)) {
+      bit_array_set_bit32(ht->present_bits, ibucket, 0);
+      bit_array_set_bit32(ht->deleted_bits, ibucket, 1);
+      ht->count -= 1;
+      break;
+    }
+    ibucket = (ibucket + 1) % ht->max;
+  } while (ibucket != best_ibucket);
+  ProfEnd();
+}
+
+internal B32
+pdb_hash_table_is_present(PDB_HashTable *ht, U32 k)
+{
+  Assert(k < ht->max);
+  return bit_array_is_bit_set(ht->present_bits, k);
+}
+
+internal B32
+pdb_hash_table_is_deleted(PDB_HashTable *ht, U32 k)
+{
+  Assert(k < ht->max);
+  return bit_array_is_bit_set(ht->deleted_bits, k);
+}
+
+internal void
+pdb_hash_table_get_present_keys_and_values(Arena *arena, PDB_HashTable *ht, String8Array *keys_out, String8Array *values_out)
+{
+  *keys_out   = str8_array_reserve(arena, ht->count);
+  *values_out = str8_array_reserve(arena, ht->count);
+  for (U64 bucket_idx = 0; bucket_idx < ht->max; bucket_idx += 1) {
+    if (bit_array_is_bit_set(ht->present_bits, bucket_idx)) {
+      PDB_HashTableBucket *bucket = &ht->bucket_arr[bucket_idx];
+      Assert(keys_out->count < ht->count);
+      keys_out->v[keys_out->count++]     = bucket->key;
+      values_out->v[values_out->count++] = bucket->value;
+    }
+  }
+}
+
+////////////////////////////////
+
+// Unpacks one named-stream entry: a U32 offset of the stream name inside `local_data`,
+// followed by a U32-sized stream number block. The key is the name (a view into
+// `local_data`), the value is the raw stream number bytes.
+// Returns 0 on success and 1 when the entry does not fit in `key_value_data` or the name
+// offset lies outside `local_data`, matching the other unpack callbacks.
+PDB_HASH_TABLE_UNPACK_FUNC(pdb_named_stream_ht_unpack)
+{
+  Assert(!ud);
+
+  if (*key_value_cursor + sizeof(U32) > key_value_data.size) {
+    return 1;
+  }
+  U32 key_data_offset = max_U32;
+  *key_value_cursor += str8_deserial_read_struct(key_value_data, *key_value_cursor, &key_data_offset);
+
+  // the name must start inside the local data block, otherwise the pointer below would
+  // begin past the cap handed to str8_cstring_capped
+  if (key_data_offset > local_data.size) {
+    return 1;
+  }
+  U8 *cstr_ptr = local_data.str + key_data_offset;
+  U8 *cstr_opl = local_data.str + local_data.size;
+  String8 stream_name = str8_cstring_capped(cstr_ptr, cstr_opl);
+
+  // NOTE: stream number is U16 but in the reference they cast to U32
+  if (*key_value_cursor + sizeof(U32) > key_value_data.size) {
+    return 1;
+  }
+  String8 stream_number = {0};
+  *key_value_cursor += str8_deserial_read_block(key_value_data, *key_value_cursor, sizeof(U32), &stream_number);
+
+  *key_out   = stream_name;
+  *value_out = stream_number;
+
+  return 0;
+}
+
+PDB_HASH_TABLE_UNPACK_FUNC(pdb_hash_adj_ht_unpack)
+{
+  Assert(local_data.size == 0);
+
+  if (*key_value_cursor + sizeof(PDB_StringOffset) > key_value_data.size){
+
return 1;
+  }
+  PDB_StringOffset string_offset = 0;
+  *key_value_cursor += str8_deserial_read_struct(key_value_data, *key_value_cursor, &string_offset);
+
+  if (*key_value_cursor + sizeof(CV_TypeIndex) > key_value_data.size) {
+    return 1;
+  }
+  String8 type_index = {0};
+  *key_value_cursor += str8_deserial_read_block(key_value_data, *key_value_cursor, sizeof(CV_TypeIndex), &type_index);
+
+  PDB_StringTable *strtab = (PDB_StringTable*)ud;
+  String8 type_name = pdb_strtab_string_from_offset(strtab, string_offset);
+
+  *key_out   = type_name;
+  *value_out = type_index;
+
+  return 0;
+}
+
+// Unpacks one source-header-block entry: a PDB_StringOffset of the path (resolved through
+// the string table passed as `ud`), followed by a PDB_SrcHeaderBlockEntry. The key is the
+// path string, the value is the raw entry bytes.
+// Returns 0 on success and 1 when either field does not fit in `key_value_data`.
+PDB_HASH_TABLE_UNPACK_FUNC(pdb_src_header_block_ht_unpack)
+{
+  if (*key_value_cursor + sizeof(PDB_StringOffset) > key_value_data.size) {
+    return 1;
+  }
+  PDB_StringOffset path_offset = 0;
+  *key_value_cursor += str8_deserial_read_struct(key_value_data, *key_value_cursor, &path_offset);
+
+  // The entry is read at the cursor, so the cursor is what has to be bounds checked;
+  // path_offset indexes the string table and says nothing about key_value_data.
+  if (*key_value_cursor + sizeof(PDB_SrcHeaderBlockEntry) > key_value_data.size) {
+    return 1;
+  }
+  String8 src_header_block_entry = {0};
+  *key_value_cursor += str8_deserial_read_block(key_value_data, *key_value_cursor, sizeof(PDB_SrcHeaderBlockEntry), &src_header_block_entry);
+
+  PDB_StringTable *strtab = (PDB_StringTable*)ud;
+  String8 path = pdb_strtab_string_from_offset(strtab, path_offset);
+
+  *key_out   = path;
+  *value_out = src_header_block_entry;
+
+  return 0;
+}
+
+PDB_HASH_TABLE_PACK_FUNC(pdb_named_stream_ht_pack)
+{
+  Assert(!ud);
+  Assert(value.size == sizeof(U32));
+
+  U64 key_data_offset = local_data_srl->total_size;
+  str8_serial_push_cstr(arena, local_data_srl, key);
+
+  U32 key_data_offset32 = safe_cast_u32(key_data_offset);
+  str8_serial_push_u32(arena, key_value_srl, key_data_offset32);
+  str8_serial_push_string(arena, key_value_srl, value);
+}
+
+PDB_HASH_TABLE_PACK_FUNC(pdb_hash_adj_ht_pack)
+{
+  Assert(value.size == sizeof(CV_TypeIndex));
+
+  PDB_StringTable *strtab = (PDB_StringTable*)ud;
+
+  PDB_StringIndex string_idx = PDB_INVALID_STRING_INDEX;
+  B32 is_found =
pdb_strtab_search(strtab, key, &string_idx); + Assert(is_found); + + PDB_StringOffset type_name_offset = pdb_strtab_string_to_offset(strtab, string_idx); + + str8_serial_push_struct(arena, key_value_srl, &type_name_offset); + str8_serial_push_string(arena, key_value_srl, value); +} + +PDB_HASH_TABLE_PACK_FUNC(pdb_src_header_block_ht_pack) +{ + Assert(value.size == sizeof(PDB_SrcHeaderBlockEntry)); + + PDB_StringTable *strtab = (PDB_StringTable*)ud; + + PDB_StringIndex path_idx = 0; + B32 is_found = pdb_strtab_search(strtab, key, &path_idx); + Assert(is_found); + + PDB_StringOffset path_offset = pdb_strtab_string_to_offset(strtab, path_idx); + + str8_serial_push_struct(arena, key_value_srl, &path_offset); + str8_serial_push_string(arena, key_value_srl, value); +} + +//////////////////////////////// + +internal PDB_HashTableParseError +pdb_hash_adj_hash_table_from_data(PDB_HashTable *ht, String8 data, PDB_StringTable *strtab, U64 *read_bytes_out) +{ + return pdb_hash_table_from_data(ht, data, 0, pdb_hash_adj_ht_unpack, strtab, read_bytes_out); +} + +internal PDB_HashTableParseError +pdb_src_header_block_ht_from_data(PDB_HashTable *ht, String8 data, PDB_StringTable *strtab, U64 *read_bytes_out) +{ + return pdb_hash_table_from_data(ht, data, 0, pdb_src_header_block_ht_unpack, strtab, read_bytes_out); +} + +internal PDB_HashTableParseError +pdb_named_stream_ht_from_data(PDB_HashTable *ht, String8 data, U64 *read_bytes_out) +{ + return pdb_hash_table_from_data(ht, data, 1, pdb_named_stream_ht_unpack, 0, read_bytes_out); +} + +internal String8 +pdb_data_from_hash_adj_hash_table(Arena *arena, PDB_HashTable *ht, PDB_StringTable *strtab) +{ + String8 data = pdb_data_from_hash_table(arena, ht, 0, pdb_hash_adj_ht_pack, strtab); + return data; +} + +internal String8 +pdb_data_from_src_header_block_ht(Arena *arena, PDB_HashTable *ht, PDB_StringTable *strtab) +{ + String8 data = pdb_data_from_hash_table(arena, ht, 0, pdb_src_header_block_ht_pack, strtab); + return data; +} + 
+internal String8
+pdb_data_from_named_stream_ht(Arena *arena, PDB_HashTable *ht)
+{
+  String8 data = pdb_data_from_hash_table(arena, ht, 1, pdb_named_stream_ht_pack, 0);
+  return data;
+}
+
+////////////////////////////////
+
+// Initializes `strtab` as an empty version-1 string table backed by its own arena.
+// The bucket capacity is 1.3x `max` (at least one bucket), every string-index slot starts
+// out unmapped (0xff fill), and the empty string is added as the first entry.
+internal void
+pdb_strtab_alloc(PDB_StringTable *strtab, U32 max)
+{
+  ProfBeginFunction();
+  strtab->arena         = arena_alloc();
+  strtab->version       = 1;
+  strtab->size          = 0;
+  strtab->bucket_count  = 0;
+  strtab->bucket_max    = (U64)((F64)max * 1.3);
+  if (strtab->bucket_max == 0) {
+    // pdb_strtab_hash reduces modulo bucket_max, and the null string added below needs
+    // a slot, so an empty request still gets one bucket
+    strtab->bucket_max = 1;
+  }
+  strtab->ibucket_array = push_array(strtab->arena, U32, strtab->bucket_max);
+  MemorySet(strtab->ibucket_array, 0xff, sizeof(strtab->ibucket_array[0]) * strtab->bucket_max);
+  strtab->bucket_array  = push_array(strtab->arena, PDB_StringTableBucket *, strtab->bucket_max);
+
+  // string table always has a null for first entry
+  pdb_strtab_add(strtab, str8_lit(""));
+
+  ProfEnd();
+}
+
+// Reads a serialized string table from MSF stream `sn` into `strtab`.
+//
+// Stream layout consumed: PDB_StringTableHeader, U32 string buffer size, string buffer,
+// U32 bucket capacity, one U32 string offset per bucket (0 marks an empty bucket), U32
+// string count.
+//
+// On success `strtab` owns a new arena holding the string buffer and buckets. On failure
+// `strtab` is left untouched, any partially built state is released, and the returned
+// code says why.
+//
+// NOTE(review): offset 0 is treated as "empty bucket", so the null string that
+// pdb_strtab_alloc registers as string index 0 gets no bucket here and string indices of
+// an opened table start at the first non-empty string -- confirm this is intended before
+// growing an opened table.
+internal PDB_StringTableOpenError
+pdb_strtab_open(PDB_StringTable *strtab, MSF_Context *msf, MSF_StreamNumber sn)
+{
+  ProfBeginFunction();
+
+  PDB_StringTableOpenError err = PDB_StringTableOpenError_OK;
+
+  Arena                  *arena         = 0;
+  String8                 string_buffer = {0};
+  U32                     bucket_max    = 0;
+  U32                     bucket_count  = 0;
+  U32                    *ibucket_array = 0;
+  PDB_StringTableBucket **bucket_array  = 0;
+
+  PDB_StringTableHeader header = {0};
+  msf_stream_read_struct(msf, sn, &header);
+
+  if (header.magic == PDB_StringTableHeader_MAGIC) {
+    if (header.version == PDB_StringTableHeader_CurrentVersion) {
+      Temp scratch = scratch_begin(0,0);
+
+      arena = arena_alloc();
+
+      U32     string_size;
+      String8 offset_buffer;
+
+      // read table data
+      string_size   = msf_stream_read_u32(msf, sn);
+      string_buffer = msf_stream_read_block(arena, msf, sn, string_size);
+      bucket_max    = msf_stream_read_u32(msf, sn);
+      offset_buffer = msf_stream_read_block(scratch.arena, msf, sn, bucket_max * sizeof(U32));
+      bucket_count  = msf_stream_read_u32(msf, sn);
+
+      // every field read above, including the U32 string buffer size
+      U64 expected_size = sizeof(PDB_StringTableHeader) +
+                          sizeof(string_size) +
+                          string_buffer.size +
+                          sizeof(bucket_max) +
+                          offset_buffer.size +
+                          sizeof(bucket_count);
+      U64 actual_size = msf_stream_get_size(msf, sn);
+
+      if (expected_size <= actual_size &&
+          string_buffer.size == string_size &&
+          offset_buffer.size == sizeof(U32)*bucket_max &&
+          bucket_count <= bucket_max) {
+        // init string table
+        ibucket_array = push_array_no_zero(arena, U32, bucket_max);
+        bucket_array  = push_array_no_zero(arena, PDB_StringTableBucket *, bucket_max);
+
+        // ibucket_array maps string index -> bucket index; start with every string index
+        // unmapped, using the same 0xff fill pdb_strtab_alloc uses
+        MemorySet(ibucket_array, 0xff, sizeof(ibucket_array[0]) * bucket_max);
+
+        // open buckets
+        PDB_StringTableBucket *node_arr = push_array_no_zero(arena, PDB_StringTableBucket, bucket_count);
+        U8  *string_buffer_ptr = string_buffer.str;
+        U8  *string_buffer_opl = string_buffer.str + string_buffer.size;
+        U32 *offset_array      = (U32*)offset_buffer.str;
+        U32  bucket_read_idx   = 0;
+
+        for (U32 bucket_idx = 0; bucket_idx < bucket_max; bucket_idx += 1) {
+          U32 string_offset = offset_array[bucket_idx];
+
+          // sanity check offset
+          if (string_offset >= string_buffer.size) {
+            err = PDB_StringTableOpenError_STRING_OFFSET_OUT_OF_BOUNDS;
+            break;
+          }
+
+          // empty bucket
+          else if (string_offset == 0) {
+            bucket_array[bucket_idx] = 0;
+          }
+
+          // bucket with string
+          else {
+            if (bucket_read_idx >= bucket_count) {
+              err = PDB_StringTableOpenError_OFFSETS_EXCEED_BUCKET_COUNT;
+              break;
+            }
+
+            // get bucket
+            PDB_StringTableBucket *bucket = &node_arr[bucket_read_idx];
+
+            // init bucket
+            bucket->data   = str8_cstring_capped(string_buffer_ptr + string_offset, string_buffer_opl);
+            bucket->offset = string_offset;
+            bucket->istr   = bucket_read_idx;
+
+            // assign bucket; the index array is keyed by string index (as written by
+            // pdb_strtab_add_ and read by pdb_strtab_string_to_offset), not by bucket
+            bucket_array[bucket_idx]       = bucket;
+            ibucket_array[bucket_read_idx] = bucket_idx;
+
+            // advance
+            bucket_read_idx += 1;
+          }
+        }
+      } else {
+        err = PDB_StringTableOpenError_CORRUPTED;
+      }
+
+      scratch_end(scratch);
+    } else {
+      err = PDB_StringTableOpenError_UNKNOWN_VERSION;
+    }
+  } else {
+    err = PDB_StringTableOpenError_BAD_MAGIC;
+  }
+
+  if (err == PDB_StringTableOpenError_OK) {
+    strtab->arena         = arena;
+    strtab->version       = header.version;
+    strtab->size          = string_buffer.size;
+    strtab->bucket_count  = bucket_count;
+    strtab->bucket_max    = bucket_max;
+    strtab->ibucket_array = ibucket_array;
+    strtab->bucket_array  = bucket_array;
+  } else {
+    if (arena) {
+      arena_release(arena);
+    }
+  }
+
+  ProfEnd();
+
+  return err;
+}
+
+internal void
+pdb_strtab_build(PDB_StringTable *strtab, MSF_Context *msf, MSF_StreamNumber sn)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0,0);
+
+  // serialize bucket data
+  U8  *string_buffer     = push_array_no_zero(scratch.arena, U8, strtab->size);
+  U32 *bucket_offset_arr = push_array(scratch.arena, U32, strtab->bucket_max);
+
+  for (U32 bucket_idx = 0; bucket_idx < strtab->bucket_max; bucket_idx += 1) {
+    PDB_StringTableBucket *bucket = strtab->bucket_array[bucket_idx];
+    if (bucket) {
+      // store string offset
+      Assert(bucket->offset + bucket->data.size <= strtab->size);
+      bucket_offset_arr[bucket_idx] = bucket->offset;
+
+      // write c string at bucket offset
+      U8 *str_ptr = string_buffer + bucket->offset;
+      MemoryCopy(str_ptr, bucket->data.str, bucket->data.size);
+      str_ptr[bucket->data.size] = '\0';
+    }
+  }
+
+  // fill out header
+  PDB_StringTableHeader header;
+  header.magic   = PDB_StringTableHeader_MAGIC;
+  header.version = strtab->version;
+
+  // reserve memory for entire string table
+  MSF_UInt reserve_size = sizeof(header) +
+                          sizeof(strtab->size) +
+                          strtab->size +
+                          sizeof(bucket_offset_arr[0]) * strtab->bucket_max +
+                          sizeof(strtab->bucket_count);
+  msf_stream_reserve(msf, sn, reserve_size);
+
+  // write out string table
+  msf_stream_write_struct(msf, sn, &header);
+  msf_stream_write_struct(msf, sn, &strtab->size);
+  msf_stream_write_array (msf, sn, string_buffer, strtab->size);
+  msf_stream_write_struct(msf, sn, &strtab->bucket_max);
+  msf_stream_write_array (msf, sn, bucket_offset_arr, strtab->bucket_max);
+  msf_stream_write_u32(msf, sn, strtab->bucket_count - 1); // 1 for null
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+internal void
+pdb_strtab_release(PDB_StringTable *strtab)
+{
+
ProfBeginFunction(); + arena_release(strtab->arena); + MemoryZeroStruct(strtab); + ProfEnd(); +} + +internal U32 +pdb_strtab_get_serialized_size(PDB_StringTable *strtab) +{ + U32 result = 0; + result += sizeof(PDB_StringTableHeader); + result += sizeof(U32); // strtab size + result += strtab->size; + result += sizeof(U32); // bucket count + result += sizeof(U32) * strtab->bucket_max; + result += sizeof(U32); // string count + return result; +} + +internal U32 +pdb_strtab_hash(PDB_StringTable *strtab, String8 string) +{ + U32 hash = 0; + switch (strtab->version) { + case 1: hash = pdb_hash_v1(string); break; + default: NotImplemented; break; + } + U32 ibucket = hash % strtab->bucket_max; + return ibucket; +} + +internal B32 +pdb_strtab_add_(PDB_StringTable *strtab, U64 hash, PDB_StringTableBucket *bucket) +{ + U64 best_bucket_idx = hash; + U64 bucket_idx = best_bucket_idx; + do { + if (strtab->bucket_array[bucket_idx] == 0) { + strtab->ibucket_array[bucket->istr] = bucket_idx; + strtab->bucket_array[bucket_idx] = bucket; + strtab->size += bucket->data.size + /* null: */ 1; + return 1; + } + bucket_idx = (bucket_idx + 1) % strtab->bucket_max; + } while (best_bucket_idx != bucket_idx); + return 0; +} + +internal void +pdb_strtab_add_cv_string_hash_table(PDB_StringTable *strtab, CV_StringHashTable string_ht) +{ + ProfBeginFunction(); + + // reserve enough slots for new strings + pdb_strtab_grow(strtab, string_ht.total_insert_count); + + // upfront push buckets + PDB_StringTableBucket *buckets = push_array_no_zero(strtab->arena, PDB_StringTableBucket, string_ht.total_insert_count); + + U64 base_offset = strtab->size; + + // proceed to fill out buckets & add them to the string table + for (U64 bucket_idx = 0, string_idx = 0; bucket_idx < string_ht.bucket_cap; ++bucket_idx) { + if (string_ht.buckets[bucket_idx] != 0) { + PDB_StringTableBucket *dst = &buckets[string_idx++]; + dst->data = string_ht.buckets[bucket_idx]->string; + dst->offset = base_offset + 
string_ht.buckets[bucket_idx]->u.offset; + dst->istr = strtab->bucket_count++; + + // TODO: precompute hashes in parallel + U64 hash = pdb_strtab_hash(strtab, dst->data); + B32 was_added = pdb_strtab_add_(strtab, hash, dst); + Assert(was_added); + } + } + + ProfEnd(); +} + +internal B32 +pdb_strtab_try_add(PDB_StringTable *strtab, String8 string, PDB_StringIndex *index_out) +{ + PDB_StringTableBucket *bucket = push_array(strtab->arena, PDB_StringTableBucket, 1); + bucket->data = push_str8_copy(strtab->arena, string); + bucket->offset = strtab->size; + bucket->istr = (PDB_StringIndex)strtab->bucket_count++; + + U32 hash = pdb_strtab_hash(strtab, string); + B32 was_added = pdb_strtab_add_(strtab, hash, bucket); + + *index_out = bucket->istr; + + return was_added; +} + +internal void +pdb_strtab_grow(PDB_StringTable *strtab, U64 new_max) +{ + ProfBeginFunction(); + + PDB_StringTable new_strtab; + pdb_strtab_alloc(&new_strtab, new_max); + + // start with 1 because null bucket is already added during string table alloc + for (PDB_StringIndex istr = 1; istr < strtab->bucket_max; ++istr) { + U32 ibucket = strtab->ibucket_array[istr]; + + B32 is_bucket_null = ibucket >= strtab->bucket_max; + if (is_bucket_null) { + continue; + } + + PDB_StringTableBucket *bucket = strtab->bucket_array[ibucket]; + + PDB_StringIndex new_istr; + B32 is_bucket_pushed = pdb_strtab_try_add(&new_strtab, bucket->data, &new_istr); + Assert(is_bucket_pushed); + Assert(new_istr == istr); + + U32 new_ibucket = new_strtab.ibucket_array[new_istr]; + PDB_StringTableBucket *new_bucket = new_strtab.bucket_array[new_ibucket]; + Assert(new_bucket->offset == bucket->offset); + } + + *strtab = new_strtab; + + ProfEnd(); +} + +internal PDB_StringIndex +pdb_strtab_add(PDB_StringTable *strtab, String8 string) +{ + PDB_StringIndex index = 0; + B32 is_pushed = pdb_strtab_try_add(strtab, string, &index); + if (!is_pushed) { + // increase number of slots in the hash table + pdb_strtab_grow(strtab, strtab->bucket_max 
* 2); + + // now we have enough slots for the new string + is_pushed = pdb_strtab_try_add(strtab, string, &index); + AssertAlways(is_pushed); + } + return index; +} + +internal B32 +pdb_strtab_search(PDB_StringTable *strtab, String8 string, PDB_StringIndex *index_out) +{ + B32 is_found = 0; + U32 best_ibucket = pdb_strtab_hash(strtab, string); + U32 ibucket = best_ibucket; + do { + PDB_StringTableBucket *bucket = strtab->bucket_array[ibucket]; + if (bucket == NULL) { + break; + } + + if (str8_match(bucket->data, string, 0)) { + *index_out = bucket->istr; + is_found = 1; + break; + } + + ibucket = (ibucket + 1) % strtab->bucket_max; + } while (ibucket != best_ibucket); + return is_found; +} + +internal String8 +pdb_strtab_string_from_offset(PDB_StringTable *strtab, PDB_StringOffset offset) +{ + String8 string = str8(0,0); + for (U32 ibucket = 0; ibucket < strtab->bucket_max; ++ibucket) { + PDB_StringTableBucket *bucket = strtab->bucket_array[ibucket]; + if (bucket) { + if (bucket->offset == offset) { + string = bucket->data; + break; + } + } + } + return string; +} + +internal PDB_StringOffset +pdb_strtab_string_to_offset(PDB_StringTable *strtab, PDB_StringIndex stridx) +{ + Assert(stridx < strtab->bucket_max); + U32 ibucket = strtab->ibucket_array[stridx]; + PDB_StringOffset offset = strtab->bucket_array[ibucket]->offset; + return offset; +} + +internal String8 +pdb_string_from_string_table_open_error(PDB_StringTableOpenError err) +{ + String8 result = str8(0,0); + switch (err) { + case PDB_StringTableOpenError_OK: break; + case PDB_StringTableOpenError_BAD_MAGIC: result = str8_lit("BAD_MAGIC"); break; + case PDB_StringTableOpenError_UNKNOWN_VERSION: result = str8_lit("UNKNOWN_VERSION"); break; + case PDB_StringTableOpenError_CORRUPTED: result = str8_lit("CORRUPTED"); break; + case PDB_StringTableOpenError_OFFSETS_EXCEED_BUCKET_COUNT: result = str8_lit("OFFSETS_EXCEED_BUCKET_COUNT"); break; + case PDB_StringTableOpenError_STRING_OFFSET_OUT_OF_BOUNDS: result = 
str8_lit("STRING_OFFSET_OUT_OF_BOUNDS"); break; + } + return result; +} + +//////////////////////////////// + +internal PDB_OpenTypeServerError +pdb_type_server_parse_from_data_v80(String8 data, PDB_TypeServerParse *parse) +{ + ProfBeginFunction(); + + PDB_OpenTypeServerError error = PDB_OpenTypeServerError_UNKNOWN; + + PDB_TpiHeader header; MemoryZeroStruct(&header); + str8_deserial_read_struct(data, 0, &header); + Assert(header.version == PDB_TpiVersion_IMPV80); + + if (header.ti_lo >= CV_MinComplexTypeIndex && + header.ti_lo <= header.ti_hi) { + if (header.hash_bucket_count > 0 && + header.hash_bucket_count <= PDB_TYPE_SERVER_HASH_BUCKET_COUNT_MAX) { + parse->ti_range = rng_1u64(header.ti_lo, header.ti_hi); + parse->leaf_data = str8_substr(data, rng_1u64(sizeof(PDB_TpiHeader), sizeof(PDB_TpiHeader) + header.leaf_data_size )); + error = PDB_OpenTypeServerError_OK; + } else { + error = PDB_OpenTypeServerError_INVALID_BUCKET_COUNT; + } + } else { + error = PDB_OpenTypeServerError_INVALID_TI_RANGE; + } + + ProfEnd(); + return error; +} + +internal PDB_OpenTypeServerError +pdb_type_server_parse_from_data(String8 data, PDB_TypeServerParse *parse_out) +{ + PDB_OpenTypeServerError error = PDB_OpenTypeServerError_UNKNOWN; + + PDB_TpiVersion version = 0; + str8_deserial_read_struct(data, 0, &version); + + switch (version) { + case PDB_TpiVersion_IMPV80: + error = pdb_type_server_parse_from_data_v80(data, parse_out); + break; + case PDB_TpiVersion_INTV_VC2: + case PDB_TpiVersion_IMPV40: + case PDB_TpiVersion_IMPV50_INTERIM: + case PDB_TpiVersion_IMPV70: + error = PDB_OpenTypeServerError_UNSUPPORTED_VERSION; + break; + default: Assert(!"unknown TPI version"); break; + } + + return error; +} + +internal PDB_TypeServer * +pdb_type_server_alloc(U64 bucket_cap) +{ + ProfBeginFunction(); + AssertAlways(0x1000 <= bucket_cap && bucket_cap <= 0x40000); + + Arena *arena = arena_alloc(); + PDB_TypeServer *ts = push_array(arena, PDB_TypeServer, 1); + ts->arena = arena; + ts->hash_sn = 
MSF_INVALID_STREAM_NUMBER;
+  ts->ti_lo      = CV_MinComplexTypeIndex;
+  ts->bucket_cap = bucket_cap;
+  ts->buckets    = push_array(arena, PDB_TypeBucket *, ts->bucket_cap);
+  pdb_hash_table_alloc(&ts->hash_adj, 32);
+
+  ProfEnd();
+  return ts;
+}
+
+// Opens a version-8.0 type stream `sn` and rebuilds an in-memory type server from it.
+//
+// Reads the TPI header, the leaf data, the per-type bucket indices stored in the header's
+// hash stream, and the hash-adjust table (whose keys are resolved through `strtab`), then
+// moves every adjusted type to the head of its bucket chain.
+//
+// Returns the new type server, or NULL when the header fails validation or a stored
+// bucket index does not fit the bucket array.
+internal PDB_TypeServer *
+pdb_type_server_open_v80(MSF_Context *msf, MSF_StreamNumber sn, PDB_StringTable *strtab)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0, 0);
+
+  PDB_TypeServer *ts = NULL;
+
+  PDB_TpiHeader header = {0};
+  msf_stream_seek(msf, sn, 0);
+  MSF_UInt read_header_size = msf_stream_read_struct(msf, sn, &header);
+
+  // have we read enough bytes?
+  if (read_header_size != sizeof(PDB_TpiHeader)) {
+    goto exit;
+  }
+  // is lowest non-simple type index valid?
+  if (header.ti_lo < CV_MinComplexTypeIndex) {
+    goto exit;
+  }
+  // is high non-simple type index valid?
+  if (header.ti_lo > header.ti_hi) {
+    goto exit;
+  }
+
+  // validate hash bucket count
+  if (header.hash_bucket_count == 0) {
+    goto exit;
+  }
+  if (header.hash_bucket_count > PDB_TYPE_SERVER_HASH_BUCKET_COUNT_MAX) {
+    goto exit;
+  }
+
+  // validate hash key size: each key is copied into a U64 below, and a zero size would
+  // never advance the cursor of the bucket rebuild loop
+  if (header.hash_key_size == 0 || header.hash_key_size > sizeof(U64)) {
+    goto exit;
+  }
+
+  // are there enough bytes in the stream to read hash values?
+  U64 hash_stream_size = msf_stream_get_size(msf, header.hash_sn);
+  if (header.hash_vals.off + header.hash_vals.size > hash_stream_size) {
+    goto exit;
+  }
+
+  ts = pdb_type_server_alloc(header.hash_bucket_count);
+
+  // read & parse code view types
+  String8   types_data = msf_stream_read_block(ts->arena, msf, sn, header.leaf_data_size);
+  CV_DebugT debug_t    = cv_debug_t_from_data(scratch.arena, types_data, PDB_LEAF_ALIGN);
+
+  // read hash data
+  U8 *hash_buffer = push_array(scratch.arena, U8, header.hash_vals.size);
+  msf_stream_seek(msf, header.hash_sn, header.hash_vals.off);
+  MSF_UInt hash_buffer_size = msf_stream_read(msf, header.hash_sn, hash_buffer, header.hash_vals.size);
+  Assert(hash_buffer_size == header.hash_vals.size);
+
+  // rebuild type buckets
+  B32 has_bad_bucket_hash = 0;
+  for (U64 cursor = 0, leaf_idx = 0;
+       cursor + header.hash_key_size <= hash_buffer_size;
+       cursor += header.hash_key_size, leaf_idx += 1) {
+    // read out bucket hash
+    U64 hash = 0;
+    MemoryCopy(&hash, hash_buffer + cursor, header.hash_key_size);
+
+    // the stored value is used directly as an index into ts->buckets
+    if (hash >= ts->bucket_cap) {
+      has_bad_bucket_hash = 1;
+      break;
+    }
+
+    String8 raw_leaf = cv_debug_t_get_raw_leaf(debug_t, leaf_idx);
+
+    str8_list_push(ts->arena, &ts->leaf_list, raw_leaf);
+
+    // push bucket
+    PDB_TypeBucket *bucket = push_array(ts->arena, PDB_TypeBucket, 1);
+    bucket->raw_leaf   = raw_leaf;
+    bucket->type_index = header.ti_lo + leaf_idx;
+    SLLStackPush(ts->buckets[hash], bucket);
+  }
+
+  if (has_bad_bucket_hash) {
+    // the server struct lives inside its own arena (see pdb_type_server_alloc), so fetch
+    // the arena before tearing everything down
+    Arena *ts_arena = ts->arena;
+    pdb_hash_table_release(&ts->hash_adj);
+    arena_release(ts_arena);
+    ts = NULL;
+    goto exit;
+  }
+
+  // adjust type buckets
+  //
+  // The unpacked values are produced by str8_deserial_read_block, which takes no arena,
+  // so they presumably point into this buffer; the table stays in ts->hash_adj after this
+  // function returns, so the bytes are read into the server arena instead of scratch.
+  msf_stream_seek(msf, header.hash_sn, header.hash_adj.off);
+  String8 adjust_data = msf_stream_read_block(ts->arena, msf, header.hash_sn, header.hash_adj.size);
+
+  // open adjust hash table
+  //
+  // pdb_type_server_alloc already gave ts->hash_adj an arena of its own; parse into a
+  // temporary and release the old table before replacing it so that arena is not leaked
+  PDB_HashTable parsed_hash_adj = {0};
+  PDB_HashTableParseError hash_adj_parse_error = pdb_hash_adj_hash_table_from_data(&parsed_hash_adj, adjust_data, strtab, 0);
+  if (hash_adj_parse_error == PDB_HashTableParseError_OK) {
+    pdb_hash_table_release(&ts->hash_adj);
+    ts->hash_adj = parsed_hash_adj;
+  } else if (hash_adj_parse_error == PDB_HashTableParseError_OUT_OF_BYTES) {
+    pdb_hash_table_release(&ts->hash_adj);
+    pdb_hash_table_alloc(&ts->hash_adj, 16);
+  } else {
+    Assert(hash_adj_parse_error == PDB_HashTableParseError_OK);
+  }
+
+  // grab keys and values
+  String8Array key_arr = {0};
+  String8Array value_arr = {0};
+  pdb_hash_table_get_present_keys_and_values(scratch.arena, &ts->hash_adj, &key_arr, &value_arr);
+
+  // adjust type buckets
+  for (U64 i = 0; i < ts->hash_adj.count; i += 1) {
+    String8      type_name  = key_arr.v[i];
+    CV_TypeIndex type_index = *(CV_TypeIndex*)value_arr.v[i].str;
+
+    // name -> hash
+    U64 hash = pdb_hash_v1(type_name);
+    hash %= ts->bucket_cap;
+
+    // search for type bucket
+    PDB_TypeBucket *curr, *prev;
+    for (curr = ts->buckets[hash], prev = 0; curr != 0; prev = curr, curr = curr->next) {
+      if (curr->type_index == type_index) {
+        break;
+      }
+    }
+
+    // move type to the head
+    if (prev && curr) {
+      prev->next = curr->next;
+      curr->next = ts->buckets[hash];
+      ts->buckets[hash] = curr;
+    }
+
+    Assert(curr);
+  }
+
+  exit:;
+  scratch_end(scratch);
+  ProfEnd();
+  return ts;
+}
+
+internal PDB_TypeServer *
+pdb_type_server_open(MSF_Context *msf, MSF_StreamNumber sn, PDB_StringTable *strtab)
+{
+  ProfBeginFunction();
+
+  PDB_TypeServer *ts = NULL;
+
+  PDB_TpiVersion version = 0;
+  msf_stream_seek(msf, sn, 0);
+  msf_stream_read_struct(msf, sn, &version);
+
+  switch (version) {
+  case PDB_TpiVersion_IMPV80: {
+    ts = pdb_type_server_open_v80(msf, sn, strtab);
+  } break;
+  case PDB_TpiVersion_INTV_VC2:
+  case PDB_TpiVersion_IMPV40:
+  case PDB_TpiVersion_IMPV50_INTERIM:
+  case PDB_TpiVersion_IMPV70: {
+    NotImplemented;
+  } break;
+  default: Assert(!"unknown TPI version"); break;
+  }
+
+  ProfEnd();
+  return ts;
+}
+
+internal
+THREAD_POOL_TASK_FUNC(pdb_write_type_to_bucket_map_32_task)
+{
+  PDB_WriteTypeToBucketMap *task = raw_task;
+
+  U64 bucket_idx   = task_id;
+  U32 bucket_idx32 = safe_cast_u32(bucket_idx);
+
+  PDB_TypeServer *ts   = task->ts;
+  PDB_TypeBucket *head = ts->buckets[bucket_idx];
+  for (PDB_TypeBucket *bucket = head; bucket != 0; bucket = bucket->next) {
+    Assert(bucket->type_index >= ts->ti_lo);
+    Assert(bucket->type_index - ts->ti_lo < ts->leaf_list.node_count);
+    CV_TypeIndex type_idx = bucket->type_index - ts->ti_lo;
+    Assert(task->map[type_idx] ==
0); + task->map[type_idx] = bucket_idx32; + } +} + +internal PDB_TypeHashStreamInfo +pdb_type_hash_stream_build(TP_Context *tp, + PDB_TypeServer *ts, + PDB_StringTable *strtab, + MSF_Context *msf, + PDB_TpiOffHint *hint_arr, + U64 hint_count) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + // write (type index -> bucket index) map + // + // zero-out entire map so non-UDTs type indices, that are NOT in the hash table, + // map to zero offset + U32 *type_to_bucket_map = push_array(scratch.arena, U32, ts->leaf_list.node_count); + { + ProfBegin("Bucket Map"); + PDB_WriteTypeToBucketMap type_to_bucket_task; + type_to_bucket_task.ts = ts; + type_to_bucket_task.map = type_to_bucket_map; + tp_for_parallel(tp, 0, ts->bucket_cap, pdb_write_type_to_bucket_map_32_task, &type_to_bucket_task); + ProfEnd(); + } + + // write bucket adjust info + String8 hash_adj_data = pdb_data_from_hash_adj_hash_table(scratch.arena, &ts->hash_adj, strtab); + + ProfBegin("MSF Write"); + + // write data to stream + if (ts->hash_sn == MSF_INVALID_STREAM_NUMBER) { + ts->hash_sn = msf_stream_alloc(msf); + } + msf_stream_seek_start(msf, ts->hash_sn); + + PDB_OffsetSize hash_vals; + hash_vals.off = msf_stream_get_pos(msf, ts->hash_sn); + hash_vals.size = sizeof(type_to_bucket_map[0]) * ts->leaf_list.node_count; + msf_stream_write(msf, ts->hash_sn, &type_to_bucket_map[0], hash_vals.size); + + PDB_OffsetSize hint_offs; + hint_offs.off = msf_stream_get_pos(msf, ts->hash_sn); + hint_offs.size = sizeof(hint_arr[0]) * hint_count; + msf_stream_write(msf, ts->hash_sn, &hint_arr[0], hint_offs.size); + + PDB_OffsetSize hash_adj; + hash_adj.off = msf_stream_get_pos(msf, ts->hash_sn); + hash_adj.size = hash_adj_data.size; + msf_stream_write_string(msf, ts->hash_sn, hash_adj_data); + + ProfEnd(); + + // fill out result + PDB_TypeHashStreamInfo result; + result.hash_vals = hash_vals; + result.ti_offs = hint_offs; + result.hash_adj = hash_adj; + + scratch_end(scratch); + ProfEnd(); + return result; 
+}
+
+// Thread-pool task: copies one worker's run of type leaves into the shared
+// leaf buffer and records a type-offset hint for every PDB_TYPE_HINT_STEP-th
+// leaf index in that run.
+//
+// raw_task is a PDB_WriteTypesTask. task_id selects this worker's first list
+// node (lf_arr), its leaf index range (lf_range_arr) and its starting byte
+// offset in lf_buf (lf_cursor_arr).
+internal
+THREAD_POOL_TASK_FUNC(pdb_write_types_task)
+{
+  ProfBeginFunction();
+
+  PDB_WriteTypesTask *task = raw_task;
+
+  String8Node *node   = task->lf_arr[task_id];
+  Rng1U64      range  = task->lf_range_arr[task_id];
+  U64          cursor = task->lf_cursor_arr[task_id];
+
+  for (U64 lf_idx = range.min; lf_idx < range.max; node = node->next, lf_idx += 1) {
+    if (lf_idx % PDB_TYPE_HINT_STEP == 0) {
+      U64 off_idx = lf_idx / PDB_TYPE_HINT_STEP;
+      Assert(off_idx < task->hint_count);
+      Assert(cursor < PDB_TYPE_OFFSET_MAX);
+      task->hint_arr[off_idx].itype = task->ti_lo + lf_idx;
+      task->hint_arr[off_idx].off   = (PDB_TypeOffset)cursor;
+    }
+
+    // pdb_type_server_build sizes the buffer and computes every worker's
+    // starting cursor with each leaf rounded up to the leaf alignment, so the
+    // cursor must advance by the same padded size to stay in step with it.
+    U64 leaf_size        = node->string.size;
+    U64 padded_leaf_size = AlignPow2(leaf_size, task->align);
+    Assert(cursor + padded_leaf_size <= task->lf_buf_size);
+
+    // copy leaf data
+    MemoryCopy(task->lf_buf + cursor, node->string.str, leaf_size);
+
+    // the buffer is pushed without zeroing; clear any pad so no
+    // uninitialized bytes end up in the stream
+    if (padded_leaf_size > leaf_size) {
+      MemorySet(task->lf_buf + cursor + leaf_size, 0, padded_leaf_size - leaf_size);
+    }
+
+    cursor += padded_leaf_size;
+  }
+
+  ProfEnd();
+}
+
+internal void
+pdb_type_server_build(TP_Context *tp, PDB_TypeServer *ts, PDB_StringTable *strtab, MSF_Context *msf, MSF_StreamNumber sn)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(0,0);
+
+  ProfBeginDynamic("Prepare Buffers [Leaf Count: %llu]", ts->leaf_list.node_count);
+
+  U64 hint_count = CeilIntegerDiv(ts->leaf_list.node_count, PDB_TYPE_HINT_STEP);
+  PDB_TpiOffHint *hint_arr = push_array_no_zero(scratch.arena, PDB_TpiOffHint, hint_count);
+  String8Node **lf_arr = push_array_no_zero(scratch.arena, String8Node *, tp->worker_count);
+  U64 *lf_cursor_arr = push_array_no_zero(scratch.arena, U64, tp->worker_count);
+  Rng1U64 *lf_range_arr = tp_divide_work(scratch.arena, ts->leaf_list.node_count, tp->worker_count);
+
+  U64 lf_buf_size = 0;
+  U64 lf_node_idx = 0;
+  U64 lf_arr_idx = 0;
+  for (String8Node *lf = ts->leaf_list.first; lf != 0; lf = lf->next) {
+    if (lf_node_idx == lf_range_arr[lf_arr_idx].min) { // :thread_pool_dummy_range
+      lf_cursor_arr[lf_arr_idx] = lf_buf_size;
+      lf_arr[lf_arr_idx] = lf;
+      lf_arr_idx += 1;
+    }
+    lf_buf_size += AlignPow2(lf->string.size, PDB_LEAF_ALIGN);
+    lf_node_idx += 1;
+  }
+
+  ProfEnd();
+
+  ProfBegin("Write Type Data & Hints");
+
+
PDB_WriteTypesTask write_types_task; + write_types_task.align = PDB_LEAF_ALIGN; + write_types_task.ti_lo = ts->ti_lo; + write_types_task.ti_hi = ts->ti_lo + ts->leaf_list.node_count; + write_types_task.hint_count = hint_count; + write_types_task.hint_arr = hint_arr; + write_types_task.lf_arr = lf_arr; + write_types_task.lf_range_arr = lf_range_arr; + write_types_task.lf_cursor_arr = lf_cursor_arr; + write_types_task.lf_buf = push_array_no_zero(scratch.arena, U8, lf_buf_size); + write_types_task.lf_buf_size = lf_buf_size; + tp_for_parallel(tp, 0, tp->worker_count, pdb_write_types_task, &write_types_task); + + ProfEnd(); + + // build type lookup accelerator + PDB_TypeHashStreamInfo hash_stream_info = pdb_type_hash_stream_build(tp, ts, strtab, msf, hint_arr, hint_count); + + // fill out header + PDB_TpiHeader header; + header.version = PDB_TpiVersion_IMPV80; + header.header_size = sizeof(header); + header.ti_lo = ts->ti_lo; + header.ti_hi = ts->ti_lo + ts->leaf_list.node_count; + header.leaf_data_size = safe_cast_u32(lf_buf_size); + header.hash_sn = ts->hash_sn; + header.hash_sn_aux = MSF_INVALID_STREAM_NUMBER; + header.hash_key_size = sizeof(U32); + header.hash_bucket_count = ts->bucket_cap; + header.hash_vals = hash_stream_info.hash_vals; + header.itype_offs = hash_stream_info.ti_offs; + header.hash_adj = hash_stream_info.hash_adj; + + // write type server to stream + ProfBegin("MSF Commit"); + msf_stream_seek_start(msf, sn); + msf_stream_write_struct(msf, sn, &header); + msf_stream_write_parallel(tp, msf, sn, write_types_task.lf_buf, lf_buf_size); + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); +} + +internal void +pdb_type_server_release(PDB_TypeServer **ts_ptr) +{ + ProfBeginFunction(); + arena_release((*ts_ptr)->arena); + *ts_ptr = 0; + ProfEnd(); +} + +internal String8Node * +pdb_type_server_make_leaf(PDB_TypeServer *ts, CV_LeafKind kind, String8 data) +{ + ProfBeginFunction(); + + String8 leaf = cv_serialize_leaf_ex(ts->arena, kind, data, PDB_LEAF_ALIGN); 
+ String8Node *node = str8_list_push(ts->arena, &ts->leaf_list, leaf); + + ProfEnd(); + return node; +} + +internal U32 +pdb_type_server_hash(String8 data) +{ + U32 hash = pdb_hash_v1(data); + return hash; +} + +internal PDB_TypeBucket * +pdb_type_server_push_udt_arr(PDB_TypeServer *ts, U64 count, U32 *hash_arr, String8 *raw_leaf_arr) +{ + // check if type server already contains this leaf and if so move + // it to the head of bucket list. +#if 0 + B32 is_udt = pdb_is_udt(kind); + if (is_udt) { + PDB_UDTInfo udt_info = pdb_get_udt_info(kind, data); + U32 udt_hash = pdb_hash_udt(udt_info, data) % ts->bucket_count; + U64 match_count = 0; + for (PDB_TypeBucket *curr = ts->bucket_table[udt_hash], *prev = NULL; + curr != NULL; + prev = curr, curr = curr->next) { + if (curr->leaf->kind == kind) { + PDB_UDTInfo this_udt_info = pdb_get_udt_info(curr->leaf->kind, curr->leaf->data); + if (str8_match(udt_info.name, this_udt_info.name)) { + B32 is_data_match = curr->leaf->data.size == data.size && + MemoryCompare(curr->leaf->data.str, data.str, data.size) == 0; + if (is_data_match) { + B32 is_not_head = (match_count > 0); + if (is_not_head) { + // move bucket to head + prev->next = curr->next; + curr->next = ts->bucket_table[udt_hash]; + ts->bucket_table[udt_hash] = curr; + + // update hash adjust + pdb_hash_table_delete(&ts->hash_adj, udt_info.name); + pdb_hash_table_set(&ts->hash_adj, udt_info.name, str8((U8*)&curr->leaf->type_index, sizeof(curr->leaf->type_index))); + } + + return curr->leaf; + } + match_count += 1; + } + } + } + } +#endif + + PDB_TypeBucket *bucket_arr = push_array_no_zero(ts->arena, PDB_TypeBucket, count); + + for (U64 leaf_idx = 0; leaf_idx < count; leaf_idx += 1) { + U32 hash = hash_arr[leaf_idx]; + String8 raw_leaf = raw_leaf_arr[leaf_idx]; + + CV_Leaf leaf = cv_leaf_from_string(raw_leaf); + + // make sure we push a complete UDT + Assert(cv_is_udt(leaf.kind)); + Assert(!(cv_get_udt_info(leaf.kind, leaf.data).props & CV_TypeProp_FwdRef)); + + 
PDB_TypeBucket *bucket = &bucket_arr[leaf_idx]; + bucket->next = 0; + bucket->raw_leaf = raw_leaf; + bucket->type_index = ts->ti_lo + ts->leaf_list.node_count + leaf_idx; + + U32 bucket_idx = hash % ts->bucket_cap; + SLLStackPush(ts->buckets[bucket_idx], bucket); + } + + return bucket_arr; +} + +internal PDB_TypeBucket * +pdb_type_server_push_udt(PDB_TypeServer *ts, U32 hash, String8 raw_leaf) +{ + return pdb_type_server_push_udt_arr(ts, 1, &hash, &raw_leaf); +} + +internal void +pdb_type_server_push(PDB_TypeServer *ts, String8 raw_leaf) +{ + ProfBeginFunction(); + + CV_Leaf leaf; + cv_deserial_leaf(raw_leaf, 0, 1, &leaf); + + if (cv_is_udt(leaf.kind)) { + CV_UDTInfo udt_info = cv_get_udt_info(leaf.kind, leaf.data); + B32 is_complete = !(udt_info.props & CV_TypeProp_FwdRef); + if (is_complete) { + U32 hash = pdb_hash_udt(udt_info, leaf.data); + pdb_type_server_push_udt(ts, hash, raw_leaf); + } + } + + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(pdb_count_udt_task) +{ + PDB_PushLeafTask *task = raw_task; + Rng1U64 range = task->ranges[task_id]; + for (U64 leaf_idx = range.min; leaf_idx < range.max; ++leaf_idx) { + CV_Leaf leaf = cv_debug_t_get_leaf(task->debug_t, leaf_idx); + if (cv_is_udt(leaf.kind)) { + CV_UDTInfo udt_info = cv_get_udt_info(leaf.kind, leaf.data); + if (~udt_info.props & CV_TypeProp_FwdRef) { + ++task->udt_counts[task_id]; + } + } + } +} + +internal +THREAD_POOL_TASK_FUNC(pdb_push_udt_leaf_task) +{ + PDB_PushLeafTask *task = raw_task; + PDB_TypeServer *type_server = task->type_server; + Rng1U64 range = task->ranges[task_id]; + U64 bucket_cursor = task->udt_offsets[task_id]; + CV_DebugT debug_t = task->debug_t; + PDB_TypeBucket *new_buckets = task->udt_buckets; + + U64 type_ht_cap = type_server->bucket_cap; + PDB_TypeBucket **type_ht_buckets = type_server->buckets; + U64 base_type_index = type_server->ti_lo + type_server->leaf_list.node_count; + + for (U64 leaf_idx = range.min; leaf_idx < range.max; ++leaf_idx) { + CV_Leaf leaf = 
cv_debug_t_get_leaf(debug_t, leaf_idx); + if (cv_is_udt(leaf.kind)) { + CV_UDTInfo udt_info = cv_get_udt_info(leaf.kind, leaf.data); + if (~udt_info.props & CV_TypeProp_FwdRef) { + // hash udt and compute bucket index + U32 hash = pdb_hash_udt(udt_info, leaf.data); + U32 bucket_idx = hash % type_ht_cap; + + // fill out & insert bucket + PDB_TypeBucket *bucket = &new_buckets[bucket_cursor++]; + bucket->raw_leaf = cv_debug_t_get_raw_leaf(debug_t, leaf_idx); + bucket->type_index = base_type_index + leaf_idx; + bucket->next = ins_atomic_ptr_eval_assign(&type_ht_buckets[bucket_idx], bucket); + } + } + } +} + +internal void +pdb_type_server_push_parallel(TP_Context *tp, PDB_TypeServer *type_server, CV_DebugT debug_t) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0, 0); + + PDB_PushLeafTask task = {0}; + task.debug_t = debug_t; + task.type_server = type_server; + task.ranges = tp_divide_work(scratch.arena, debug_t.count, tp->worker_count); + + ProfBegin("Count UDT"); + task.udt_counts = push_array(scratch.arena, U64, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, pdb_count_udt_task, &task); + ProfEnd(); + + ProfBegin("Push UDT Leaves"); + U64 total_udt_count = sum_array_u64(tp->worker_count, task.udt_counts); + task.udt_offsets = offsets_from_counts_array_u64(scratch.arena, task.udt_counts, tp->worker_count); + task.udt_buckets = push_array_no_zero(type_server->arena, PDB_TypeBucket, total_udt_count); + tp_for_parallel(tp, 0, tp->worker_count, pdb_push_udt_leaf_task, &task); + ProfEnd(); + + ProfBegin("Append New Leaves"); + String8List new_leaves = cv_str8_list_from_debug_t_parallel(tp, type_server->arena, debug_t); + str8_list_concat_in_place(&type_server->leaf_list, &new_leaves); + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); +} + +#if 0 +internal CV_LeafNode * +pdb_type_server_leaf_from_string(PDB_TypeServer *ts, String8 string) +{ + ProfBeginFunction(); + U32 hash = pdb_hash_v1(string); + U32 bucket_idx = hash % ts->bucket_count; + 
PDB_TypeBucket *head_bucket = ts->bucket_table[bucket_idx]; + CV_LeafNode *result = 0; + for (PDB_TypeBucket *i = head_bucket; i != 0; i = i->next) { + CV_LeafNode *leaf = i->leaf_node; + String8 leaf_name = cv_get_leaf_name(leaf->data.kind, leaf->data.data); + if (str8_match(leaf_name, string, 0)) { + result = leaf; + break; + } + } + ProfEnd(); + return result; +} +#endif + +//////////////////////////////// + +#if 0 +internal PDB_TypeIndexMap * +pdb_load_types_from_leaf_list(PDB_TypeServer **type_server_arr, CV_LeafList leaf_list) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0, 0); + + // 1. redistribute leaves in parallel + CV_LeafList leaf_list_arr[CV_TypeIndexSource_COUNT] = {0}; + for (CV_LeafNode *curr = leaf_list.first, *next = 0; curr != 0; curr = next) { + next = curr->next; + curr->next = 0; + CV_TypeIndexSource ti_source = cv_type_index_source_from_leaf_kind(curr->data.kind); + CV_LeafList *list = &leaf_list_arr[ti_source]; + SLLQueuePush(list->first, list->last, curr); + list->count += 1; + } + + // 2. reserve type leafs on main thread + PDB_TypeLeaf *leaf_arr_arr[CV_TypeIndexSource_COUNT]; + for (U64 source_idx = 0; source_idx < ArrayCount(leaf_list_arr); source_idx += 1) { + PDB_TypeServer *type_server = type_server_arr[source_idx]; + CV_LeafList input_leaf_list = leaf_list_arr[source_idx]; + PDB_TypeLeaf *leaf_arr = pdb_type_server_reserve(type_server, input_leaf_list.count); + leaf_arr_arr[source_idx] = leaf_arr; + } + + // 3. 
populate type index map in parallel + PDB_TypeIndexMap *ti_map = pdb_type_index_map_alloc(); + for (U64 source_idx = 0; source_idx < ArrayCount(leaf_list_arr); source_idx += 1) { + CV_LeafList input_leaf_list = leaf_list_arr[source_idx]; + PDB_TypeLeaf *leaf_arr = leaf_arr_arr[source_idx]; + for (U64 leaf_idx = 0; leaf_idx < input_leaf_list.count; leaf_idx += 1) { + CV_TypeIndex external_ti = ti_map->min_itype[source_idx] + leaf_idx; + CV_TypeIndex internal_ti = leaf_arr[leaf_idx].type_index; + pdb_type_index_map_add(ti_map, (CV_TypeIndexSource)source_idx, external_ti, internal_ti); + } + } + + // 4. patch type indices in parallel + for (U64 source_idx = 0; source_idx < ArrayCount(leaf_list_arr); source_idx += 1) { + CV_LeafList list = leaf_list_arr[source_idx]; + for (CV_LeafNode *node = list.first; node != 0; node = node->next) { + Temp temp = temp_begin(scratch.arena); + + // get offsets for type indices in data blob + CV_Leaf *leaf = &node->data; + CV_TypeIndexInfoList ti_info_list = cv_get_leaf_type_index_offsets(temp.arena, leaf->kind, leaf->data); + + for (CV_TypeIndexInfo *ti_info = ti_info_list.first; ti_info != 0; ti_info = ti_info->next) { + Assert(ti_info->offset + sizeof(CV_TypeIndex) <= leaf->data.size); + CV_TypeIndex *ti_ptr = (CV_TypeIndex *)(leaf->data.str + ti_info->offset); + CV_TypeIndex external_ti = *ti_ptr; + + B32 is_complex_type = external_ti >= ti_map->min_itype[ti_info->source]; + if (is_complex_type) { + // search external type index + CV_TypeIndex internal_tpi_idx = pdb_type_index_map_search(ti_map, CV_TypeIndexSource_TPI, external_ti); + CV_TypeIndex internal_ipi_idx = pdb_type_index_map_search(ti_map, CV_TypeIndexSource_IPI, external_ti); + + // error checks + if (internal_tpi_idx == 0 && internal_ipi_idx == 0) { + lnk_invalid_path("unable to find match for external type index 0x%X", external_ti); + continue; + } + if (internal_tpi_idx != 0 && internal_ipi_idx != 0) { + lnk_invalid_path("both TPI and IPI matched for external type 
index 0x%X", external_ti); + continue; + } + + // rewrite index + CV_TypeIndex internal_ti = internal_tpi_idx ? internal_tpi_idx : internal_ipi_idx; + *ti_ptr = internal_ti; + } + } + + temp_end(temp); + } + } + + // 5. push types to hash table on main thread + for (U64 source_idx = 0; source_idx < ArrayCount(leaf_list_arr); source_idx += 1) { + PDB_TypeServer *type_server = type_server_arr[source_idx]; + CV_LeafList list = leaf_list_arr[source_idx]; + PDB_TypeLeaf *leaf_arr = leaf_arr_arr[source_idx]; + U64 leaf_idx = 0; + for (CV_LeafNode *node = list.first; node != 0; node = node->next, leaf_idx += 1) { + CV_Leaf *external_leaf = &node->data; + + // move patched type data + PDB_TypeLeaf *internal_leaf = leaf_arr + leaf_idx; + internal_leaf->kind = external_leaf->kind; + internal_leaf->data = push_str8_copy(type_server->arena, external_leaf->data); + + // push leaf to type server + pdb_type_server_push_(type_server, internal_leaf); + } + } + + scratch_end(scratch); + ProfEnd(); + return ti_map; +} +#endif + +//////////////////////////////// + +internal PDB_InfoContext * +pdb_info_alloc(U32 age, COFF_TimeStamp time_stamp, OS_Guid guid) +{ + ProfBeginFunction(); + Arena *arena = arena_alloc(); + PDB_InfoContext *info = push_array(arena, PDB_InfoContext, 1); + info->arena = arena; + info->flags = PDB_FeatureFlag_HAS_ID_STREAM; + info->time_stamp = time_stamp; + info->age = age; + info->guid = guid; + pdb_strtab_alloc(&info->strtab, 0x3fff); + pdb_hash_table_alloc(&info->named_stream_ht, 4); + pdb_hash_table_alloc(&info->src_header_block_ht, 8); + ProfEnd(); + return info; +} + +internal PDB_InfoContext * +pdb_info_open(MSF_Context *msf, MSF_StreamNumber sn) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + COFF_TimeStamp time_stamp = 0; + U32 age = 0; + OS_Guid guid = {0}; + PDB_FeatureFlags flags = 0; + PDB_HashTable named_stream_ht = {0}; + + U64 info_size = msf_stream_get_size(msf, sn); + String8 info_data = msf_stream_read_block(scratch.arena, 
msf, sn, info_size);
+
+  // the stream starts with its version; dispatch on it
+  PDB_InfoVersion version = 0;
+  str8_deserial_read_struct(info_data, 0, &version);
+
+  switch (version) {
+  case PDB_InfoVersion_VC70: {
+    U64 cursor = 0;
+
+    // read header (zero-initialized so a short read leaves defined values)
+    PDB_InfoHeaderV70 header = {0};
+    cursor += str8_deserial_read_struct(info_data, cursor, &header);
+
+    time_stamp = header.time_stamp;
+    age        = header.age;
+    guid       = header.guid;
+
+    // open named stream hash table
+    String8 named_stream_ht_data = str8_skip(info_data, cursor);
+    U64 named_stream_ht_size = 0;
+    PDB_HashTableParseError named_stream_ht_error = pdb_named_stream_ht_from_data(&named_stream_ht, named_stream_ht_data, &named_stream_ht_size);
+    if (named_stream_ht_error == PDB_HashTableParseError_OK) {
+      cursor += named_stream_ht_size;
+
+      // read PDB features
+      //
+      // The cursor only advances by the byte count the read reports, so a
+      // tail too short to hold a whole signature must end the loop here;
+      // otherwise the cursor would stall below info_data.size forever.
+      while (cursor < info_data.size) {
+        PDB_FeatureSig sig = 0;
+        U64 sig_read_size = str8_deserial_read_struct(info_data, cursor, &sig);
+        if (sig_read_size != sizeof(sig)) {
+          Assert(!"truncated feature sig");
+          break;
+        }
+        cursor += sig_read_size;
+
+        switch (sig) {
+        case PDB_FeatureSig_NULL: break;
+        case PDB_FeatureSig_VC140: {
+          flags |= PDB_FeatureFlag_HAS_ID_STREAM;
+        } break;
+        case PDB_FeatureSig_NO_TYPE_MERGE: {
+          flags |= PDB_FeatureFlag_NO_TYPE_MERGE;
+        } break;
+        case PDB_FeatureSig_MINIMAL_DEBUG_INFO: {
+          flags |= PDB_FeatureFlag_MINIMAL_DBG_INFO;
+        } break;
+        default: Assert(!"unknown feature sig"); break;
+        }
+      }
+    } else {
+      Assert(!"unable to open named stream hash table");
+    }
+  } break;
+  case PDB_InfoVersion_VC2:
+  case PDB_InfoVersion_VC4:
+  case PDB_InfoVersion_VC41:
+  case PDB_InfoVersion_VC50:
+  case PDB_InfoVersion_VC98:
+  case PDB_InfoVersion_VC70_DEP:
+  case PDB_InfoVersion_VC80:
+  case PDB_InfoVersion_VC110:
+  case PDB_InfoVersion_VC140: {
+    NotImplemented;
+  } break;
+  default: Assert(!"invalid info stream version"); break;
+  }
+
+  // open string table
+  PDB_StringTable strtab = {0};
+  MSF_StreamNumber strtab_sn = pdb_find_named_stream(&named_stream_ht, PDB_NAMES_STREAM_NAME);
+  if (strtab_sn != MSF_INVALID_STREAM_NUMBER) {
+    PDB_StringTableOpenError err = pdb_strtab_open(&strtab, msf, strtab_sn);
+
Assert(err == PDB_StringTableOpenError_OK); + } + + // open injected source files + PDB_HashTable src_header_block_ht = {0}; + MSF_StreamNumber src_header_block_sn = pdb_find_named_stream(&named_stream_ht, PDB_SRC_HEADER_BLOCK_STREAM_NAME); + if (src_header_block_sn != MSF_INVALID_STREAM_NUMBER) { + U64 src_header_block_stream_size = msf_stream_get_size(msf, src_header_block_sn); + String8 src_header_block_data = msf_stream_read_block(scratch.arena, msf, src_header_block_sn, src_header_block_stream_size); + PDB_HashTableParseError err = pdb_src_header_block_ht_from_data(&src_header_block_ht, src_header_block_data, &strtab, 0); + Assert(err == PDB_HashTableParseError_OK); + } + + // fill out info + Arena *arena = arena_alloc(); + PDB_InfoContext *info = push_array_no_zero(arena, PDB_InfoContext, 1); + info->arena = arena; + info->time_stamp = time_stamp; + info->age = age; + info->guid = guid; + info->flags = flags; + info->named_stream_ht = named_stream_ht; + info->src_header_block_ht = src_header_block_ht; + info->strtab = strtab; + + scratch_end(scratch); + ProfEnd(); + return info; +} + +internal void +pdb_info_build_src_header_block(PDB_InfoContext *info, MSF_Context *msf) +{ + Temp scratch = scratch_begin(0,0); + + // was stream allocated? 
+ MSF_StreamNumber src_header_block_sn = pdb_find_named_stream(&info->named_stream_ht, PDB_SRC_HEADER_BLOCK_STREAM_NAME); + if (src_header_block_sn == MSF_INVALID_STREAM_NUMBER) { + src_header_block_sn = pdb_push_named_stream(&info->named_stream_ht, msf, PDB_SRC_HEADER_BLOCK_STREAM_NAME); + } + + // build the hash table + String8 hash_table_data = pdb_data_from_src_header_block_ht(scratch.arena, &info->src_header_block_ht, &info->strtab); + AssertAlways(hash_table_data.size); + + // compute stream size + U64 src_header_stream_size = 0; + src_header_stream_size += sizeof(PDB_SrcHeaderBlockHeader); + src_header_stream_size += hash_table_data.size; + + // fill out header + PDB_SrcHeaderBlockHeader src_header; + src_header.version = PDB_SRC_HEADER_BLOCK_MAGIC_V1; + src_header.stream_size = src_header_stream_size; + src_header.file_time = 0; + src_header.age = 0; + MemoryZeroStruct(&src_header.pad); + + // write to stream + B32 is_header_written = msf_stream_write_struct(msf, src_header_block_sn, &src_header); + B32 is_hash_table_written = msf_stream_write_string(msf, src_header_block_sn, hash_table_data); + AssertAlways(is_header_written); + AssertAlways(is_hash_table_written); + AssertAlways(msf_stream_get_size(msf, src_header_block_sn) == src_header.stream_size); + + scratch_end(scratch); +} + +internal void +pdb_info_build_link_info(PDB_InfoContext *info, MSF_Context *msf) +{ + MSF_StreamNumber linkinfo_sn = pdb_find_named_stream(&info->named_stream_ht, PDB_LINK_INFO_STREAM_NAME); + if (linkinfo_sn == MSF_INVALID_STREAM_NUMBER) { + linkinfo_sn = pdb_push_named_stream(&info->named_stream_ht, msf, PDB_LINK_INFO_STREAM_NAME); + } + // TODO: populate LINKINFO +} + +internal void +pdb_info_build_names(PDB_InfoContext *info, MSF_Context *msf) +{ + MSF_StreamNumber strtab_sn = pdb_find_named_stream(&info->named_stream_ht, PDB_NAMES_STREAM_NAME); + if (strtab_sn == MSF_INVALID_STREAM_NUMBER) { + strtab_sn = pdb_push_named_stream(&info->named_stream_ht, msf, 
PDB_NAMES_STREAM_NAME); + } + pdb_strtab_build(&info->strtab, msf, strtab_sn); +} + +internal void +pdb_info_build(PDB_InfoContext *info, MSF_Context *msf, MSF_StreamNumber sn) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + // finalize named streams + pdb_info_build_src_header_block(info, msf); + pdb_info_build_link_info(info, msf); + pdb_info_build_names(info, msf); + + // serialize named streams hash table + String8 named_stream_ht_data = pdb_data_from_named_stream_ht(scratch.arena, &info->named_stream_ht); + + // fill out header + PDB_InfoHeaderV70 header; + header.version = PDB_InfoVersion_VC70; + header.time_stamp = info->time_stamp; + header.age = info->age; + header.guid = info->guid; + + // layout info stream + String8List info_srl = {0}; + str8_serial_begin(scratch.arena, &info_srl); + str8_serial_push_struct(scratch.arena, &info_srl, &header); + str8_serial_push_string(scratch.arena, &info_srl, named_stream_ht_data); + if (info->flags & PDB_FeatureFlag_HAS_ID_STREAM) { + str8_serial_push_u32(scratch.arena, &info_srl, PDB_FeatureSig_VC140); + } + if (info->flags & PDB_FeatureFlag_NO_TYPE_MERGE) { + str8_serial_push_u32(scratch.arena, &info_srl, PDB_FeatureSig_NO_TYPE_MERGE); + } + if (info->flags & PDB_FeatureFlag_MINIMAL_DBG_INFO) { + str8_serial_push_u32(scratch.arena, &info_srl, PDB_FeatureSig_MINIMAL_DEBUG_INFO); + } + + // write info to MSF + msf_stream_seek_start(msf, sn); + msf_stream_resize(msf, sn, info_srl.total_size); + msf_stream_write_list(msf, sn, info_srl); + + scratch_end(scratch); + ProfEnd(); +} + +internal void +pdb_info_release(PDB_InfoContext **info_ptr) +{ + ProfBeginFunction(); + arena_release((*info_ptr)->arena); + *info_ptr = NULL; + ProfEnd(); +} + +internal MSF_StreamNumber +pdb_push_named_stream(PDB_HashTable *named_stream_ht, MSF_Context *msf, String8 name) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + MSF_StreamNumber sn = msf_stream_alloc(msf); + String8 name_cstr = 
push_cstr(scratch.arena, name);
+  U32 sn32 = (U32)sn;
+  pdb_hash_table_set(named_stream_ht, name_cstr, str8_struct(&sn32));
+  scratch_end(scratch);
+  ProfEnd();
+  return sn;
+}
+
+// Looks up a stream number by name in the named-stream hash table.
+//
+// Returns the stream number stored for `name`, or MSF_INVALID_STREAM_NUMBER
+// when the table has no entry for it.
+internal MSF_StreamNumber
+pdb_find_named_stream(PDB_HashTable *named_stream_ht, String8 name)
+{
+  ProfBeginFunction();
+  MSF_StreamNumber result = MSF_INVALID_STREAM_NUMBER;
+  String8 value = {0};
+  if (pdb_hash_table_get(named_stream_ht, name, &value)) {
+    Assert(value.size == sizeof(U32));
+
+    // pdb_push_named_stream stores the value as a U32, so read it back as a
+    // U32 and convert. Copying instead of casting the pointer keeps the read
+    // independent of the width of MSF_StreamNumber and of the alignment of
+    // the table's storage.
+    U32 sn32 = 0;
+    MemoryCopy(&sn32, value.str, sizeof(sn32));
+    result = (MSF_StreamNumber)sn32;
+  }
+  ProfEnd();
+  return result;
+}
+
+internal PDB_SrcError
+pdb_add_src(PDB_InfoContext *info, MSF_Context *msf, String8 file_path, String8 file_data, PDB_SrcCompType comp)
+{
+  Temp scratch = scratch_begin(0,0);
+  PDB_SrcError error_status = PDB_SrcError_UNKNOWN;
+
+  if (comp == PDB_SrcComp_NULL) {
+    // process path so it passes VS validity checks
+    String8 virt_path = file_path;
+    String8 work_dir = os_get_current_path(scratch.arena);
+    virt_path = path_absolute_dst_from_relative_dst_src(scratch.arena, virt_path, work_dir);
+    virt_path = lower_from_str8(scratch.arena, virt_path);
+    virt_path = path_convert_slashes(scratch.arena, virt_path, PathStyle_UnixAbsolute);
+
+    String8 dummy_value;
+    B32 is_virt_path_present = pdb_hash_table_get(&info->src_header_block_ht, virt_path, &dummy_value);
+    if (!is_virt_path_present) {
+      String8 stream_name = push_str8f(scratch.arena, "/src/files/%S", virt_path);
+      MSF_StreamNumber sn = pdb_find_named_stream(&info->named_stream_ht, stream_name);
+      B32 is_name_free = (sn == MSF_INVALID_STREAM_NUMBER);
+      if (is_name_free) {
+        sn = pdb_push_named_stream(&info->named_stream_ht, msf, stream_name);
+        B32 is_file_data_written = msf_stream_write_string(msf, sn, file_data);
+        if (is_file_data_written) {
+          // add command line path
+          PDB_StringIndex file_path_stridx;
+          if (!pdb_strtab_search(&info->strtab, file_path, &file_path_stridx)) {
+            file_path_stridx = pdb_strtab_add(&info->strtab, file_path);
+          }
+
+          // add virtual path
+
PDB_StringIndex virt_path_stridx; + if (!pdb_strtab_search(&info->strtab, virt_path, &virt_path_stridx)) { + virt_path_stridx = pdb_strtab_add(&info->strtab, virt_path); + } + + // string indices -> offsets + PDB_StringOffset file_path_stroff = pdb_strtab_string_to_offset(&info->strtab, file_path_stridx); + PDB_StringOffset virt_path_stroff = pdb_strtab_string_to_offset(&info->strtab, virt_path_stridx); + + // fill out entry + PDB_SrcHeaderBlockEntry entry; + entry.size = sizeof(entry); + entry.version = PDB_SRC_HEADER_BLOCK_MAGIC_V1; + entry.file_crc = pdb_crc32_from_string(file_data); + entry.file_size = file_data.size; + entry.file_path = file_path_stroff; + entry.obj = 0; // null string offset + entry.virt_path = virt_path_stroff; + entry.comp = comp; + entry.flags = 0; + MemorySet(&entry.pad[0], 0, sizeof(entry.pad)); + MemorySet(&entry.reserved[0], 0, sizeof(entry.reserved)); + + // add to hash table { path, entry } + String8 key = virt_path; + String8 val = str8_struct(&entry); + pdb_hash_table_set(&info->src_header_block_ht, key, val); + + error_status = PDB_SrcError_OK; + } else { + error_status = PDB_SrcError_UNABLE_TO_WRITE_DATA; + } + } else { + error_status = PDB_SrcError_DUPLICATE_NAME_STREAM; + } + } else { + error_status = PDB_SrcError_DUPLICATE_ENTRY; + } + } else { + error_status = PDB_SrcError_UNSUPPORTED_COMPRESSION; + } + + scratch_end(scratch); + return error_status; +} + +//////////////////////////////// + +internal PDB_GsiContext * +gsi_alloc(void) +{ + ProfBeginFunction(); + Arena *arena = arena_alloc(); + PDB_GsiContext *gsi = push_array(arena, PDB_GsiContext, 1); + gsi->arena = arena; + gsi->word_size = PDB_GSI_V70_WORD_SIZE; + gsi->symbol_align = PDB_GSI_V70_SYMBOL_ALIGN; + gsi->bucket_count = PDB_GSI_V70_BUCKET_COUNT; + gsi->bucket_arr = push_array(arena, CV_SymbolList, gsi->bucket_count); + ProfEnd(); + return gsi; +} + +internal PDB_GsiContext * +gsi_open(MSF_Context *msf, MSF_StreamNumber sn, String8 symbol_data) +{ + 
ProfBeginFunction();
+
+  PDB_GsiHeader header = {0};
+  msf_stream_read_struct(msf, sn, &header);
+
+  // the context is always allocated, even when the header is not recognized;
+  // in that case it is returned with empty buckets
+  Arena *arena = arena_alloc();
+  PDB_GsiContext *gsi = push_array(arena, PDB_GsiContext, 1);
+  gsi->arena        = arena;
+  gsi->word_size    = PDB_GSI_V70_WORD_SIZE;
+  gsi->symbol_align = PDB_GSI_V70_SYMBOL_ALIGN;
+  gsi->bucket_count = PDB_GSI_V70_BUCKET_COUNT;
+  gsi->bucket_arr   = push_array(gsi->arena, CV_SymbolList, gsi->bucket_count);
+
+  if (header.signature == PDB_GsiSignature_Basic) {
+    if (header.version == PDB_GsiVersion_V70) {
+      Temp scratch = scratch_begin(0, 0);
+
+      Assert(header.bucket_data_size >= PDB_GSI_V70_BITMAP_SIZE); // TODO: error handle
+
+      // stream layout after the header: hash records, bucket bitmap, bucket offsets
+      U64 hash_record_count = header.hash_record_arr_size / sizeof(PDB_GsiHashRecord);
+      PDB_GsiHashRecord *hash_record_array = push_array(scratch.arena, PDB_GsiHashRecord, hash_record_count);
+      msf_stream_read_array(msf, sn, &hash_record_array[0], hash_record_count);
+
+      U32 *bitmap = push_array(scratch.arena, U32, PDB_GSI_V70_BITMAP_COUNT);
+      msf_stream_read_array(msf, sn, &bitmap[0], PDB_GSI_V70_BITMAP_COUNT);
+
+      U32 compressed_offset_count = (header.bucket_data_size - PDB_GSI_V70_BITMAP_SIZE) / sizeof(U32);
+      U32 *compressed_offset_array = push_array(scratch.arena, U32, compressed_offset_count);
+      msf_stream_read_array(msf, sn, &compressed_offset_array[0], compressed_offset_count);
+
+      U32 *compressed_offset_ptr = &compressed_offset_array[0];
+      U32 *compressed_offset_opl = &compressed_offset_array[0] + compressed_offset_count;
+
+      // End offset used for the last present bucket. Bucket offsets are divided
+      // by sizeof(PDB_GsiHashRecordOffsetCalc) to index the hash record array,
+      // so the end is the hash record count scaled by that size; it has to come
+      // from the hash record array size, not from the bucket data size.
+      U32 compressed_offset_max = (header.hash_record_arr_size / sizeof(PDB_GsiHashRecord)) * sizeof(PDB_GsiHashRecordOffsetCalc);
+
+      for (U32 imask = 0; imask < PDB_GSI_V70_BITMAP_COUNT; imask += 1) {
+        for (U32 ibit = 0; ibit < PDB_GSI_V70_WORD_SIZE; ibit += 1) {
+          // shift an unsigned one: the bitmap words are U32 and shifting a
+          // signed int into its top bit is undefined
+          B32 is_bucket_compressed = !!(bitmap[imask] & ((U32)1 << ibit));
+          if (is_bucket_compressed) {
+            Assert(compressed_offset_ptr < compressed_offset_opl);
+
+            U32 next_compressed_offset = compressed_offset_max;
+            if (compressed_offset_ptr + 1 <
compressed_offset_opl) { + next_compressed_offset = compressed_offset_ptr[1]; + } + U32 compressed_count = (next_compressed_offset - *compressed_offset_ptr) / sizeof(PDB_GsiHashRecordOffsetCalc); + + U64 hash_record_index = *compressed_offset_ptr / sizeof(PDB_GsiHashRecordOffsetCalc); + Assert(hash_record_index < hash_record_count); + + for (PDB_GsiHashRecord *hash_record_ptr = &hash_record_array[hash_record_index], *hash_record_opl = hash_record_ptr + compressed_count; + hash_record_ptr < hash_record_opl; + hash_record_ptr += 1) { + Assert(hash_record_ptr->symbol_off > 0); + Assert(hash_record_ptr->cref > 0); + + U32 symbol_off = hash_record_ptr->symbol_off -1; + U8 *symbol_ptr = symbol_data.str + symbol_off; + U16 *size_ptr = (U16*)symbol_ptr; + CV_SymKind *kind_ptr = (CV_SymKind*)(size_ptr + 1); + U8 *data_ptr = (U8*)(kind_ptr + 1); + + if (*size_ptr >= sizeof(*kind_ptr)) { + CV_Symbol symbol; + symbol.kind = *kind_ptr; + symbol.data = str8(data_ptr, *size_ptr - sizeof(*kind_ptr)); + gsi_push(gsi, &symbol); + } else { + Assert(!"invalid global codeview symbol"); + } + } + + compressed_offset_ptr += 1; + } + } + } + + scratch_end(scratch); + } else { + Assert(!"unknown GSI version"); + } + } + + // check if buckets are sorted +#if 0 + { + for (U64 i = 0; i < gsi->bucket_count; ++i) { + CV_SymbolList *bucket = &gsi->bucket_arr[i]; + for (CV_SymbolNode *prev = bucket->first, *curr = bucket->first ? 
bucket->first->next : NULL; + curr != NULL; + prev = curr, curr = curr->next) { + String8 a = pdb_get_symbol_name(prev->symbol.kind, prev->symbol.data); + String8 b = pdb_get_symbol_name(curr->symbol.kind, curr->symbol.data); + int compar = string_compar(a, b, false); + Assert(compar >= 0); + } + } + } +#endif + + ProfEnd(); + return gsi; +} + +internal void +gsi_release(PDB_GsiContext **gsi_ptr) +{ + ProfBeginFunction(); + arena_release((*gsi_ptr)->arena); + *gsi_ptr = NULL; + ProfEnd(); +} + +internal void +gsi_write_build_result(TP_Context *tp, PDB_GsiBuildResult build, MSF_Context *msf, MSF_StreamNumber gsi_sn, MSF_StreamNumber symbols_sn) +{ + ProfBeginFunction(); + + // reserve stream memory + U64 hash_record_arr_size = sizeof(build.hash_record_arr[0]) * build.hash_record_count; + U64 bitmap_size = sizeof(build.bitmap[0]) * build.bitmap_count; + U64 compressed_bucket_arr_size = sizeof(build.compressed_bucket_arr[0]) * build.compressed_bucket_count; + U64 gsi_size = sizeof(build.header) + hash_record_arr_size + bitmap_size + compressed_bucket_arr_size; + + ProfBegin("GSI Reserve"); + msf_stream_reserve(msf, gsi_sn, gsi_size); + ProfEnd(); + + ProfBegin("Symbol Data Reserve"); + msf_stream_reserve(msf, symbols_sn, build.symbol_data.size); + ProfEnd(); + + // write gsi stream + msf_stream_write_struct(msf, gsi_sn, &build.header); + + ProfBegin("Hash Record Write"); + msf_stream_write_parallel(tp, msf, gsi_sn, &build.hash_record_arr[0], hash_record_arr_size); + ProfEnd(); + + ProfBegin("Bitmap Write"); + msf_stream_write(msf, gsi_sn, &build.bitmap[0], bitmap_size); + ProfEnd(); + + ProfBegin("Compressed Bucket Write"); + msf_stream_write(msf, gsi_sn, &build.compressed_bucket_arr[0], compressed_bucket_arr_size); + ProfEnd(); + + // write symbol stream + ProfBegin("Symbol Data Write"); + msf_stream_write_string_parallel(tp, msf, symbols_sn, build.symbol_data); + ProfEnd(); + + ProfEnd(); +} + +int +gsi_hash_record_compar_is_before(void *a_, void *b_) +{ + 
PDB_GsiSortRecord *a = (PDB_GsiSortRecord*)a_; + PDB_GsiSortRecord *b = (PDB_GsiSortRecord*)b_; + + int is_before; + + if (a->name.size != b->name.size) { + is_before = a->name.size < b->name.size; + } else { + int cmp = str8_compar_ignore_case(&a->name, &b->name); + if (cmp == 0) { + cmp = u64_compar(&a->offset, &b->offset); + } + is_before = cmp < 0; + } + + return is_before; +} + +int +psi_addr_map_compar_is_before(void *a_, void *b_) +{ + PDB_GsiSortRecord *a = (PDB_GsiSortRecord*)a_; + PDB_GsiSortRecord *b = (PDB_GsiSortRecord*)b_; + + int is_before; + + if (a->isect_off.isect != b->isect_off.isect) { + is_before = a->isect_off.isect < b->isect_off.isect; + } else if (a->isect_off.off != b->isect_off.off) { + is_before = a->isect_off.off < b->isect_off.off; + } else { + is_before = a->name.size < b->name.size; + } + + return is_before; +} + +internal void +gsi_record_sort_by_name(PDB_GsiSortRecord *arr, U64 count) +{ + ProfBeginFunction(); + radsort(arr, count, gsi_hash_record_compar_is_before); + ProfEnd(); +} + +internal void +gsi_record_sort_by_sc(PDB_GsiSortRecord *arr, U64 count) +{ + ProfBeginFunction(); + radsort(arr, count, psi_addr_map_compar_is_before); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(gsi_size_buckets_task) +{ + U64 bucket_idx = task_id; + PDB_GsiSerializeSymbolsTask *task = raw_task; + CV_SymbolList *bucket_list = &task->bucket_arr[bucket_idx]; + for (CV_SymbolNode *node = bucket_list->first; node != 0; node = node->next) { + task->bucket_size_arr[bucket_idx] += cv_compute_symbol_record_size(&node->data, task->symbol_align); + } +} + +internal +THREAD_POOL_TASK_FUNC(gsi_serialize_pub32) +{ + U64 bucket_idx = task_id; + PDB_GsiSerializeSymbolsTask *task = raw_task; + + CV_SymbolList bucket_list = task->bucket_arr[bucket_idx]; + PDB_GsiSortRecord *sort_record_arr = task->sort_record_arr_arr[bucket_idx]; + U64 buffer_size = task->bucket_size_arr[bucket_idx]; + U64 buffer_base = task->bucket_off_arr[bucket_idx]; + U8 *buffer = 
task->buffer + buffer_base; + + U64 sort_idx = 0; + U64 buffer_cursor = 0; + + for (CV_SymbolNode *node = bucket_list.first; node != 0; node = node->next) { + CV_Symbol *symbol = &node->data; + Assert(symbol->kind == CV_SymKind_PUB32); + + CV_SymPub32 *pub32 = (CV_SymPub32 *)symbol->data.str; + U8 *str_ptr = (U8 *)(pub32 + 1); + U64 str_size = symbol->data.size - sizeof(*pub32); + String8 name = str8(str_ptr, str_size); + + // init sort record + PDB_GsiSortRecord *sr = &sort_record_arr[sort_idx]; + sr->isect_off = isect_off(pub32->sec, pub32->off); + sr->name = name; + sr->offset = buffer_cursor; + + // serialize symbol + U64 serial_size = cv_serialize_symbol_to_buffer(buffer, buffer_cursor, buffer_size, symbol, task->symbol_align); + + // advance + sort_idx += 1; + buffer_cursor += serial_size; + } + + Assert(sort_idx == bucket_list.count); + Assert(buffer_cursor == buffer_size); + + // sort symbols by name within bucket + gsi_record_sort_by_name(sort_record_arr, bucket_list.count); +} + +internal +THREAD_POOL_TASK_FUNC(gsi_serialize_symbols_task) +{ + U64 bucket_idx = task_id; + PDB_GsiSerializeSymbolsTask *task = raw_task; + + CV_SymbolList bucket_list = task->bucket_arr[bucket_idx]; + PDB_GsiSortRecord *sort_record_arr = task->sort_record_arr_arr[bucket_idx]; + U64 buffer_size = task->bucket_size_arr[bucket_idx]; + U64 buffer_base = task->bucket_off_arr[bucket_idx]; + U8 *buffer = task->buffer + buffer_base; + + U64 sort_idx = 0; + U64 buffer_cursor = 0; + + for (CV_SymbolNode *node = bucket_list.first; node != 0; node = node->next) { + CV_Symbol *symbol = &node->data; + + // init sort record + PDB_GsiSortRecord *sr = &sort_record_arr[sort_idx]; + //sr->isect_off = isect_off(0,0); + sr->name = cv_name_from_symbol(symbol->kind, symbol->data); + sr->offset = buffer_cursor; + + // serialize symbol + U64 serial_size = cv_serialize_symbol_to_buffer(buffer, buffer_cursor, buffer_size, symbol, task->symbol_align); + + // advance + sort_idx += 1; + buffer_cursor += 
serial_size; + } + + Assert(sort_idx == bucket_list.count); + Assert(buffer_cursor == buffer_size); + + // sort symbols by name within bucket + gsi_record_sort_by_name(sort_record_arr, bucket_list.count); +} + +internal PDB_GsiBuildResult +gsi_build_ex(TP_Context *tp, Arena *arena, PDB_GsiContext *gsi, U64 symbol_data_base, B32 is_pub32, U64 msf_page_size) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena,1); + + ProfBegin("Serialize & Sort Symbols"); + + PDB_GsiSerializeSymbolsTask serial_task; + serial_task.symbol_align = gsi->symbol_align; + serial_task.bucket_arr = gsi->bucket_arr; + serial_task.bucket_size_arr = push_array(scratch.arena, U64, gsi->bucket_count); + + // estimate each bucket size + tp_for_parallel(tp, 0, gsi->bucket_count, gsi_size_buckets_task, &serial_task); + + // prepare serial buffer + U64 buffer_size = sum_array_u64(gsi->bucket_count, serial_task.bucket_size_arr); + serial_task.buffer = push_array_no_zero(arena, U8, buffer_size); + serial_task.bucket_off_arr = push_array_copy_u64(scratch.arena, serial_task.bucket_size_arr, gsi->bucket_count); + counts_to_offsets_array_u64(gsi->bucket_count, serial_task.bucket_off_arr); + + // prepare GSI records + serial_task.sort_record_arr_arr = push_array_no_zero(scratch.arena, PDB_GsiSortRecord *, gsi->bucket_count); + serial_task.sort_record_arr = push_array_no_zero(arena, PDB_GsiSortRecord, gsi->symbol_count); + for (U64 bucket_idx = 0, cursor = 0; bucket_idx < gsi->bucket_count; bucket_idx += 1) { + serial_task.sort_record_arr_arr[bucket_idx] = serial_task.sort_record_arr + cursor; + cursor += gsi->bucket_arr[bucket_idx].count; + } + + // fill out sort records & serialize symbols + TP_TaskFunc *serial_func = is_pub32 ? 
gsi_serialize_pub32 : gsi_serialize_symbols_task; + tp_for_parallel(tp, 0, gsi->bucket_count, serial_func, &serial_task); + + ProfEnd(); + + U64 bitmap_count = (gsi->bucket_count / gsi->word_size) + 1; // ms-pdb allocates extra bucket and funnels free buckets there + U64 compressed_offset_count = 0; + U64 hash_record_count = gsi->symbol_count; + U32 *bitmap = push_array(arena, U32, bitmap_count); + U32 *compressed_offset_arr = push_array_no_zero(arena, U32, gsi->bucket_count); + PDB_GsiHashRecord *hash_record_arr = push_array_no_zero(arena, PDB_GsiHashRecord, hash_record_count); + + // offsets for symbol stream are shifted by one to tell apart from null and zero (see GSI1::fixSymRecs) + U64 offset_cursor = (1 + symbol_data_base); + U64 hash_idx = 0; + + ProfBegin("Write Bitmap & Record Offsets"); + for (U64 bucket_idx = 0; bucket_idx < gsi->bucket_count; bucket_idx += 1) { + // set bit for each occupied bucket + CV_SymbolList bucket_list = gsi->bucket_arr[bucket_idx]; + if (bucket_list.count) { + U64 word_idx = bucket_idx / gsi->word_size; + Assert(word_idx < bitmap_count); + bitmap[word_idx] |= 1 << (bucket_idx % gsi->word_size); + compressed_offset_arr[compressed_offset_count] = hash_idx * sizeof(PDB_GsiHashRecordOffsetCalc); // store in-memory offset for first bucket + compressed_offset_count += 1; + } + + // write out sorted hash records + PDB_GsiSortRecord *sort_record_arr = serial_task.sort_record_arr_arr[bucket_idx]; + for (U64 sr_idx = 0; sr_idx < gsi->bucket_arr[bucket_idx].count; sr_idx += 1, hash_idx += 1) { + PDB_GsiHashRecord *hr = &hash_record_arr[hash_idx]; + hr->symbol_off = offset_cursor + sort_record_arr[sr_idx].offset; + hr->cref = 1; + } + + // advance offset cursor + offset_cursor += serial_task.bucket_size_arr[bucket_idx]; + } + ProfEnd(); + + // fill out header + PDB_GsiHeader header; + header.signature = PDB_GsiSignature_Basic; + header.version = PDB_GsiVersion_V70; + header.hash_record_arr_size = sizeof(hash_record_arr[0]) * 
hash_record_count; + header.bucket_data_size = sizeof(bitmap[0]) * bitmap_count + sizeof(compressed_offset_arr[0]) * compressed_offset_count; + + // fill out result + PDB_GsiBuildResult result; + result.header = header; + result.hash_record_count = hash_record_count; + result.hash_record_arr = hash_record_arr; + result.sort_record_arr = serial_task.sort_record_arr; + result.bitmap_count = bitmap_count; + result.bitmap = bitmap; + result.compressed_bucket_count = compressed_offset_count; + result.compressed_bucket_arr = compressed_offset_arr; + result.total_hash_size = sizeof(header) + header.hash_record_arr_size + header.bucket_data_size; + result.symbol_data = str8(serial_task.buffer, buffer_size); + + scratch_end(scratch); + ProfEnd(); + return result; +} + +internal void +gsi_build(TP_Context *tp, PDB_GsiContext *gsi, MSF_Context *msf, MSF_StreamNumber sn, MSF_StreamNumber symbols_sn) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + U64 symbol_data_base = msf_stream_get_pos(msf, symbols_sn); + PDB_GsiBuildResult build = gsi_build_ex(tp, scratch.arena, gsi, symbol_data_base, /* is_pub32: */ 0, msf->page_size); + gsi_write_build_result(tp, build, msf, sn, symbols_sn); + + scratch_end(scratch); + ProfEnd(); +} + +internal U32 +gsi_hash(PDB_GsiContext *gsi, String8 input) +{ (void)gsi; + U32 hash = pdb_hash_v1(input); + return hash; +} + +internal void +gsi_push_(PDB_GsiContext *gsi, U32 hash, CV_SymbolNode *node) +{ + U64 bucket_idx = hash % gsi->bucket_count; + CV_SymbolList *list = &gsi->bucket_arr[bucket_idx]; + cv_symbol_list_push_node(list, node); + gsi->symbol_count += 1; +} + +internal CV_SymbolNode * +gsi_push(PDB_GsiContext *gsi, CV_Symbol *symbol) +{ + String8 name = cv_name_from_symbol(symbol->kind, symbol->data); + U32 hash = gsi_hash(gsi, name); + + CV_SymbolNode *node = push_array_no_zero(gsi->arena, CV_SymbolNode, 1); + node->next = 0; + node->prev = 0; + node->data = *symbol; + + gsi_push_(gsi, hash, node); + + return node; +} + 
+internal +THREAD_POOL_TASK_FUNC(gsi_symbol_hasher_task) +{ + ProfBeginFunction(); + GSI_SymbolHasherTask *task = raw_task; + Rng1U64 range = task->ranges[task_id]; + for (U64 symbol_idx = range.min; symbol_idx < range.max; ++symbol_idx) { + CV_SymbolNode *symbol = task->symbols[symbol_idx]; + String8 name = cv_name_from_symbol(symbol->data.kind, symbol->data.data); + task->hashes[symbol_idx] = gsi_hash(task->gsi, name); + } + ProfEnd(); +} + +internal void +gsi_push_many_arr(TP_Context *tp, PDB_GsiContext *gsi, U64 count, CV_SymbolNode **symbols) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0, 0); + + ProfBegin("Hash UDT Names"); + GSI_SymbolHasherTask task = {0}; + task.gsi = gsi; + task.ranges = tp_divide_work(scratch.arena, count, tp->worker_count); + task.symbols = symbols; + task.hashes = push_array_no_zero(scratch.arena, U32, count); + tp_for_parallel(tp, 0, tp->worker_count, gsi_symbol_hasher_task, &task); + ProfEnd(); + + for (U64 i = 0; i < count; ++i) { + gsi_push_(gsi, task.hashes[i], symbols[i]); + } + + scratch_end(scratch); + ProfEnd(); +} + +internal void +gsi_push_many_list(PDB_GsiContext *gsi, U64 count, U32 *hash_arr, CV_SymbolList *list) +{ + Assert(count == list->count); + + U64 hash_idx = 0; + for (CV_SymbolNode *curr = list->first, *next = 0; curr != 0; curr = next, ++hash_idx) { + next = curr->next; + + curr->prev = 0; + curr->next = 0; + + gsi_push_(gsi, hash_arr[hash_idx], curr); + } + + MemoryZeroStruct(list); +} + +internal void +gsi_push_many_and_remove_duplicates(TP_Context *tp, CV_SymbolList *list) +{ + +} + +internal CV_SymbolNode * +gsi_search(PDB_GsiContext *gsi, CV_Symbol *symbol) +{ + String8 name = cv_name_from_symbol(symbol->kind, symbol->data); + U32 hash = gsi_hash(gsi, name); + U64 ibucket = hash % gsi->bucket_count; + + CV_SymbolList bucket_list = gsi->bucket_arr[ibucket]; + for (CV_SymbolNode *node = bucket_list.first; node != 0; node = node->next) { + String8 that_name = cv_name_from_symbol(node->data.kind, 
node->data.data); + if (str8_match(name, that_name, 0)) { + return node; + } + } + + return NULL; +} + +//////////////////////////////// + +internal PDB_PsiContext * +psi_alloc(void) +{ + ProfBeginFunction(); + Arena *arena = arena_alloc(); + PDB_PsiContext *psi = push_array(arena, PDB_PsiContext, 1); + psi->arena = arena; + psi->gsi = gsi_alloc(); + ProfEnd(); + return psi; +} + +internal PDB_PsiContext * +psi_open(MSF_Context *msf, MSF_StreamNumber sn, String8 symbol_data) +{ + ProfBeginFunction(); + + Arena *arena = arena_alloc(); + PDB_PsiContext *psi = push_array(arena, PDB_PsiContext, 1); + psi->arena = arena; + + // TODO: read out address table + + PDB_PsiHeader header = {0}; + msf_stream_read_struct(msf, sn, &header); + + psi->gsi = gsi_open(msf, sn, symbol_data); + + ProfEnd(); + return psi; +} + +internal void +psi_build(TP_Context *tp, PDB_PsiContext *psi, MSF_Context *msf, MSF_StreamNumber sn, MSF_StreamNumber symbols_sn) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + U64 symbol_data_base = msf_stream_get_pos(msf, symbols_sn); + PDB_GsiBuildResult gsi_build = gsi_build_ex(tp, scratch.arena, psi->gsi, symbol_data_base, /* is_pub32: */ 1, msf->page_size); + + ProfBegin("Address Map"); + + ProfBegin("Sort"); + gsi_record_sort_by_sc(gsi_build.sort_record_arr, gsi_build.hash_record_count); + ProfEnd(); + + ProfBegin("Offset Fill"); + U64 addr_map_count = gsi_build.hash_record_count; + U64 addr_map_size = addr_map_count * sizeof(U32); + U32 *addr_map = push_array_no_zero(scratch.arena, U32, addr_map_count); + for (U64 i = 0; i < addr_map_count; i += 1) { + addr_map[i] = gsi_build.sort_record_arr[i].offset; + } + ProfEnd(); + + ProfEnd(); + + PDB_PsiHeader header; + header.sym_hash_size = gsi_build.total_hash_size; + header.addr_map_size = addr_map_size; + header.thunk_count = 0; + header.thunk_size = 0; + header.isec_thunk_table = 0; + header.padding = 0; + header.sec_thunk_table_off = 0; + header.sec_count = 0; + + ProfBegin("MSF Write"); 
+ msf_stream_write_struct(msf, sn, &header); + gsi_write_build_result(tp, gsi_build, msf, sn, symbols_sn); + msf_stream_write_array(msf, sn, &addr_map[0], addr_map_count); + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); +} + +internal void +psi_release(PDB_PsiContext **psi_ptr) +{ + ProfBeginFunction(); + gsi_release(&(*psi_ptr)->gsi); + arena_release((*psi_ptr)->arena); + *psi_ptr = NULL; + ProfEnd(); +} + +internal CV_SymbolNode * +psi_push(PDB_PsiContext *psi, CV_Pub32Flags flags, U32 offset, U16 isect, String8 name) +{ + CV_Symbol pub = cv_make_pub32(psi->arena, flags, offset, isect, name); + CV_SymbolNode *node = gsi_push(psi->gsi, &pub); + return node; +} + +//////////////////////////////// + +internal void +dbi_sec_contrib_list_push_node(PDB_DbiSectionContribList *list, PDB_DbiSectionContribNode *node) +{ + node->next = 0; + SLLQueuePush(list->first, list->last, node); + list->count += 1; +} + +internal PDB_DbiSectionContribNode * +dbi_sec_contrib_list_push(Arena *arena, PDB_DbiSectionContribList *list) +{ + PDB_DbiSectionContribNode *node = push_array_no_zero(arena, PDB_DbiSectionContribNode, 1); + node->next = 0; + dbi_sec_contrib_list_push_node(list, node); + return node; +} + +internal void +dbi_sec_list_concat_arr(PDB_DbiSectionContribList *list, U64 count, PDB_DbiSectionContribList *to_concat) +{ + SLLConcatInPlaceArray(list, to_concat, count); +} + +internal PDB_DbiContext * +dbi_alloc(COFF_MachineType machine, U32 age) +{ + ProfBeginFunction(); + Arena *arena = arena_alloc(); + PDB_DbiContext *dbi = push_array(arena, PDB_DbiContext, 1); + dbi->arena = arena; + dbi->age = age; + dbi->machine = machine; + dbi->globals_sn = MSF_INVALID_STREAM_NUMBER; + dbi->publics_sn = MSF_INVALID_STREAM_NUMBER; + dbi->symbols_sn = MSF_INVALID_STREAM_NUMBER; + pdb_strtab_alloc(&dbi->ec_names, 8); + for (U64 istream = 0; istream < ArrayCount(dbi->dbg_streams); istream += 1) { + dbi->dbg_streams[istream] = MSF_INVALID_STREAM_NUMBER; + } + ProfEnd(); + return dbi; +} 
+ +internal String8List * +dbi_open_file_info(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + MSF_UInt file_info_pos = sizeof(PDB_DbiHeader) + + dbi_header->module_info_size + + dbi_header->sec_con_size + + dbi_header->sec_map_size; + msf_stream_seek(msf, sn, file_info_pos); + + U16 mod_count = msf_stream_read_u16(msf, sn); + U16 total_file_count16 = msf_stream_read_u16(msf, sn); + + CV_ModIndex *imod_array = push_array(scratch.arena, CV_ModIndex, mod_count); + msf_stream_read_array(msf, sn, &imod_array[0], mod_count); + + U16 *mod_file_count = push_array(scratch.arena, U16, mod_count); + msf_stream_read_array(msf, sn, &mod_file_count[0], mod_count); + + U64 total_file_count = 0; + for (U16 imod = 0; imod < mod_count; imod += 1) { + total_file_count += mod_file_count[imod]; + } + + U32 *file_name_offset_array = push_array(scratch.arena, U32, total_file_count); + msf_stream_read_array(msf, sn, &file_name_offset_array[0], total_file_count); + + U64 file_name_buffer_offset = sizeof(mod_count) + + sizeof(total_file_count16) + + sizeof(imod_array[0]) * mod_count + + sizeof(mod_file_count[0]) * mod_count + + sizeof(file_name_offset_array[0]) * total_file_count; + Assert(dbi_header->file_info_size >= file_name_buffer_offset); + U64 file_name_buffer_size = dbi_header->file_info_size - file_name_buffer_offset; + char *file_name_buffer = push_array(arena, char, file_name_buffer_size + 1); + msf_stream_read_array(msf, sn, &file_name_buffer[0], file_name_buffer_size); + + String8List *file_info = push_array(arena, String8List, mod_count + 1); + + U32 *file_name_offset_ptr = &file_name_offset_array[0]; + for (U64 mod_idx = 0; mod_idx < mod_count; ++mod_idx) { + String8List *file_list = &file_info[mod_idx]; + U16 file_count = mod_file_count[mod_idx]; + for (U16 ifile = 0; ifile < file_count; ifile += 1, file_name_offset_ptr += 1) { + Assert(*file_name_offset_ptr <= 
file_name_buffer_size); + String8 file_path = str8_cstring(file_name_buffer + *file_name_offset_ptr); + str8_list_push(arena, file_list, file_path); + } + } + + scratch_end(scratch); + ProfEnd(); + return file_info; +} + +internal PDB_DbiModuleList +dbi_open_module_info(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header, String8List *file_info) +{ + ProfBeginFunction(); + + PDB_DbiModuleList list = {0}; + + MSF_UInt module_info_pos = sizeof(PDB_DbiHeader); + msf_stream_seek(msf, sn, module_info_pos); + + MSF_UInt module_info_opl = module_info_pos + dbi_header->module_info_size; + while (msf_stream_get_pos(msf, sn) < module_info_opl) { + PDB_DbiCompUnitHeader header = {0}; + msf_stream_read_struct(msf, sn, &header); + String8 obj_path = msf_stream_read_string(arena, msf, sn); + String8 lib_path = msf_stream_read_string(arena, msf, sn); + msf_stream_align(msf, sn, PDB_MODULE_ALIGN); + + String8List source_file_list = {0}; + if (header.contribution.base.mod != CV_ModIndex_Invalid) { + source_file_list = file_info[header.contribution.base.mod]; + } + + PDB_DbiModule *mod = push_array(arena, PDB_DbiModule, 1); + mod->next = 0; + mod->sn = header.sn; + mod->imod = header.contribution.base.mod; + mod->sym_data_size = header.symbols_size; + mod->c11_data_size = header.c11_lines_size; + mod->c13_data_size = header.c13_lines_size; + mod->source_file_list = source_file_list; + mod->obj_path = obj_path; + mod->lib_path = lib_path; + mod->first_sc = header.contribution; + + SLLQueuePush(list.first, list.last, mod); + list.count += 1; + } + + ProfEnd(); + return list; +} + +internal PDB_DbiSectionContribList +dbi_open_sec_contrib(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header) +{ + ProfBeginFunction(); + + PDB_DbiSectionContribList sec_contrib = {0}; + + if (dbi_header->sec_con_size > sizeof(PDB_DbiSectionContrib)) { + Temp scratch = scratch_begin(&arena, 1); + + // seek to start of section contrib info + MSF_UInt 
sec_con_pos = sizeof(PDB_DbiHeader) + dbi_header->module_info_size; + msf_stream_seek(msf, sn, sec_con_pos); + + // read header + PDB_DbiSectionContribVersion version = 0; + msf_stream_read_struct(msf, sn, &version); + + // parse contrib items + switch (version) { + case PDB_DbiSectionContribVersion_1: { + U64 contrib_count = dbi_header->sec_con_size / sizeof(PDB_DbiSectionContrib); + PDB_DbiSectionContrib *src_contrib_array = push_array(scratch.arena, PDB_DbiSectionContrib, contrib_count); + MSF_UInt sec_con_read = msf_stream_read_array(msf, sn, &src_contrib_array[0], contrib_count); + Assert(sec_con_read == sizeof(src_contrib_array[0]) * contrib_count); + + PDB_DbiSectionContribNode *dst_contrib_array = push_array_no_zero(arena, PDB_DbiSectionContribNode, contrib_count); + for (U64 icontrib = 0; icontrib < contrib_count; icontrib += 1) { + dst_contrib_array[icontrib].next = 0; + dst_contrib_array[icontrib].data = src_contrib_array[icontrib]; + dbi_sec_contrib_list_push_node(&sec_contrib, &dst_contrib_array[icontrib]); + } + } break; + case PDB_DbiSectionContribVersion_2: { + NotImplemented; + } break; + default: Assert(!"unknown section contrib version"); break; + } + + // have we exhausted sec-con bytes? 
+ Assert(sec_con_pos + dbi_header->sec_con_size == msf_stream_get_pos(msf, sn)); + scratch_end(scratch); + } + + ProfEnd(); + return sec_contrib; +} + +internal PDB_StringTable +dbi_open_ec_names(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header) +{ + ProfBeginFunction(); + PDB_StringTable ec_names = {0}; + if (dbi_header->ec_info_size >= sizeof(PDB_StringTableHeader)) { + MSF_UInt ec_names_pos = sizeof(PDB_DbiHeader) + + dbi_header->module_info_size + + dbi_header->sec_con_size + + dbi_header->sec_map_size + + dbi_header->file_info_size + + dbi_header->tsm_size; + msf_stream_seek(msf, sn, ec_names_pos); + pdb_strtab_open(&ec_names, msf, sn); + } + ProfEnd(); + return ec_names; +} + +internal void +dbi_open_dbg_streams(MSF_StreamNumber *dbg_streams, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header) +{ + ProfBeginFunction(); + Assert(dbi_header->dbg_header_size % sizeof(dbg_streams[0]) == 0); // TODO: error handle + MSF_UInt dbg_stream_pos = sizeof(PDB_DbiHeader) + + dbi_header->module_info_size + + dbi_header->sec_con_size + + dbi_header->sec_map_size + + dbi_header->file_info_size + + dbi_header->tsm_size + + dbi_header->ec_info_size; + msf_stream_seek(msf, sn, dbg_stream_pos); + msf_stream_read(msf, sn, &dbg_streams[0], dbi_header->dbg_header_size); + ProfEnd(); +} + +internal PDB_DbiSectionList +dbi_open_section_headers(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn) +{ + ProfBeginFunction(); + PDB_DbiSectionList sec_list = {0}; + U64 sec_count = msf_stream_get_size(msf, sn) / sizeof(PDB_DbiSectionNode); + PDB_DbiSectionNode *sec_nodes = push_array(arena, PDB_DbiSectionNode, sec_count); + for (U64 isec = 0; isec < sec_count; isec += 1) { + PDB_DbiSectionNode *sec = &sec_nodes[isec]; + msf_stream_read_struct(msf, sn, &sec->data); + SLLQueuePush(sec_list.first, sec_list.last, sec); + sec_list.count += 1; + } + ProfEnd(); + return sec_list; +} + +internal PDB_DbiContext * +dbi_open(MSF_Context *msf, MSF_StreamNumber 
sn) +{ + ProfBeginFunction(); + + PDB_DbiHeader header = {0}; + msf_stream_read_struct(msf, sn, &header); + + Arena *arena = arena_alloc(); + PDB_DbiContext *dbi = push_array(arena, PDB_DbiContext, 1); + dbi->arena = arena; + dbi->age = header.age; + dbi->machine = header.machine; + dbi->globals_sn = header.gsi_sn; + dbi->publics_sn = header.psi_sn; + dbi->symbols_sn = header.sym_sn; + + if (header.sig == PDB_DbiHeaderSignature_V1) { + switch (header.version) { + case PDB_DbiVersion_41: + case PDB_DbiVersion_50: + case PDB_DbiVersion_60: + case PDB_DbiVersion_110: { + Assert(!"TODO: support for older DBI versions"); + } break; + case PDB_DbiVersion_70: { + String8List *file_info = dbi_open_file_info(dbi->arena, msf, sn, &header); + dbi->module_list = dbi_open_module_info(dbi->arena, msf, sn, &header, file_info); + dbi->sec_contrib_list = dbi_open_sec_contrib(dbi->arena, msf, sn, &header); + // TODO: section map + //dbi->sec_map = dbi_open_sec_map(dbi->arena, msf, sn, &header); + dbi->ec_names = dbi_open_ec_names(dbi->arena, msf, sn, &header); + dbi_open_dbg_streams(&dbi->dbg_streams[0], msf, sn, &header); + dbi->section_list = dbi_open_section_headers(dbi->arena, msf, dbi->dbg_streams[PDB_DbiStream_SECTION_HEADER]); + } break; + } + } + + ProfEnd(); + return dbi; + +} + +internal void +dbi_build_section_header_stream(PDB_DbiContext *dbi, MSF_Context *msf, MSF_StreamNumber sn) +{ + ProfBeginFunction(); + + U64 header_arr_size = sizeof(dbi->section_list.first->data) * dbi->section_list.count; + msf_stream_resize(msf, sn, header_arr_size); + msf_stream_seek(msf, sn, 0); + + for (PDB_DbiSectionNode *i = dbi->section_list.first; i; i = i->next) { + msf_stream_write_struct(msf, sn, &i->data); + } + + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(dbi_build_file_info_assign_file_offsets_task) +{ + ProfBeginFunction(); + + PDB_DbiBuildFileInfoTask *task = raw_task; + PDB_DbiModule *mod = task->mod_arr[task_id]; + + task->imod_arr[mod->imod] = mod->imod; + + if (mod->imod 
!= CV_ModIndex_Invalid) { + // assign source file count + task->source_file_name_count_arr[mod->imod] = safe_cast_u16x(mod->source_file_list.node_count); + + // assign source file offsets + U64 source_file_idx = 0; + for (String8Node *string_n = mod->source_file_list.first; string_n != 0; string_n = string_n->next, ++source_file_idx) { + CV_StringBucket *string_bucket = cv_string_hash_table_lookup(task->string_ht, string_n->string); + task->source_file_name_offset_arr[mod->imod][source_file_idx] = safe_cast_u32(string_bucket->u.offset); + } + } else { + // module was deleted don't create source file info + task->source_file_name_count_arr[mod->imod] = 0; + } + + ProfEnd(); +} + +internal String8List +dbi_build_file_info(Arena *arena, TP_Context *tp, PDB_DbiModuleList mod_list, CV_StringHashTable string_ht) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + U64 total_source_file_count = 0; + U64 mod_arr_count = 0; + PDB_DbiModule **mod_arr = push_array_no_zero(scratch.arena, PDB_DbiModule *, mod_list.count); + + for (PDB_DbiModule *mod = mod_list.first; mod != 0; mod = mod->next) { + mod_arr[mod_arr_count++] = mod; + if (mod->imod != CV_ModIndex_Invalid) { + total_source_file_count += mod->source_file_list.node_count; + } + } + + U32 **source_file_name_offsets_arr = push_array_no_zero(scratch.arena, U32 *, mod_list.count); + U32 *source_file_name_offsets = push_array_no_zero(arena, U32, total_source_file_count); + for (U64 mod_idx = 0, cursor = 0; mod_idx < mod_list.count; ++mod_idx) { + if (mod_arr[mod_idx]->imod != CV_ModIndex_Invalid) { + source_file_name_offsets_arr[mod_idx] = source_file_name_offsets + cursor; + cursor += mod_arr[mod_idx]->source_file_list.node_count; + } else { + source_file_name_offsets_arr[mod_idx] = 0; + } + } + + U16 total_source_file_count16 = Min(max_U16, total_source_file_count); + U16 mod_count16 = Min(max_U16, mod_list.count); + + PDB_DbiBuildFileInfoTask task = {0}; + task.string_ht = string_ht; + task.mod_arr = 
mod_arr; + task.imod_arr = push_array_no_zero(arena, U16, mod_count16); + task.source_file_name_count_arr = push_array_no_zero(arena, U16, mod_list.count); + task.source_file_name_offset_arr = source_file_name_offsets_arr; + tp_for_parallel(tp, 0, mod_arr_count, dbi_build_file_info_assign_file_offsets_task, &task); + + // pack strings + String8 string_buffer = cv_pack_string_hash_table(arena, tp, string_ht); + + // layout file info sections + String8List file_info_srl = {0}; + str8_serial_begin(arena, &file_info_srl); + str8_serial_push_u16(arena, &file_info_srl, mod_count16); + str8_serial_push_u16(arena, &file_info_srl, total_source_file_count16); + str8_list_push(arena, &file_info_srl, str8_array(task.imod_arr, mod_count16)); + str8_list_push(arena, &file_info_srl, str8_array(task.source_file_name_count_arr, mod_list.count)); + str8_list_push(arena, &file_info_srl, str8_array(source_file_name_offsets, total_source_file_count)); + str8_list_push(arena, &file_info_srl, string_buffer); + str8_serial_push_align(arena, &file_info_srl, sizeof(U32)); + + scratch_end(scratch); + ProfEnd(); + return file_info_srl; +} + +internal String8List +dbi_build_module_info(Arena *arena, PDB_DbiContext *dbi, MSF_Context *msf) +{ + ProfBeginFunction(); + + String8List module_info_list = {0}; + str8_serial_begin(arena, &module_info_list); + + for (PDB_DbiModule *mod = dbi->module_list.first; mod != 0; mod = mod->next) { + // fill out header + PDB_DbiCompUnitHeader *header = push_array(arena, PDB_DbiCompUnitHeader, 1); + header->contribution = mod->first_sc; + // we don't use these flags right now + // U16 is_written : 1 + // U16 unused : 7 + // U16 tsm_index : 8 ; index into type server map + header->flags = 0; + header->sn = mod->sn; + header->symbols_size = mod->sym_data_size; + header->c11_lines_size = mod->c11_data_size; + header->c13_lines_size = mod->c13_data_size; + header->num_contrib_files = Min(max_U16, mod->source_file_list.node_count); + header->file_names_offset = 0; // 
TODO: fill out the offset + // TODO: generate EC info + header->src_file = 0; + header->pdb_file = 0; + + Assert(header->sn != MSF_INVALID_STREAM_NUMBER); + + // push module info + str8_serial_push_struct(arena, &module_info_list, header); + str8_serial_push_cstr(arena, &module_info_list, mod->obj_path); + str8_serial_push_cstr(arena, &module_info_list, mod->lib_path); + str8_serial_push_align(arena, &module_info_list, PDB_MODULE_ALIGN); + } + + ProfEnd(); + return module_info_list; +} + +#if 0 +int +dbi_sc_compar(const PDB_DbiSectionContrib *a, const PDB_DbiSectionContrib *b) +{ +#if 0 + int cmp = 0; + if (a->base.sec == b->base.sec) { + if (a->base.sec_off < b->base.sec_off) { + cmp = -1; + } else if (a->base.sec_off > b->base.sec_off) { + cmp = +1; + } + } else if (a->base.sec < b->base.sec) { + cmp = -1; + } else { + cmp = +1; + } +#else +#define MAKE_SORTER(x) (((U64)(x)->base.sec << 32) | (U64)(x)->base.sec_off) + U64 l = MAKE_SORTER(a); + U64 r = MAKE_SORTER(b); + int cmp = l < r ? -1 : l > r ? 
+ 1 : 0; +#undef MAKE_SORTER +#endif + return cmp; +} +#endif + +internal void +lnk_radix_sort_dbi_sc_array(PDB_DbiSectionContrib *arr, U64 sc_count, U64 sect_count) +{ + ProfBeginFunction(); + +#if 1 + // faster but uses more memory +# define RADIX_BIT_COUNT 16 +# define RADIX_MAX 2 +#else + // slower but uses less memory +# define RADIX_BIT_COUNT 8 +# define RADIX_MAX 4 +#endif + + Temp scratch = scratch_begin(0,0); + + PDB_DbiSectionContrib *temp_arr = push_array_no_zero(scratch.arena, PDB_DbiSectionContrib, sc_count); + PDB_DbiSectionContrib *src_arr = arr; + PDB_DbiSectionContrib *dst_arr = temp_arr; + + ProfBegin("Count Memzero"); + U32 count_8lo[256]; MemoryZeroArray(count_8lo); + U32 count_8hi[256]; MemoryZeroArray(count_8hi); + U32 count_16[1 << 16]; MemoryZeroArray(count_16); + U32 *count_arr = push_array(scratch.arena, U32, sect_count + 1); + ProfEnd(); + + ProfBegin("Histogram"); + for (U64 i = 0; i < sc_count; i += 1) { + PDB_DbiSectionContrib *sc = src_arr + i; + count_arr[sc->base.sec] += 1; + + U64 digit_8lo = (sc->base.sec_off >> 0) % ArrayCount(count_8lo); + U64 digit_8hi = (sc->base.sec_off >> 8) % ArrayCount(count_8hi); + U64 digit_16 = (sc->base.sec_off >> 16) % ArrayCount(count_16); + count_8lo[digit_8lo] += 1; + count_8hi[digit_8hi] += 1; + count_16[digit_16] += 1; + } + ProfEnd(); + + // + // sort on section offset + // + + ProfBegin("Offsets"); + U32 offset_8lo = 0; + U32 offset_8hi = 0; + for (U64 i = 1; i <= ArrayCount(count_8lo); i += 1) { + U32 current_8lo = count_8lo[i - 1]; + U32 current_8hi = count_8hi[i - 1]; + count_8lo[i - 1] = offset_8lo; + count_8hi[i - 1] = offset_8hi; + offset_8lo += current_8lo; + offset_8hi += current_8hi; + } + + U32 offset_16 = 0; + for (U64 i = 1; i <= ArrayCount(count_16); i += 1) { + U32 current_16 = count_16[i - 1]; + count_16[i - 1] = offset_16; + offset_16 += current_16; + } + ProfEnd(); + + count_8lo[0] = 0; + count_8hi[0] = 0; + count_16[0] = 0; + + ProfBegin("Order 8 Lo"); + for (U64 i = 0; i < 
sc_count; i += 1) { + PDB_DbiSectionContrib *sc = &src_arr[i]; + U64 digit = (sc->base.sec_off >> 0) % ArrayCount(count_8lo); + dst_arr[count_8lo[digit]++] = *sc; + } + ProfEnd(); + + ProfBegin("Order 8 Hi"); + for (U64 i = 0; i < sc_count; i += 1) { + PDB_DbiSectionContrib *sc = &dst_arr[i]; + U64 digit = (sc->base.sec_off >> 8) % ArrayCount(count_8hi); + src_arr[count_8hi[digit]++] = *sc; + } + ProfEnd(); + + ProfBegin("Order 16"); + for (U64 i = 0; i < sc_count; i += 1) { + PDB_DbiSectionContrib *sc = &src_arr[i]; + U64 digit = (sc->base.sec_off >> 16) % ArrayCount(count_16); + dst_arr[count_16[digit]++] = *sc; + } + ProfEnd(); + + // + // sort on section index + // + + ProfBegin("Section Indices"); + + U32 offset = 0; + for (U64 i = 1; i <= sect_count; i += 1) { + U32 current = count_arr[i - 1]; + count_arr[i - 1] = offset; + offset += current; + } + + count_arr[0] = 0; + + for (U64 i = 0; i < sc_count; i += 1) { + PDB_DbiSectionContrib *sc = dst_arr + i; + src_arr[count_arr[sc->base.sec]++] = *sc; + } + + ProfEnd(); + +#if 0 + for (U64 i = 1; i < sc_count; i += 1) { + U64 a = ((U64)arr[i - 1].base.sec << 32) | arr[i - 1].base.sec_off; + U64 b = ((U64)arr[i ].base.sec << 32) | arr[i ].base.sec_off; + Assert(a <= b); + } +#endif + + scratch_end(scratch); + +#undef RADIX_BIT_COUNT +#undef RADIX_MAX + + ProfEnd(); +} + +internal String8List +dbi_build_sec_con(Arena *arena, PDB_DbiContext *dbi) +{ + ProfBeginFunction(); + + PDB_DbiSectionContribVersion *version = push_array(arena, PDB_DbiSectionContribVersion, 1); + *version = PDB_DbiSectionContribVersion_1; + + // push section contribs V1 + ProfBegin("Push sect contribs [Count %llu]", dbi->sec_contrib_list.count); + PDB_DbiSectionContrib *sc_array = push_array_no_zero(arena, PDB_DbiSectionContrib, dbi->sec_contrib_list.count); + PDB_DbiSectionContrib *dst = &sc_array[0]; + for (PDB_DbiSectionContribNode *src = dbi->sec_contrib_list.first; src != 0; src = src->next, dst += 1) { + *dst = src->data; + } + ProfEnd(); 
+ + // sort section contribs so they are binary searchable + lnk_radix_sort_dbi_sc_array(sc_array, dbi->sec_contrib_list.count, dbi->section_list.count + 1); + + // push section contrib info + ProfBegin("List Push"); + String8List sec_con_list = {0}; + str8_list_push(arena, &sec_con_list, str8((U8*)version, sizeof(*version))); + str8_list_push(arena, &sec_con_list, str8((U8*)sc_array, sizeof(sc_array[0])*dbi->sec_contrib_list.count)); + ProfEnd(); + + ProfEnd(); + return sec_con_list; +} + +internal String8List +dbi_build_sec_map(Arena *arena, PDB_DbiContext *dbi) +{ + ProfBeginFunction(); + + U64 entry_count = dbi->section_list.count + 1; + PDB_DbiSecMapEntry *entry_array = push_array(arena, PDB_DbiSecMapEntry, entry_count); + U64 isect = 0; + for (PDB_DbiSectionNode *sect = dbi->section_list.first; sect; sect = sect->next, ++isect) { + PDB_DbiSecMapEntry *s = &entry_array[isect]; + COFF_SectionHeader *coff_header = §->data; + if (coff_header->flags & COFF_SectionFlag_MEM_READ) { + s->flags |= PDB_DbiOMF_READ; + } + if (coff_header->flags & COFF_SectionFlag_MEM_WRITE) { + s->flags |= PDB_DbiOMF_WRITE; + } + if (coff_header->flags & COFF_SectionFlag_MEM_EXECUTE) { + s->flags |= PDB_DbiOMF_EXEC; + } + if (~coff_header->flags & COFF_SectionFlag_MEM_16BIT) { + s->flags |= PDB_DbiOMF_IS_32BIT_ADDR; + } + s->flags |= PDB_DbiOMF_IS_SELECTOR; // always set + s->sec_size = coff_header->vsize; + s->frame = isect + 1; + s->sec_name = max_U16; + s->class_name = max_U16; + } + // init last entry + { + PDB_DbiSecMapEntry *s = &entry_array[entry_count - 1]; + s->flags = PDB_DbiOMF_IS_32BIT_ADDR | PDB_DbiOMF_IS_ABS_ADDR; + s->sec_size = max_U32; + s->frame = isect + 1; + s->sec_name = max_U16; + s->class_name = max_U16; + } + + // init header + PDB_DbiSecMapHeader *header = push_array(arena, PDB_DbiSecMapHeader, 1); + header->section_count = entry_count; + header->segment_count = entry_count; + + // push section map info + String8List sec_map_list = {0}; + str8_list_push(arena, 
&sec_map_list, str8((U8*)header, sizeof(*header))); + str8_list_push(arena, &sec_map_list, str8((U8*)entry_array, sizeof(entry_array[0])*entry_count)); + + ProfEnd(); + return sec_map_list; +} + +internal String8List +dbi_build_dbg_header(Arena *arena, PDB_DbiContext *dbi, MSF_Context *msf) +{ + ProfBeginFunction(); + if (dbi->dbg_streams[PDB_DbiStream_SECTION_HEADER] == MSF_INVALID_STREAM_NUMBER) { + dbi->dbg_streams[PDB_DbiStream_SECTION_HEADER] = msf_stream_alloc(msf); + } + dbi_build_section_header_stream(dbi, msf, dbi->dbg_streams[PDB_DbiStream_SECTION_HEADER]); + + String8List dbg_header_srl = {0}; + str8_serial_begin(arena, &dbg_header_srl); + str8_serial_push_array(arena, &dbg_header_srl, dbi->dbg_streams, ArrayCount(dbi->dbg_streams)); + + ProfEnd(); + return dbg_header_srl; +} + +internal void +dbi_build(TP_Context *tp, PDB_DbiContext *dbi, MSF_Context *msf, MSF_StreamNumber dbi_sn, CV_StringHashTable string_ht) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0, 0); + + ProfBegin("Build"); + String8List module_info_list = dbi_build_module_info(scratch.arena, dbi, msf); + String8List sec_con_list = dbi_build_sec_con(scratch.arena, dbi); + String8List sec_map_list = dbi_build_sec_map(scratch.arena, dbi); + String8List file_info_list = dbi_build_file_info(scratch.arena, tp, dbi->module_list, string_ht); + String8List dbg_header_list = dbi_build_dbg_header(scratch.arena, dbi, msf); + String8List tsm_list = {0}; // TODO: TSM + ProfEnd(); + + PDB_DbiHeader header = {0}; + header.sig = PDB_DbiHeaderSignature_V1; + header.version = PDB_DbiVersion_70; + header.age = dbi->age; + header.gsi_sn = dbi->globals_sn; + header.build_number = PDB_DbiMakeBuildNumber(14, 11); + header.psi_sn = dbi->publics_sn; + header.pdb_version = 0; + header.sym_sn = dbi->symbols_sn; + header.pdb_version2 = 0; + header.module_info_size = module_info_list.total_size; + header.sec_con_size = sec_con_list.total_size; + header.sec_map_size = sec_map_list.total_size; + 
header.file_info_size = file_info_list.total_size; + header.tsm_size = tsm_list.total_size; + header.mfc_index = 0; + header.dbg_header_size = dbg_header_list.total_size; + header.ec_info_size = pdb_strtab_get_serialized_size(&dbi->ec_names); + header.flags = 0; + header.machine = dbi->machine; + header.reserved = 0; + + ProfBegin("MSF Write"); + + U64 dbi_stream_size = sizeof(header) + + module_info_list.total_size + + sec_con_list.total_size + + sec_map_list.total_size + + file_info_list.total_size + + tsm_list.total_size + + dbg_header_list.total_size; + msf_stream_resize(msf, dbi_sn, dbi_stream_size); + msf_stream_seek_start(msf, dbi_sn); + msf_stream_write(msf, dbi_sn, &header, sizeof(header)); + msf_stream_write_list(msf, dbi_sn, module_info_list); + msf_stream_write_list(msf, dbi_sn, sec_con_list); + msf_stream_write_list(msf, dbi_sn, sec_map_list); + msf_stream_write_list(msf, dbi_sn, file_info_list); + msf_stream_write_list(msf, dbi_sn, tsm_list); + pdb_strtab_build(&dbi->ec_names, msf, dbi_sn); + msf_stream_write_list(msf, dbi_sn, dbg_header_list); + ProfEnd(); + + ProfEnd(); + scratch_end(scratch); +} + +internal void +dbi_release(PDB_DbiContext **dbi_ptr) +{ + ProfBeginFunction(); + arena_release((*dbi_ptr)->arena); + *dbi_ptr = 0; + ProfEnd(); +} + +internal PDB_DbiModule * +dbi_push_module(PDB_DbiContext *dbi, String8 obj_path, String8 lib_path) +{ + // init module + PDB_DbiModule *mod = push_array(dbi->arena, PDB_DbiModule, 1); + mod->imod = safe_cast_u32(dbi->module_list.count); + mod->sn = MSF_INVALID_STREAM_NUMBER; + mod->obj_path = push_str8_copy(dbi->arena, obj_path); + mod->lib_path = push_str8_copy(dbi->arena, lib_path.size > 0 ? 
lib_path : obj_path); + + // push to list + SLLQueuePush(dbi->module_list.first, dbi->module_list.last, mod); + dbi->module_list.count += 1; + + return mod; +} + +internal void +dbi_module_push_section_contrib(PDB_DbiContext *dbi, + PDB_DbiModule *mod, + ISectOff isect_off, + U32 size, + U32 data_crc, + U32 reloc_crc, + COFF_SectionFlags flags) +{ + ProfBeginFunction(); + + PDB_DbiSectionContrib sc; + sc.base.sec = safe_cast_u16(isect_off.isect); + sc.base.sec_off = isect_off.off; + sc.base.size = size; + sc.base.flags = flags; + sc.base.mod = mod->imod; + sc.data_crc = data_crc; + sc.reloc_crc = reloc_crc; + + PDB_DbiSectionContribNode *node = push_array_no_zero(dbi->arena, PDB_DbiSectionContribNode, 1); + node->data = sc; + dbi_sec_contrib_list_push_node(&dbi->sec_contrib_list, node); + + // Mod1::fUpdateSecContrib + if (mod->first_sc.base.mod == 0) { + if (flags & COFF_SectionFlag_CNT_CODE) { + mod->first_sc = sc; + } + } + + ProfEnd(); +} + +internal String8 +dbi_module_read_symbol_data(Arena *arena, MSF_Context *msf, PDB_DbiModule *mod) +{ + String8 symbol_data = str8(0,0); + if (mod->sn != MSF_INVALID_STREAM_NUMBER) { + B32 is_seek_ok = msf_stream_seek(msf, mod->sn, 0); + if (is_seek_ok) { + symbol_data = msf_stream_read_block(arena, msf, mod->sn, mod->sym_data_size); + } + } + return symbol_data; +} + +internal String8 +dbi_module_read_c11_data(Arena *arena, MSF_Context *msf, PDB_DbiModule *mod) +{ + String8 c11_data = str8(0,0); + if (mod->sn != MSF_INVALID_STREAM_NUMBER) { + MSF_UInt c11_data_pos = mod->sym_data_size; + B32 is_seek_ok = msf_stream_seek(msf, mod->sn, c11_data_pos); + if (is_seek_ok) { + c11_data = msf_stream_read_block(arena, msf, mod->sn, mod->c13_data_size); + } + } + return c11_data; +} + +internal String8 +dbi_module_read_c13_data(Arena *arena, MSF_Context *msf, PDB_DbiModule *mod) +{ + String8 c13_data = str8(0,0); + if (mod->sn != MSF_INVALID_STREAM_NUMBER) { + MSF_UInt c13_data_pos = mod->sym_data_size + mod->c11_data_size; + B32 
is_seek_ok = msf_stream_seek(msf, mod->sn, c13_data_pos); + if (is_seek_ok) { + c13_data = msf_stream_read_block(arena, msf, mod->sn, mod->c13_data_size); + } + } + return c13_data; +} + +internal void +dbi_push_section(PDB_DbiContext *dbi, COFF_SectionHeader *hdr) +{ + ProfBeginFunction(); + + PDB_DbiSectionNode *n = push_array(dbi->arena, PDB_DbiSectionNode, 1); + n->data = *hdr; + n->next = 0; + SLLQueuePush(dbi->section_list.first, dbi->section_list.last, n); + dbi->section_list.count += 1; + + ProfEnd(); +} + +//////////////////////////////// + +internal MSF_Context * +pdb_alloc_msf(U64 page_size) +{ + ProfBeginFunction(); + MSF_Context *msf = msf_alloc(page_size, MSF_DEFAULT_FPM); + MSF_StreamNumber null_sn = msf_stream_alloc(msf); + MSF_StreamNumber info_sn = msf_stream_alloc(msf); + MSF_StreamNumber tpi_sn = msf_stream_alloc(msf); + MSF_StreamNumber dbi_sn = msf_stream_alloc(msf); + MSF_StreamNumber ipi_sn = msf_stream_alloc(msf); + Assert(null_sn == 0); + Assert(info_sn == PDB_FixedStream_Info); + Assert(dbi_sn == PDB_FixedStream_Dbi); + Assert(tpi_sn == PDB_FixedStream_Tpi); + Assert(ipi_sn == PDB_FixedStream_Ipi); + ProfEnd(); + return msf; +} + +internal PDB_Context * +pdb_alloc(U64 page_size, COFF_MachineType machine, COFF_TimeStamp time_stamp, U32 age, OS_Guid guid) +{ + ProfBeginFunction(); + Arena *arena = arena_alloc(); + PDB_Context *pdb = push_array(arena, PDB_Context, 1); + pdb->arena = arena; + pdb->msf = pdb_alloc_msf(page_size); + pdb->info = pdb_info_alloc(age, time_stamp, guid); + pdb->dbi = dbi_alloc(machine, age); + pdb->gsi = gsi_alloc(); + pdb->psi = psi_alloc(); + pdb->type_servers[CV_TypeIndexSource_NULL] = push_array(arena, PDB_TypeServer, 1); + for (U64 i = CV_TypeIndexSource_NULL + 1; i < ArrayCount(pdb->type_servers); ++i) { + pdb->type_servers[i] = pdb_type_server_alloc(PDB_TYPE_SERVER_HASH_BUCKET_COUNT_CURRENT); + } + ProfEnd(); + return pdb; +} + +internal PDB_Context * +pdb_open(String8 data) +{ + ProfBeginFunction(); + Temp 
scratch = scratch_begin(0, 0); + + PDB_Context *pdb = 0; + + MSF_Context *msf = 0; + MSF_Error msf_err = msf_open(data, &msf); + if (msf_err == MSF_Error_OK) { + Arena *arena = arena_alloc(); + pdb = push_array(arena, PDB_Context, 1); + pdb->arena = arena; + pdb->msf = msf; + pdb->info = pdb_info_open(pdb->msf, PDB_FixedStream_Info); + pdb->dbi = dbi_open(pdb->msf, PDB_FixedStream_Dbi); + if (pdb->dbi) { + MSF_UInt sym_data_size = msf_stream_get_size(pdb->msf, pdb->dbi->symbols_sn); + String8 symbol_data = msf_stream_read_block(scratch.arena, pdb->msf, pdb->dbi->symbols_sn, sym_data_size); + pdb->gsi = gsi_open(pdb->msf, pdb->dbi->globals_sn, symbol_data); + pdb->psi = psi_open(pdb->msf, pdb->dbi->publics_sn, symbol_data); + } + PDB_StringTable *strtab = &pdb->info->strtab; + pdb->type_servers[CV_TypeIndexSource_NULL] = push_array(pdb->arena, PDB_TypeServer, 1); + pdb->type_servers[CV_TypeIndexSource_TPI] = pdb_type_server_open(pdb->msf, PDB_FixedStream_Tpi, strtab); + if (pdb->info->flags & PDB_FeatureFlag_HAS_ID_STREAM) { + pdb->type_servers[CV_TypeIndexSource_IPI] = pdb_type_server_open(pdb->msf, PDB_FixedStream_Ipi, strtab); + } + } + + scratch_end(scratch); + ProfEnd(); + return pdb; +} + +internal void +pdb_release(PDB_Context **pdb_ptr) +{ + ProfBeginFunction(); + PDB_Context *pdb = *pdb_ptr; + msf_release(&pdb->msf); + dbi_release(&pdb->dbi); + gsi_release(&pdb->gsi); + for (U64 i = 1; i < ArrayCount(pdb->type_servers); ++i) { + pdb_type_server_release(&pdb->type_servers[i]); + } + arena_release(pdb->arena); + *pdb_ptr = 0; + ProfEnd(); +} + +internal void +pdb_set_machine(PDB_Context *pdb, COFF_MachineType machine) +{ + pdb->dbi->machine = machine; +} + +internal void +pdb_set_guid(PDB_Context *pdb, OS_Guid guid) +{ + pdb->info->guid = guid; +} + +internal void +pdb_set_time_stamp(PDB_Context *pdb, COFF_TimeStamp time_stamp) +{ + pdb->info->time_stamp = time_stamp; +} + +internal void +pdb_set_age(PDB_Context *pdb, U32 age) +{ + pdb->dbi->age = age; + 
pdb->info->age = age; +} + +internal COFF_MachineType +pdb_get_machine(PDB_Context *pdb) +{ + return pdb->dbi->machine; +} + +internal COFF_TimeStamp +pdb_get_time_stamp(PDB_Context *pdb) +{ + return pdb->info->time_stamp; +} + +internal U32 +pdb_get_age(PDB_Context *pdb) +{ + return pdb->info->age; +} + +internal OS_Guid +pdb_get_guid(PDB_Context *pdb) +{ + return pdb->info->guid; +} + +internal void +pdb_build(TP_Context *tp, TP_Arena *pool_temp, PDB_Context *pdb, CV_StringHashTable string_ht) +{ + ProfBeginFunction(); + + PDB_InfoContext *info = pdb->info; + PDB_StringTable *strtab = &info->strtab; + PDB_DbiContext *dbi = pdb->dbi; + PDB_TypeServer *tpi = pdb->type_servers[CV_TypeIndexSource_TPI]; + PDB_TypeServer *ipi = pdb->type_servers[CV_TypeIndexSource_IPI]; + + if (dbi->globals_sn == MSF_INVALID_STREAM_NUMBER) { + dbi->globals_sn = msf_stream_alloc(pdb->msf); + } + if (dbi->publics_sn == MSF_INVALID_STREAM_NUMBER) { + dbi->publics_sn = msf_stream_alloc(pdb->msf); + } + if (dbi->symbols_sn == MSF_INVALID_STREAM_NUMBER) { + dbi->symbols_sn = msf_stream_alloc(pdb->msf); + } + + pdb_type_server_build(tp, tpi, strtab, pdb->msf, PDB_FixedStream_Tpi); + if (info->flags & PDB_FeatureFlag_HAS_ID_STREAM) { + pdb_type_server_build(tp, ipi, strtab, pdb->msf, PDB_FixedStream_Ipi); + } + + psi_build(tp, pdb->psi, pdb->msf, dbi->publics_sn, dbi->symbols_sn); + gsi_build(tp, pdb->gsi, pdb->msf, dbi->globals_sn, dbi->symbols_sn); + dbi_build(tp, pdb->dbi, pdb->msf, PDB_FixedStream_Dbi, string_ht); + pdb_info_build(pdb->info, pdb->msf, PDB_FixedStream_Info); + + ProfEnd(); +} + +//////////////////////////////// + +internal String8 +pdb_string_from_src_error(PDB_SrcError error) +{ + switch (error) { + case PDB_SrcError_OK: return str8_lit("OK"); + case PDB_SrcError_DUPLICATE_NAME_STREAM: return str8_lit("DUPLICATE_NAME_STREAM"); + case PDB_SrcError_DUPLICATE_ENTRY: return str8_lit("DUPLICATE_ENTRY"); + case PDB_SrcError_UNABLE_TO_WRITE_DATA: return 
str8_lit("UNABLE_TO_WRITE_DATA"); + case PDB_SrcError_UNSUPPORTED_COMPRESSION: return str8_lit("UNSUPPORTED_COMPRESSION"); + case PDB_SrcError_UNKNOWN: return str8_lit("UNKNOWN"); + } + return str8(0,0); +} + +internal String8 +pdb_string_from_open_type_server_error(PDB_OpenTypeServerError error) +{ + switch (error) { + case PDB_OpenTypeServerError_OK: return str8_lit("OK"); + case PDB_OpenTypeServerError_UNKNOWN: return str8_lit("UNKNOWN"); + case PDB_OpenTypeServerError_INVALID_BUCKET_COUNT: return str8_lit("INVALID_BUCKET_COUNT"); + case PDB_OpenTypeServerError_INVALID_TI_RANGE: return str8_lit("INVALID_TI_RANGE"); + case PDB_OpenTypeServerError_UNSUPPORTED_VERSION: return str8_lit("UNSUPPORTED_VERSION"); + } + return str8(0,0); +} + diff --git a/src/linker/pdb_ext/pdb_builder.h b/src/linker/pdb_ext/pdb_builder.h new file mode 100644 index 00000000..bfe996c0 --- /dev/null +++ b/src/linker/pdb_ext/pdb_builder.h @@ -0,0 +1,481 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +//////////////////////////////// + +#define PDB_NATURAL_ALIGN 4 +#define PDB_SYMBOL_ALIGN PDB_NATURAL_ALIGN + +//////////////////////////////// +// Hash table + +#define PDB_HASH_TABLE_PACK_FUNC(name) void name(Arena *arena, String8List *local_data_srl, String8List *key_value_srl, String8 key, String8 value, void *ud) +typedef PDB_HASH_TABLE_PACK_FUNC(PDB_HashTablePackFunc); + +#define PDB_HASH_TABLE_UNPACK_FUNC(name) B32 name(void *ud, String8 local_data, String8 key_value_data, U64 *key_value_cursor, String8 *key_out, String8 *value_out) +typedef PDB_HASH_TABLE_UNPACK_FUNC(PDB_HashTableUnpackFunc); + +typedef struct PDB_HashTableBucket +{ + String8 key; + String8 value; +} PDB_HashTableBucket; + +typedef struct PDB_HashTable +{ + Arena *arena; + PDB_HashTableBucket *bucket_arr; + U32Array present_bits; + U32Array deleted_bits; + U32 max; + U32 count; +} PDB_HashTable; + +typedef enum +{ + 
PDB_HashTableParseError_OK, + PDB_HashTableParseError_OUT_OF_BYTES, + PDB_HashTableParseError_CORRUPTED +} PDB_HashTableParseError; + +//////////////////////////////// +// String Table + +typedef struct PDB_StringTableBucket +{ + String8 data; + PDB_StringOffset offset; + PDB_StringIndex istr; +} PDB_StringTableBucket; + +typedef struct PDB_StringTable +{ + Arena *arena; + U32 version; + U32 size; + U32 bucket_count; + U32 bucket_max; + U32 *ibucket_array; + PDB_StringTableBucket **bucket_array; +} PDB_StringTable; + +typedef enum +{ + PDB_StringTableOpenError_OK, + PDB_StringTableOpenError_BAD_MAGIC, + PDB_StringTableOpenError_UNKNOWN_VERSION, + PDB_StringTableOpenError_CORRUPTED, + PDB_StringTableOpenError_STRING_OFFSET_OUT_OF_BOUNDS, + PDB_StringTableOpenError_OFFSETS_EXCEED_BUCKET_COUNT +} PDB_StringTableOpenError; + +//////////////////////////////// +// Type Server + +#define PDB_TYPE_HINT_STEP 128 +#define PDB_LEAF_ALIGN PDB_NATURAL_ALIGN + +typedef enum +{ + PDB_OpenTypeServerError_OK, + PDB_OpenTypeServerError_UNKNOWN, + PDB_OpenTypeServerError_INVALID_BUCKET_COUNT, + PDB_OpenTypeServerError_INVALID_TI_RANGE, + PDB_OpenTypeServerError_UNSUPPORTED_VERSION, +} PDB_OpenTypeServerError; + +typedef struct PDB_TypeBucket +{ + struct PDB_TypeBucket *next; + String8 raw_leaf; + CV_TypeIndex type_index; +} PDB_TypeBucket; + +typedef struct PDB_TypeServer +{ + Arena *arena; + CV_TypeIndex ti_lo; + String8List leaf_list; + U64 bucket_cap; + PDB_TypeBucket **buckets; + MSF_StreamNumber hash_sn; + PDB_HashTable hash_adj; +} PDB_TypeServer; + +typedef struct PDB_TypeHashStreamInfo +{ + PDB_OffsetSize hash_vals; + PDB_OffsetSize ti_offs; + PDB_OffsetSize hash_adj; +} PDB_TypeHashStreamInfo; + +typedef struct PDB_TypeServerParse +{ + Rng1U64 ti_range; + String8 leaf_data; +} PDB_TypeServerParse; + +typedef struct +{ + CV_DebugT debug_t; + U64 *udt_counts; + U64 *udt_offsets; + Rng1U64 *ranges; + PDB_TypeServer *type_server; + PDB_TypeBucket *udt_buckets; +} 
PDB_PushLeafTask; + +typedef struct +{ + PDB_TypeServer *ts; + U32 *map; +} PDB_WriteTypeToBucketMap; + +typedef struct +{ + U64 align; + CV_TypeIndex ti_lo; + CV_TypeIndex ti_hi; + U64 hint_count; + PDB_TpiOffHint *hint_arr; + String8Node **lf_arr; + Rng1U64 *lf_range_arr; + U64 *lf_cursor_arr; + U8 *lf_buf; + U64 lf_buf_size; +} PDB_WriteTypesTask; + +//////////////////////////////// +// Info + +typedef struct PDB_InfoContext +{ + Arena *arena; + COFF_TimeStamp time_stamp; + U32 age; + OS_Guid guid; + PDB_FeatureFlags flags; + PDB_HashTable named_stream_ht; + PDB_HashTable src_header_block_ht; + PDB_StringTable strtab; +} PDB_InfoContext; + +//////////////////////////////// +// SRC Header Block + +typedef enum +{ + PDB_SrcError_OK, + PDB_SrcError_DUPLICATE_NAME_STREAM, + PDB_SrcError_DUPLICATE_ENTRY, + PDB_SrcError_UNABLE_TO_WRITE_DATA, + PDB_SrcError_UNSUPPORTED_COMPRESSION, + PDB_SrcError_UNKNOWN +} PDB_SrcError; + +//////////////////////////////// +// GSI + +#define PDB_GSI_V70_SYMBOL_ALIGN 4 +#define PDB_GSI_V70_WORD_SIZE 32 +#define PDB_GSI_V70_BUCKET_COUNT 4096 +#define PDB_GSI_V70_BITMAP_COUNT ((PDB_GSI_V70_BUCKET_COUNT / PDB_GSI_V70_WORD_SIZE) + 1) +#define PDB_GSI_V70_BITMAP_SIZE (PDB_GSI_V70_BITMAP_COUNT * sizeof(U32)) + +typedef struct PDB_GsiContext +{ + Arena *arena; + U64 word_size; + U64 symbol_align; + U64 bucket_count; + U64 symbol_count; + CV_SymbolList *bucket_arr; +} PDB_GsiContext; + +typedef struct PDB_GsiSortRecord +{ + ISectOff isect_off; + String8 name; + U64 offset; +} PDB_GsiSortRecord; + +typedef struct PDB_GsiBuildResult +{ + PDB_GsiHeader header; + U64 hash_record_count; + PDB_GsiHashRecord *hash_record_arr; + PDB_GsiSortRecord *sort_record_arr; + U64 bitmap_count; + U32 *bitmap; + U64 compressed_bucket_count; + U32 *compressed_bucket_arr; + U64 total_hash_size; + String8 symbol_data; +} PDB_GsiBuildResult; + +typedef struct PDB_GsiSerializeSymbolsTask +{ + U64 symbol_align; + CV_SymbolList *bucket_arr; + U64 *bucket_size_arr; + U64 
// One module (object file) tracked by the DBI context. Nodes are chained into
// PDB_DbiModuleList by dbi_push_module.
typedef struct PDB_DbiModule
{
  struct PDB_DbiModule *next;             // next module in the list (queue order)
  MSF_StreamNumber      sn;               // module stream; MSF_INVALID_STREAM_NUMBER until one is assigned
  CV_ModIndex           imod;             // module index; dbi_push_module sets it to the list count at push time
  PDB_DbiSectionContrib first_sc;         // a code-section contribution recorded by dbi_module_push_section_contrib
  U64                   sym_data_size;    // byte size of the symbol data at the start of the module stream
  U64                   c11_data_size;    // byte size of the C11 data that follows the symbol data
  U64                   c13_data_size;    // byte size of the C13 data that follows the C11 data
  U64                   globrefs_size;    // TODO: what is this for?
  String8               obj_path;         // copy of the object path passed to dbi_push_module
  String8               lib_path;         // copy of the library path; falls back to obj_path when none was given
  String8List           source_file_list; // NOTE(review): presumably the module's source file names - confirm against dbi_build_file_info
} PDB_DbiModule;
PDB_PsiContext *psi; + PDB_TypeServer *type_servers[CV_TypeIndexSource_COUNT]; +} PDB_Context; + +//////////////////////////////// + +typedef struct +{ + PDB_GsiContext *gsi; + Rng1U64 *ranges; + CV_SymbolNode **symbols; + U32 *hashes; +} GSI_SymbolHasherTask; + +typedef struct +{ + CV_StringHashTable string_ht; + PDB_DbiModule **mod_arr; + U16 *imod_arr; + U16 *source_file_name_count_arr; + U32 **source_file_name_offset_arr; +} PDB_DbiBuildFileInfoTask; + +//////////////////////////////// +// PDB + +internal PDB_Context * pdb_alloc(U64 page_size, COFF_MachineType machine, COFF_TimeStamp time_stamp, U32 age, OS_Guid guid); +internal PDB_Context * pdb_open(String8 data); +internal void pdb_release(PDB_Context **pdb_ptr); +internal void pdb_build(TP_Context *tp, TP_Arena *pool_temp, PDB_Context *pdb, CV_StringHashTable string_ht); +internal void pdb_set_machine(PDB_Context *pdb, COFF_MachineType machine); +internal void pdb_set_guid(PDB_Context *pdb, OS_Guid guid); +internal void pdb_set_time_stamp(PDB_Context *pdb, COFF_TimeStamp time_stamp); +internal void pdb_set_age(PDB_Context *pdb, U32 age); +internal COFF_MachineType pdb_get_machine(PDB_Context *pdb); +internal COFF_TimeStamp pdb_get_time_stamp(PDB_Context *pdb); +internal U32 pdb_get_age(PDB_Context *pdb); +internal OS_Guid pdb_get_guid(PDB_Context *pdb); + +//////////////////////////////// +// Info + +internal PDB_InfoContext * pdb_info_alloc(U32 age, COFF_TimeStamp time_stamp, OS_Guid guid); +internal PDB_InfoContext * pdb_info_open(MSF_Context *msf, MSF_StreamNumber sn); +internal void pdb_info_build(PDB_InfoContext *info, MSF_Context *msf, MSF_StreamNumber sn); +internal void pdb_info_release(PDB_InfoContext **info_ptr); +internal MSF_StreamNumber pdb_push_named_stream(PDB_HashTable *named_stream_ht, MSF_Context *msf, String8 name); +internal MSF_StreamNumber pdb_find_named_stream(PDB_HashTable *named_stream_ht, String8 name); +internal PDB_SrcError pdb_add_src(PDB_InfoContext *info, MSF_Context *msf, 
String8 file_path, String8 file_data, PDB_SrcCompType comp); + +//////////////////////////////// +// GSI + +internal PDB_GsiContext * gsi_alloc(void); +internal PDB_GsiContext * gsi_open(MSF_Context *msf, MSF_StreamNumber sn, String8 symbol_data); +internal void gsi_build(TP_Context *tp, PDB_GsiContext *gsi, MSF_Context *msf, MSF_StreamNumber gsi_sn, MSF_StreamNumber symbols_sn); +internal void gsi_release(PDB_GsiContext **gsi_ptr); +internal void gsi_write_build_result(TP_Context *tp, PDB_GsiBuildResult build, MSF_Context *msf, MSF_StreamNumber sn, MSF_StreamNumber symbols_sn); +internal PDB_GsiBuildResult gsi_build_ex(TP_Context *tp, Arena *arena, PDB_GsiContext *gsi, U64 symbol_data_base, B32 export_symbol_ptr_arr, U64 msf_page_size); +internal U32 gsi_hash(PDB_GsiContext *gsi, String8 input); +internal CV_SymbolNode * gsi_push(PDB_GsiContext *gsi, CV_Symbol *symbol); +internal void gsi_push_many_arr(TP_Context *tp, PDB_GsiContext *gsi, U64 count, CV_SymbolNode **symbol_arr); +internal void gsi_push_many_list(PDB_GsiContext *gsi, U64 count, U32 *hash_arr, CV_SymbolList *list); +internal CV_SymbolNode * gsi_search(PDB_GsiContext *gsi, CV_Symbol *symbol); + +//////////////////////////////// +// PSI + +internal PDB_PsiContext * psi_alloc(void); +internal PDB_PsiContext * psi_open(MSF_Context *msf, MSF_StreamNumber sn, String8 symbol_data); +internal void psi_build(TP_Context *tp, PDB_PsiContext *psi, MSF_Context *msf, MSF_StreamNumber sn, MSF_StreamNumber symbols_sn); +internal void psi_release(PDB_PsiContext **psi_ptr); +internal CV_SymbolNode * psi_push(PDB_PsiContext *psi, CV_Pub32Flags flags, U32 offset, U16 isect, String8 name); + +// TODO: +//internal CV_Symbol psi_neareset_symbol(PDB_PsiContext *psi, U16 isect, U32 off); +//internal void psi_push_thunk_map(PDB_PsiContext *psi, U32 *thunk_map, U32 thunk_count, U32 thunk_size, PDB_SO *sect_map, U32 sect_count, ISectOff thunk_table); + +//////////////////////////////// +// DBI + +internal PDB_DbiContext * 
dbi_alloc(COFF_MachineType machine, U32 age); +internal PDB_DbiContext * dbi_open(MSF_Context *msf, MSF_StreamNumber sn); +internal void dbi_build(TP_Context *tp, PDB_DbiContext *dbi, MSF_Context *msf, MSF_StreamNumber dbi_sn, CV_StringHashTable string_ht); +internal void dbi_release(PDB_DbiContext **dbi_ptr); +internal PDB_DbiModule * dbi_push_module(PDB_DbiContext *dbi, String8 obj_path, String8 lib_path); +internal String8 dbi_module_read_symbol_data(Arena *arena, MSF_Context *msf, PDB_DbiModule *mod); +internal String8 dbi_module_read_c11_data(Arena *arena, MSF_Context *msf, PDB_DbiModule *mod); +internal String8 dbi_module_read_c13_data(Arena *arena, MSF_Context *msf, PDB_DbiModule *mod); +internal void dbi_module_push_section_contrib(PDB_DbiContext *dbi, PDB_DbiModule *mod, ISectOff isect_off, U32 size, U32 data_crc, U32 reloc_crc, COFF_SectionFlags flags); +internal String8List * dbi_open_file_info(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header); +internal PDB_DbiModuleList dbi_open_module_info(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header, String8List *file_info); +internal PDB_DbiSectionContribList dbi_open_sec_contrib(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header); +internal PDB_StringTable dbi_open_ec_names(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header); +internal void dbi_open_dbg_streams(MSF_StreamNumber *dbg_streams, MSF_Context *msf, MSF_StreamNumber sn, PDB_DbiHeader *dbi_header); +internal PDB_DbiSectionList dbi_open_section_headers(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn); +internal void dbi_build_section_header_stream(PDB_DbiContext *dbi, MSF_Context *msf, MSF_StreamNumber sn); + +//////////////////////////////// +// Hash Table + +internal void pdb_hash_table_alloc(PDB_HashTable *ht, U32 max); +internal void pdb_hash_table_release(PDB_HashTable *ht); +internal PDB_HashTableParseError 
pdb_hash_table_from_data(PDB_HashTable *ht, String8 data, B32 has_local_data, PDB_HashTableUnpackFunc *unpack_func, void *unpack_ud, U64 *read_bytes_out); +internal String8 pdb_data_from_hash_table(Arena *arena, PDB_HashTable *ht, B32 has_local_data, PDB_HashTablePackFunc *pack_func, void *pack_ud); +internal void pdb_hash_table_set(PDB_HashTable *ht, String8 key, String8 value); +internal B32 pdb_hash_table_get(PDB_HashTable *ht, String8 key, String8 *value_out); +internal void pdb_hash_table_delete(PDB_HashTable *ht, String8 key); +internal B32 pdb_hash_table_try_set(PDB_HashTable *ht, String8 key, String8 value); +internal B32 pdb_hash_table_is_present(PDB_HashTable *ht, U32 k); +internal B32 pdb_hash_table_is_deleted(PDB_HashTable *ht, U32 k); +internal U32 pdb_hash_table_hash(String8 key); +internal void pdb_hash_table_grow(PDB_HashTable *ht, U64 new_capacity); +internal void pdb_hash_table_get_present_keys_and_values(Arena *arena, PDB_HashTable *ht, String8Array *keys_out, String8Array *values_out); + +//////////////////////////////// + +internal PDB_HashTableParseError pdb_hash_adj_hash_table_from_data(PDB_HashTable *ht, String8 data, PDB_StringTable *strtab, U64 *read_bytes_out); +internal PDB_HashTableParseError pdb_src_header_block_ht_from_data(PDB_HashTable *ht, String8 data, PDB_StringTable *strtab, U64 *read_bytes_out); +internal PDB_HashTableParseError pdb_named_stream_ht_from_data(PDB_HashTable *ht, String8 data, U64 *read_bytes_out); + +internal String8 pdb_data_from_hash_adj_hash_table(Arena *arena, PDB_HashTable *ht, PDB_StringTable *strtab); +internal String8 pdb_data_from_src_header_block_ht(Arena *arena, PDB_HashTable *ht, PDB_StringTable *strtab); +internal String8 pdb_data_from_named_stream_ht(Arena *arena, PDB_HashTable *ht); + +//////////////////////////////// +// String Table + +internal void pdb_strtab_alloc(PDB_StringTable *strtab, U32 max); +internal PDB_StringTableOpenError pdb_strtab_open(PDB_StringTable *strtab, MSF_Context *msf, 
MSF_StreamNumber sn); +internal void pdb_strtab_build(PDB_StringTable *strtab, MSF_Context *msf, MSF_StreamNumber sn); +internal void pdb_strtab_release(PDB_StringTable *strtab); +internal PDB_StringIndex pdb_strtab_add(PDB_StringTable *strtab, String8 string); +internal B32 pdb_strtab_search(PDB_StringTable *strtab, String8 string, PDB_StringIndex *index_out); +internal String8 pdb_strtab_string_from_offset(PDB_StringTable *strtab, PDB_StringOffset offset); +internal PDB_StringOffset pdb_strtab_string_to_offset(PDB_StringTable *strtab, PDB_StringIndex stridx); +internal U32 pdb_strtab_get_serialized_size(PDB_StringTable *strtab); +internal B32 pdb_strtab_try_add(PDB_StringTable *strtab, String8 string, PDB_StringIndex *index_out); +internal void pdb_strtab_grow(PDB_StringTable *strtab, U64 new_max); +internal U32 pdb_strtab_hash(PDB_StringTable *strtab, String8 string); + +//////////////////////////////// +// Type Server + +internal PDB_OpenTypeServerError pdb_type_server_parse_from_data_v80(String8 data, PDB_TypeServerParse *parse_out); +internal PDB_OpenTypeServerError pdb_type_server_parse_from_data(String8 data, PDB_TypeServerParse *parse_out); +internal PDB_TypeServer * pdb_type_server_alloc(U64 bucket_count); +internal PDB_TypeServer * pdb_type_server_open_v80(MSF_Context *msf, MSF_StreamNumber sn, PDB_StringTable *strtab); +internal PDB_TypeServer * pdb_type_server_open(MSF_Context *msf, MSF_StreamNumber sn, PDB_StringTable *strtab); +internal void pdb_type_server_build(TP_Context *tp, PDB_TypeServer *ts, PDB_StringTable *strtab, MSF_Context *msf, MSF_StreamNumber sn); +internal void pdb_type_server_release(PDB_TypeServer **serv_ptr); +internal void pdb_type_server_push(PDB_TypeServer *ts, String8 raw_leaf); +internal void pdb_type_server_push_parallel(TP_Context *tp, PDB_TypeServer *ts, CV_DebugT types); +//internal CV_LeafNode * pdb_type_server_leaf_from_string(PDB_TypeServer *ts, String8 string); +internal String8Node * 
pdb_type_server_reserve(PDB_TypeServer *ts, U64 count); +internal String8Node * pdb_type_server_make_leaf(PDB_TypeServer *ts, CV_LeafKind kind, String8 data); +internal void pdb_type_server_push_bucket(PDB_TypeServer *ts, CV_Leaf *leaf); +internal PDB_TypeHashStreamInfo pdb_type_hash_stream_build(TP_Context *tp, PDB_TypeServer *ts, PDB_StringTable *strtab, MSF_Context *msf, PDB_TpiOffHint *hint_arr, U64 hint_count); + +//////////////////////////////// +// Enum -> String + +internal String8 pdb_string_from_src_error(PDB_SrcError error); +internal String8 pdb_string_from_open_type_server_error(PDB_OpenTypeServerError error); + + diff --git a/src/linker/pdb_ext/pdb_helpers.c b/src/linker/pdb_ext/pdb_helpers.c new file mode 100644 index 00000000..501ded65 --- /dev/null +++ b/src/linker/pdb_ext/pdb_helpers.c @@ -0,0 +1,89 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +internal U64 +pdb_read_bit_vector_string(String8 data, U64 offset, U32Array *bits_out) +{ + U64 cursor = offset; + + U32 word_count = 0; + cursor += str8_deserial_read_struct(data, cursor, &word_count); + + U64 word_data_read_size = word_count * sizeof(U32); + String8 word_data = str8(0,0); + cursor += str8_deserial_read_block(data, cursor, word_data_read_size, &word_data); + + if (word_data.size == word_data_read_size) { + bits_out->count = word_count; + bits_out->v = (U32*)word_data.str; + } else { + bits_out->count = 0; + bits_out->v = 0; + } + + U64 read_size = cursor - offset; + return read_size; +} + +internal U64 +pdb_read_bit_vector_msf(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, U32Array *bits_out) +{ + // peek word count + MSF_UInt pos = msf_stream_get_pos(msf, sn); + U32 word_count = msf_stream_read_u32(msf, sn); + msf_stream_seek(msf, sn, pos); + + // read out header + packed words + U64 buffer_size = sizeof(word_count) + word_count * sizeof(U32); + U8 *buffer = push_array(arena, U8, buffer_size); + MSF_UInt 
read_size = msf_stream_read(msf, sn, buffer, buffer_size); + Assert(read_size == buffer_size); + + // parse words + U64 parse_size = pdb_read_bit_vector_string(str8(buffer, buffer_size), 0, bits_out); + return parse_size; +} + +internal B32 +pdb_write_bit_vector(MSF_Context *msf, MSF_StreamNumber sn, B32 *flag_array, U64 flag_count) +{ + B32 is_write_ok = 0; + + U32 word_size = sizeof(U32); + U32 bits_per_word = MSF_BITS_PER_CHAR * word_size; + U32 word_count = (flag_count + MSF_BITS_PER_CHAR) / MSF_BITS_PER_CHAR; + + is_write_ok = msf_stream_write_struct(msf, sn, &word_count); + if (is_write_ok) { + for (U64 iword = 0, iflag = 0; iword < word_count; ++iword) { + U32 word = 0; + + for (U64 iflag_opl = Min(flag_count, iflag + MSF_BITS_PER_CHAR); iflag < iflag_opl; ++iflag) { + if (flag_array[iflag]) { + word |= 1 << (iflag % bits_per_word); + } + } + + is_write_ok = msf_stream_write_struct(msf, sn, &word); + if (!is_write_ok) { + break; + } + } + } + + return is_write_ok; +} + +internal U64 +pdb_get_bit_vector_size(U32 bucket_count) +{ + U32 word_size = sizeof(U32); + U32 word_count = (bucket_count + MSF_BITS_PER_CHAR) / MSF_BITS_PER_CHAR; + + U64 result = 0; + result += sizeof(word_count); + result += word_size * word_count; + + return result; +} + diff --git a/src/linker/pdb_ext/pdb_helpers.h b/src/linker/pdb_ext/pdb_helpers.h new file mode 100644 index 00000000..315ff665 --- /dev/null +++ b/src/linker/pdb_ext/pdb_helpers.h @@ -0,0 +1,14 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +internal U32 pdb_hash_v1(String8 data); +internal U32 pdb_hash_udt(CV_UDTInfo udt_info, String8 data); + +internal U64 pdb_read_bit_vector_string(String8 data, U64 offset, U32Array *bits_out); +internal U64 pdb_read_bit_vector_msf(Arena *arena, MSF_Context *msf, MSF_StreamNumber sn, U32Array *bits_out); +internal B32 pdb_write_bit_vector(MSF_Context *msf, MSF_StreamNumber sn, B32 *flag_array, U64 
flag_count); +internal U64 pdb_get_bit_vector_size(U32 bucket_count); + + diff --git a/src/linker/rdi/rdi.c b/src/linker/rdi/rdi.c new file mode 100644 index 00000000..dafd09bf --- /dev/null +++ b/src/linker/rdi/rdi.c @@ -0,0 +1,15 @@ +internal String8 +rdi_string_from_name_map_kind(RDI_NameMapKind kind) +{ + switch (kind) { + case RDI_NameMapKind_NULL : return str8_lit("NULL"); + case RDI_NameMapKind_GlobalVariables : return str8_lit("GlobalVariables"); + case RDI_NameMapKind_ThreadVariables : return str8_lit("ThreadVariables"); + case RDI_NameMapKind_Procedures : return str8_lit("Procedures"); + case RDI_NameMapKind_Types : return str8_lit("Types"); + case RDI_NameMapKind_LinkNameProcedures: return str8_lit("LinkNameProcedures"); + case RDI_NameMapKind_NormalSourcePaths : return str8_lit("NormalSourcePaths"); + } + return str8_lit(""); +} + diff --git a/src/linker/rdi/rdi.h b/src/linker/rdi/rdi.h new file mode 100644 index 00000000..b5d7e894 --- /dev/null +++ b/src/linker/rdi/rdi.h @@ -0,0 +1,5 @@ +#pragma once + +internal String8 rdi_string_from_name_map_kind(RDI_NameMapKind kind); + + diff --git a/src/linker/rdi/rdi_builder.c b/src/linker/rdi/rdi_builder.c new file mode 100644 index 00000000..e545a3d5 --- /dev/null +++ b/src/linker/rdi/rdi_builder.c @@ -0,0 +1,5592 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) +// Picks the data model for an (os, arch) pair; the *_type_from_data_model helpers use it to size short/int/long/size_t. Stays RDIB_DataModel_Null for OperatingSystem_Null. +internal RDIB_DataModel +rdib_infer_data_model(OperatingSystem os, RDI_Arch arch) +{ + RDIB_DataModel data_model = RDIB_DataModel_Null; + switch (os) { + case OperatingSystem_Null: break; + case OperatingSystem_Windows: { + switch (arch) { + case RDI_Arch_X86: + case RDI_Arch_X64: + data_model = RDIB_DataModel_LLP64; break; + default: NotImplemented; + } + } break; + case OperatingSystem_Linux: { + switch (arch) { + case RDI_Arch_X86: data_model = RDIB_DataModel_ILP32; break; + case RDI_Arch_X64: data_model = RDIB_DataModel_LP64; break; // System V x86-64 has a 64-bit long, i.e. LP64 (LLP64 is the Windows model) + default: NotImplemented; + } + } break; + 
case OperatingSystem_Mac: { + switch (arch) { + case RDI_Arch_X86: NotImplemented; break; + case RDI_Arch_X64: data_model = RDIB_DataModel_LP64; break; + } + } break; + default: InvalidPath; + } + return data_model; +} + +internal RDI_TypeKind +rdib_short_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_S16; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_S16; + case RDIB_DataModel_LP64 : return RDI_TypeKind_S16; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_S16; + case RDIB_DataModel_SILP64: return RDI_TypeKind_S64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_unsigned_short_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_U16; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_U16; + case RDIB_DataModel_LP64 : return RDI_TypeKind_U16; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_U16; + case RDIB_DataModel_SILP64: return RDI_TypeKind_U64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_int_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_S32; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_S32; + case RDIB_DataModel_LP64 : return RDI_TypeKind_S32; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_S64; + case RDIB_DataModel_SILP64: return RDI_TypeKind_S64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_unsigned_int_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_U32; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_U32; + case RDIB_DataModel_LP64 : return RDI_TypeKind_U32; + case 
RDIB_DataModel_ILP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_SILP64: return RDI_TypeKind_U64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_long_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_S32; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_S32; + case RDIB_DataModel_LP64 : return RDI_TypeKind_S64; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_S64; + case RDIB_DataModel_SILP64: return RDI_TypeKind_S64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_unsigned_long_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_U32; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_U32; + case RDIB_DataModel_LP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_SILP64: return RDI_TypeKind_U64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_long_long_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_S64; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_S64; + case RDIB_DataModel_LP64 : return RDI_TypeKind_S64; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_S64; + case RDIB_DataModel_SILP64: return RDI_TypeKind_S64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_unsigned_long_long_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_U64; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_LP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_U64; + 
case RDIB_DataModel_SILP64: return RDI_TypeKind_U64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +internal RDI_TypeKind +rdib_pointer_size_t_type_from_data_model(RDIB_DataModel data_model) +{ + switch (data_model) { + case RDIB_DataModel_Null : break; + case RDIB_DataModel_ILP32 : return RDI_TypeKind_U32; + case RDIB_DataModel_LLP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_LP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_ILP64 : return RDI_TypeKind_U64; + case RDIB_DataModel_SILP64: return RDI_TypeKind_U64; + default: InvalidPath; + } + return RDI_TypeKind_NULL; +} + +//////////////////////////////// + +internal void +rdib_udt_member_list_push_node(RDIB_UDTMemberList *list, RDIB_UDTMember *node) +{ + SLLQueuePushCount(list, node); +} + +internal void +rdib_udt_member_list_concat_in_place(RDIB_UDTMemberList *list, RDIB_UDTMemberList *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal void +rdib_line_table_push_fragment_node(RDIB_LineTable *list, RDIB_LineTableFragment *n) +{ + SLLQueuePush_N(list->first, list->last, n, next_line_table); + ++list->count; +} + +internal RDIB_LineTableFragment * +rdib_line_table_push(Arena *arena, RDIB_LineTable *list) +{ + RDIB_LineTableFragment *n = push_array(arena, RDIB_LineTableFragment, 1); + rdib_line_table_push_fragment_node(list, n); + return n; +} + +//////////////////////////////// + +internal RDIB_LineTableFragment * +rdib_line_table_fragment_chunk_list_push(Arena *arena, RDIB_LineTableFragmentChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_LineTableFragment); + return SLLChunkListLastItem(list); +} + +internal RDIB_Unit * +rdib_unit_chunk_list_push(Arena *arena, RDIB_UnitChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_Unit); + return SLLChunkListLastItem(list); +} + +internal RDIB_Scope * +rdib_scope_chunk_list_push(Arena *arena, RDIB_ScopeChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_Scope); + return 
SLLChunkListLastItem(list); +} + +internal RDIB_Procedure * +rdib_procedure_chunk_list_push(Arena *arena, RDIB_ProcedureChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_Procedure); + return SLLChunkListLastItem(list); +} + +internal RDIB_Variable * +rdib_variable_chunk_list_push(Arena *arena, RDIB_VariableChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_Variable); + return SLLChunkListLastItem(list); +} + +internal RDIB_LineTable * +rdib_line_table_chunk_list_push(Arena *arena, RDIB_LineTableChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_LineTable); + return SLLChunkListLastItem(list); +} + +internal RDIB_Type * +rdib_type_chunk_list_push(Arena *arena, RDIB_TypeChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_Type); + RDIB_Type *type = SLLChunkListLastItem(list); + type->final_idx = 0; + return type; +} + +internal RDIB_UDTMember * +rdib_udt_member_chunk_list_push(Arena *arena, RDIB_UDTMemberChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_UDTMember); + return SLLChunkListLastItem(list); +} + +internal RDIB_SourceFile * +rdib_source_file_chunk_list_push(Arena *arena, RDIB_SourceFileChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_SourceFile); + return SLLChunkListLastItem(list); +} + +internal RDIB_InlineSite * +rdib_inline_site_chunk_list_push(Arena *arena, RDIB_InlineSiteChunkList *list, U64 cap) +{ + SLLChunkListPush(arena, list, cap, RDIB_InlineSite); + return SLLChunkListLastItem(list); +} + +internal RDIB_Unit * +rdib_unit_chunk_list_push_zero(Arena *arena, RDIB_UnitChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_Unit); + return SLLChunkListLastItem(list); +} + +internal RDIB_Scope * +rdib_scope_chunk_list_push_zero(Arena *arena, RDIB_ScopeChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_Scope); + return SLLChunkListLastItem(list); +} + +internal RDIB_Procedure * 
+rdib_procedure_chunk_list_push_zero(Arena *arena, RDIB_ProcedureChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_Procedure); + return SLLChunkListLastItem(list); +} + +internal RDIB_Variable * +rdib_variable_chunk_list_push_zero(Arena *arena, RDIB_VariableChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_Variable); + return SLLChunkListLastItem(list); +} + +internal RDIB_LineTable * +rdib_line_table_chunk_list_push_zero(Arena *arena, RDIB_LineTableChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_LineTable); + return SLLChunkListLastItem(list); +} + +internal RDIB_Type * +rdib_type_chunk_list_push_zero(Arena *arena, RDIB_TypeChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_Type); + return SLLChunkListLastItem(list); +} + +internal RDIB_UDTMember * +rdib_udt_member_chunk_list_push_zero(Arena *arena, RDIB_UDTMemberChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_UDTMember); + return SLLChunkListLastItem(list); +} + +internal RDIB_SourceFile * +rdib_source_file_chunk_list_push_zero(Arena *arena, RDIB_SourceFileChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_SourceFile); + return SLLChunkListLastItem(list); +} + +internal RDIB_InlineSite * +rdib_inline_site_chunk_list_push_zero(Arena *arena, RDIB_InlineSiteChunkList *list, U64 cap) +{ + SLLChunkListPushZero(arena, list, cap, RDIB_InlineSite); + return SLLChunkListLastItem(list); +} + +internal RDIB_UnitChunk * +rdib_unit_chunk_list_reserve_ex(Arena *arena, RDIB_UnitChunkList *list, U64 count_per_chunk, U64 item_count) +{ + U64 chunk_count = CeilIntegerDiv(item_count, count_per_chunk); + RDIB_UnitChunk *chunks = push_array(arena, RDIB_UnitChunk, chunk_count); + U64 base = list->last ? 
list->last->base : 0; + + for (U64 i = 0; i+1 < chunk_count; i += 1, item_count -= count_per_chunk, base += count_per_chunk) { + chunks[i].base = base; + chunks[i].count = count_per_chunk; + chunks[i].cap = count_per_chunk; + chunks[i].v = push_array(arena, RDIB_Unit, count_per_chunk); + SLLQueuePush(list->first, list->last, &chunks[i]); + + for (U64 k = 0; k < count_per_chunk; ++k) { + chunks[i].v[k].chunk = &chunks[i]; + } + } + + chunks[chunk_count-1].base = base; + chunks[chunk_count-1].count = item_count; + chunks[chunk_count-1].cap = item_count; + chunks[chunk_count-1].v = push_array(arena, RDIB_Unit, item_count); + for (U64 k = 0; k < item_count; ++k) { + chunks[chunk_count-1].v[k].chunk = &chunks[chunk_count-1]; + } + + SLLQueuePush(list->first, list->last, &chunks[chunk_count-1]); + list->count += chunk_count; + + return chunks; +} + +internal void +rdib_unit_chunk_list_reserve(Arena *arena, RDIB_UnitChunkList *list, U64 cap) +{ + // fill out node + RDIB_UnitChunk *chunk = push_array(arena, RDIB_UnitChunk, 1); + chunk->cap = cap; + chunk->v = push_array(arena, RDIB_Unit, cap); + + // push node to list + SLLQueuePush(list->first, list->last, chunk); + list->count += 1; +} + +internal void +rdib_type_chunk_list_reserve(Arena *arena, RDIB_TypeChunkList *list, U64 cap) +{ + // fill out node + RDIB_TypeChunk *chunk = push_array(arena, RDIB_TypeChunk, 1); + chunk->cap = cap; + chunk->v = push_array(arena, RDIB_Type, cap); + + // push node to list + SLLQueuePush(list->first, list->last, chunk); + list->count += 1; +} + +internal void +rdib_source_file_list_reserve(Arena *arena, RDIB_SourceFileChunkList *list, U64 cap) +{ + // fill out node + RDIB_SourceFileChunk *chunk = push_array(arena, RDIB_SourceFileChunk, 1); + chunk->cap = cap; + chunk->v = push_array(arena, RDIB_SourceFile, cap); + + // push node to list + SLLQueuePush(list->first, list->last, chunk); + list->count += 1; +} + +internal void +rdib_unit_chunk_list_concat_in_place(RDIB_UnitChunkList *list, 
RDIB_UnitChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_UnitChunk); +} + +internal void +rdib_scope_chunk_list_concat_in_place(RDIB_ScopeChunkList *list, RDIB_ScopeChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_ScopeChunk); +} + +internal void +rdib_udt_member_chunk_list_concat_in_place(RDIB_UDTMemberChunkList *list, RDIB_UDTMemberChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_UDTMemberChunk); +} + +internal void +rdib_procedure_chunk_list_concat_in_place(RDIB_ProcedureChunkList *list, RDIB_ProcedureChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_ProcedureChunk); +} + +internal void +rdib_variable_chunk_list_concat_in_place(RDIB_VariableChunkList *list, RDIB_VariableChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_VariableChunk); +} + +internal void +rdib_line_table_chunk_list_concat_in_place(RDIB_LineTableChunkList *list, RDIB_LineTableChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_LineTableChunk); +} + +internal void +rdib_inline_site_chunk_list_concat_in_place(RDIB_InlineSiteChunkList *list, RDIB_InlineSiteChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_InlineSiteChunk); +} + +internal void +rdib_type_chunk_list_concat_in_place(RDIB_TypeChunkList *list, RDIB_TypeChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_TypeChunk); +} + +internal void +rdib_source_file_chunk_list_concat_in_place(RDIB_SourceFileChunkList *list, RDIB_SourceFileChunkList *to_concat) +{ + SLLConcatInPlaceChunkList(list, to_concat, RDIB_SourceFileChunk); +} + +internal void +rdib_line_table_chunk_list_concat_in_place_many(RDIB_LineTableChunkList *list, RDIB_LineTableChunkList *to_concat, U64 count) +{ + SLLConcatInPlaceChunkListArray(list, to_concat, RDIB_LineTableChunk, count); +} + +internal void +rdib_scope_chunk_list_concat_in_place_many(RDIB_ScopeChunkList *list, 
RDIB_ScopeChunkList *to_concat, U64 count) +{ + SLLConcatInPlaceChunkListArray(list, to_concat, RDIB_ScopeChunk, count); +} + +internal void +rdib_variable_chunk_list_concat_in_place_many(RDIB_VariableChunkList *list, RDIB_VariableChunkList *to_concat, U64 count) +{ + SLLConcatInPlaceChunkListArray(list, to_concat, RDIB_VariableChunk, count); +} + +internal void +rdib_procedure_chunk_list_concat_in_place_many(RDIB_ProcedureChunkList *list, RDIB_ProcedureChunkList *to_concat, U64 count) +{ + SLLConcatInPlaceChunkListArray(list, to_concat, RDIB_ProcedureChunk, count); +} + +internal void +rdib_inline_site_chunk_list_concat_in_place_many(RDIB_InlineSiteChunkList *list, RDIB_InlineSiteChunkList *to_concat, U64 count) +{ + SLLConcatInPlaceChunkListArray(list, to_concat, RDIB_InlineSiteChunk, count); +} + +internal void +rdib_type_chunk_list_concat_in_place_many(RDIB_TypeChunkList *list, RDIB_TypeChunkList *to_concat, U64 count) +{ + SLLConcatInPlaceChunkListArray(list, to_concat, RDIB_TypeChunk, count); +} + +internal void +rdib_udt_member_chunk_list_concat_in_place_many(RDIB_UDTMemberChunkList *list, RDIB_UDTMemberChunkList *to_concat, U64 count) +{ + SLLConcatInPlaceChunkListArray(list, to_concat, RDIB_UDTMemberChunk, count); +} + +internal RDIB_UnitChunk ** +rdib_array_from_unit_chunk_list(Arena *arena, RDIB_UnitChunkList list) +{ + ProfBeginFunction(); + RDIB_UnitChunk **result = push_array_no_zero(arena, RDIB_UnitChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_UnitChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + } + ProfEnd(); + return result; +} + +internal RDIB_ScopeChunk ** +rdib_array_from_scope_chunk_list(Arena *arena, RDIB_ScopeChunkList list) +{ + ProfBeginFunction(); + RDIB_ScopeChunk **result = push_array_no_zero(arena, RDIB_ScopeChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_ScopeChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + 
} + ProfEnd(); + return result; +} + +internal RDIB_VariableChunk ** +rdib_array_from_variable_chunk_list(Arena *arena, RDIB_VariableChunkList list) +{ + ProfBeginFunction(); + RDIB_VariableChunk **result = push_array_no_zero(arena, RDIB_VariableChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_VariableChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + } + ProfEnd(); + return result; +} + +internal RDIB_LineTableChunk ** +rdib_array_from_line_table_chunk_list(Arena *arena, RDIB_LineTableChunkList list) +{ + ProfBeginFunction(); + RDIB_LineTableChunk **result = push_array_no_zero(arena, RDIB_LineTableChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_LineTableChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + } + ProfEnd(); + return result; +} + +internal RDIB_ProcedureChunk ** +rdib_array_from_procedure_chunk_list(Arena *arena, RDIB_ProcedureChunkList list) +{ + ProfBeginFunction(); + RDIB_ProcedureChunk **result = push_array_no_zero(arena, RDIB_ProcedureChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_ProcedureChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + } + ProfEnd(); + return result; +} + +internal RDIB_InlineSiteChunk ** +rdib_array_from_inline_site_chunk_list(Arena *arena, RDIB_InlineSiteChunkList list) +{ + ProfBeginFunction(); + RDIB_InlineSiteChunk **result = push_array_no_zero(arena, RDIB_InlineSiteChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_InlineSiteChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + } + ProfEnd(); + return result; +} + +internal RDIB_UDTMemberChunk ** +rdib_array_from_udt_member_chunk_list(Arena *arena, RDIB_UDTMemberChunkList list) +{ + ProfBeginFunction(); + RDIB_UDTMemberChunk **result = push_array_no_zero(arena, RDIB_UDTMemberChunk *, list.count); + U64 chunk_idx = 0; + for 
(RDIB_UDTMemberChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + } + ProfEnd(); + return result; +} + +internal RDIB_TypeChunk ** +rdib_array_from_type_chunk_list(Arena *arena, RDIB_TypeChunkList list) +{ + ProfBeginFunction(); + RDIB_TypeChunk **result = push_array_no_zero(arena, RDIB_TypeChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_TypeChunk *chunk = list.first; chunk != 0; chunk = chunk->next, ++chunk_idx) { + result[chunk_idx] = chunk; + } + ProfEnd(); + return result; +} + +internal RDIB_SourceFileChunk ** +rdib_array_from_source_file_chunk_list(Arena *arena, RDIB_SourceFileChunkList list) +{ + ProfBeginFunction(); + RDIB_SourceFileChunk **result = push_array_no_zero(arena, RDIB_SourceFileChunk *, list.count); + U64 chunk_idx = 0; + for (RDIB_SourceFileChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + result[chunk_idx++] = chunk; + } + ProfEnd(); + return result; +} + +internal U64 +rdib_unit_chunk_list_total_count(RDIB_UnitChunkList list) +{ + U64 total_count = 0; + for (RDIB_UnitChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 +rdib_scope_chunk_list_total_count(RDIB_ScopeChunkList list) +{ + U64 total_count = 0; + for (RDIB_ScopeChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 +rdib_variable_chunk_list_total_count(RDIB_VariableChunkList list) +{ + U64 total_count = 0; + for (RDIB_VariableChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 +rdib_line_table_chunk_list_total_count(RDIB_LineTableChunkList list) +{ + U64 total_count = 0; + for (RDIB_LineTableChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 
+rdib_procedure_chunk_list_total_count(RDIB_ProcedureChunkList list) +{ + U64 total_count = 0; + for (RDIB_ProcedureChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 +rdib_inline_site_chunk_list_total_count(RDIB_InlineSiteChunkList list) +{ + U64 total_count = 0; + for (RDIB_InlineSiteChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 +rdib_udt_member_chunk_list_total_count(RDIB_UDTMemberChunkList list) +{ + U64 total_count = 0; + for (RDIB_UDTMemberChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 +rdib_type_chunk_list_total_count(RDIB_TypeChunkList list) +{ + U64 total_count = 0; + for (RDIB_TypeChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U64 +rdib_source_file_chunk_list_total_count(RDIB_SourceFileChunkList list) +{ + U64 total_count = 0; + for (RDIB_SourceFileChunk *chunk = list.first; chunk != 0; chunk = chunk->next) { + total_count += chunk->count; + } + return total_count; +} + +internal U32 +rdib_idx_from_unit(RDIB_Unit *n) +{ + U32 idx = 0; + if (n) { + Assert(n->chunk->v <= n && n < (n->chunk->v + n->chunk->count)); + idx = safe_cast_u32(n->chunk->base + (n - n->chunk->v)); + Assert(idx - n->chunk->base < n->chunk->count); + } + return idx; +} + +internal U32 +rdib_idx_from_scope(RDIB_Scope *n) +{ + U32 idx = 0; + if (n) { + Assert(n->chunk->v <= n && n < (n->chunk->v + n->chunk->count)); + idx = safe_cast_u32(n->chunk->base + (n - n->chunk->v)); + Assert(idx - n->chunk->base < n->chunk->count); + } + return idx; +} + +internal U32 +rdib_idx_from_inline_site(RDIB_InlineSite *n) +{ + U32 idx = 0; + if (n) { + Assert(n->chunk->v <= n && n < (n->chunk->v + n->chunk->count)); + idx = safe_cast_u32(n->chunk->base + 
(n - n->chunk->v)); + Assert(idx - n->chunk->base < n->chunk->count); + } + return idx; +} + +internal U32 +rdib_idx_from_variable(RDIB_Variable *n) +{ + U32 idx = 0; + if (n) { + Assert(n->chunk->v <= n && n < (n->chunk->v + n->chunk->count)); + idx = safe_cast_u32(n->chunk->base + (n - n->chunk->v)); + Assert(idx - n->chunk->base < n->chunk->count); + } + return idx; +} + +internal U32 +rdib_idx_from_procedure(RDIB_Procedure *n) +{ + U32 idx = 0; + if (n) { + Assert(n->chunk->v <= n && n < (n->chunk->v + n->chunk->count)); + idx = safe_cast_u32(n->chunk->base + (n - n->chunk->v)); + Assert(idx - n->chunk->base < n->chunk->count); + } + return idx; +} + +internal U32 +rdib_idx_from_source_file(RDIB_SourceFile *n) +{ + U32 idx = 0; + if (n) { + Assert(n->chunk->v <= n && n < (n->chunk->v + n->chunk->count)); + idx = safe_cast_u32(n->chunk->base + (n - n->chunk->v)); + Assert(idx - n->chunk->base < n->chunk->count); + } + return idx; +} + +internal U32 +rdib_idx_from_line_table(RDIB_LineTable *n) +{ + U32 idx = 0; + if (n) { + Assert(n->chunk->v <= n && n < (n->chunk->v + n->chunk->count)); + idx = safe_cast_u32(n->chunk->base + (n - n->chunk->v)); + Assert(idx - n->chunk->base < n->chunk->count); + } + return idx; +} + +internal U32 +rdib_idx_from_type(RDIB_Type *n) +{ + U32 idx = 0; + if (n) { + idx = safe_cast_u32(n->final_idx); + } + return idx; +} + +internal U32 +rdib_idx_from_udt_type(RDIB_Type *n) +{ + U32 idx = 0; + if (n && RDI_IsUserDefinedType(n->kind)) { + idx = safe_cast_u32(n->udt.udt_idx); + } + return idx; +} + +//////////////////////////////// +// Source File + +internal B32 +rdib_source_file_match(RDIB_SourceFile *a, RDIB_SourceFile *b, OperatingSystem os) +{ + StringMatchFlags match_flags = path_match_flags_from_os(os); + if (str8_match(a->normal_full_path, b->normal_full_path, match_flags)) { + if (a->checksum_kind == b->checksum_kind) { + if (str8_match(a->checksum, b->checksum, 0)) { + return 1; + } + } + } + return 0; +} + 
+//////////////////////////////// +// Eval Ops + +internal RDIB_EvalBytecodeOp * +rdib_bytecode_push_op(Arena *arena, RDIB_EvalBytecode *bytecode, RDI_EvalOp op, RDI_U64 p) +{ + RDIB_EvalBytecodeOp *node = push_array(arena, RDIB_EvalBytecodeOp, 1); + node->op = op; + node->p_size = RDI_DECODEN_FROM_CTRLBITS(rdi_eval_op_ctrlbits_table[op]); + node->p = p; + + SLLQueuePush(bytecode->first, bytecode->last, node); + bytecode->count += 1; + bytecode->size += 1 + node->p_size; + + return node; +} + +internal void +rdib_bytecode_push_ucsont(Arena *arena, RDIB_EvalBytecode *bytecode, RDI_U64 uconst) +{ + if (uconst <= max_U8) { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU8, uconst); + } else if (uconst <= max_U16) { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU16, uconst); + } else if (uconst <= max_U32) { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU32, uconst); + } else { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU64, uconst); + } +} +// Appends ops that push the signed constant sconst using the narrowest Const op that holds it; every encoding narrower than 64 bits is followed by TruncSigned so negative values are sign-extended. +internal void +rdib_bytecode_push_sconst(Arena *arena, RDIB_EvalBytecode *bytecode, RDI_S64 sconst) +{ + if (min_S8 <= sconst && sconst <= max_S8) { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU8, (RDI_U64)sconst); + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_TruncSigned, 8); + } else if (min_S16 <= sconst && sconst <= max_S16) { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU16, (RDI_U64)sconst); + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_TruncSigned, 16); + } else if (min_S32 <= sconst && sconst <= max_S32) { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU32, (RDI_U64)sconst); + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_TruncSigned, 32); // same sign-extension step as the 8/16-bit branches; without it negatives in this range stay zero-extended + } else { + rdib_bytecode_push_op(arena, bytecode, RDI_EvalOp_ConstU64, (RDI_U64)sconst); + } +} +//////////////////////////////// +// Location + +internal RDIB_Location +rdib_make_location_addr_byte_stream(Rng1U64List ranges, RDIB_EvalBytecode bytecode) +{ + RDIB_Location loc = {0}; + loc.ranges = ranges; + loc.kind = RDI_LocationKind_AddrBytecodeStream; + 
loc.bytecode = bytecode; + return loc; +} + +internal RDIB_Location +rdib_make_location_addr_bytecode_stream(Rng1U64List ranges, RDIB_EvalBytecode bytecode) +{ + RDIB_Location loc = {0}; + loc.ranges = ranges; + loc.kind = RDI_LocationKind_AddrBytecodeStream; + loc.bytecode = bytecode; + return loc; +} + +internal RDIB_Location +rdib_make_location_val_bytecode_stream(Rng1U64List ranges, RDIB_EvalBytecode bytecode) +{ + RDIB_Location loc = {0}; + loc.ranges = ranges; + loc.kind = RDI_LocationKind_ValBytecodeStream; + loc.bytecode = bytecode; + return loc; +} + +internal RDIB_Location +rdib_make_location_addr_reg_plus_u16(Rng1U64List ranges, RDI_RegCode reg_code, RDI_U16 offset) +{ + RDIB_Location loc = {0}; + loc.ranges = ranges; + loc.kind = RDI_LocationKind_AddrRegPlusU16; + loc.reg_code = reg_code; + loc.offset = offset; + return loc; +} + +internal RDIB_Location +rdib_make_location_addr_addr_reg_plus_u16(Rng1U64List ranges, RDI_RegCode reg_code, RDI_U16 offset) +{ + RDIB_Location loc = {0}; + loc.kind = RDI_LocationKind_AddrAddrRegPlusU16; + loc.ranges = ranges; + loc.reg_code = reg_code; + loc.offset = offset; + return loc; +} + +internal RDIB_Location +rdib_make_location_val_reg(Rng1U64List ranges, RDI_RegCode reg_code) +{ + RDIB_Location loc = {0}; + loc.kind = RDI_LocationKind_ValReg; + loc.ranges = ranges; + loc.reg_code = reg_code; + return loc; +} + +internal RDIB_LocationNode * +rdib_location_list_push(Arena *arena, RDIB_LocationList *list, RDIB_Location v) +{ + RDIB_LocationNode *node = push_array(arena, RDIB_LocationNode, 1); + node->v = v; + SLLQueuePush(list->first, list->last, node); + ++list->count; + return node; +} + +internal RDIB_LocationNode * +rdib_push_location_addr_reg_off(Arena *arena, RDIB_LocationList *list, RDI_Arch arch, RDI_RegCode reg_code, U32 reg_byte_size, U32 reg_byte_pos, S64 offset, B32 is_reference, Rng1U64List ranges) +{ + RDIB_Location loc; + + if (0 <= offset && offset <= (S64)max_U16) { + if (is_reference) { + loc = 
rdib_make_location_addr_addr_reg_plus_u16(ranges, reg_code, (U16)offset); + } else { + loc = rdib_make_location_addr_reg_plus_u16(ranges, reg_code, (U16)offset); + } + } + + // long offset, emit byte code + else { + RDIB_EvalBytecode bytecode = {0}; + U32 reg_read_param = RDI_EncodeRegReadParam(reg_code, reg_byte_size, reg_byte_pos); + rdib_bytecode_push_op(arena, &bytecode, RDI_EvalOp_RegRead, reg_read_param); + rdib_bytecode_push_sconst(arena, &bytecode, offset); + rdib_bytecode_push_op(arena, &bytecode, RDI_EvalOp_Add, 0); + + if (is_reference) { + U64 addr_size = rdi_addr_size_from_arch(arch); + rdib_bytecode_push_op(arena, &bytecode, RDI_EvalOp_MemRead, addr_size); + } + + loc = rdib_make_location_addr_bytecode_stream(ranges, bytecode); + } + + RDIB_LocationNode *node = rdib_location_list_push(arena, list, loc); + return node; +} + +internal void +rdib_variable_list_push_node(RDIB_VariableList *list, RDIB_VariableNode *node) +{ + SLLQueuePush(list->first, list->last, node); + ++list->count; +} + +internal RDIB_VariableNode * +rdib_variable_list_push(Arena *arena, RDIB_VariableList *list) +{ + RDIB_VariableNode *node = push_array(arena, RDIB_VariableNode, 1); + rdib_variable_list_push_node(list, node); + return node; +} + +//////////////////////////////// +// Types + +internal U64 +rdib_size_from_type(RDIB_Type *type) +{ + if (type) { + switch (type->kind) { + case RDI_TypeKind_Void: + case RDI_TypeKind_Char8: + case RDI_TypeKind_Char16: + case RDI_TypeKind_Char32: + case RDI_TypeKind_UChar8: + case RDI_TypeKind_UChar16: + case RDI_TypeKind_UChar32: + case RDI_TypeKind_U8: + case RDI_TypeKind_U16: + case RDI_TypeKind_U32: + case RDI_TypeKind_U64: + case RDI_TypeKind_U128: + case RDI_TypeKind_U256: + case RDI_TypeKind_U512: + case RDI_TypeKind_S8: + case RDI_TypeKind_S16: + case RDI_TypeKind_S32: + case RDI_TypeKind_S64: + case RDI_TypeKind_S128: + case RDI_TypeKind_S256: + case RDI_TypeKind_S512: + case RDI_TypeKind_Bool: + case RDI_TypeKind_F16: + case 
RDI_TypeKind_F32: + case RDI_TypeKind_F32PP: + case RDI_TypeKind_F48: + case RDI_TypeKind_F64: + case RDI_TypeKind_F80: + case RDI_TypeKind_F128: + case RDI_TypeKind_ComplexF32: + case RDI_TypeKind_ComplexF64: + case RDI_TypeKind_ComplexF80: + case RDI_TypeKind_ComplexF128: + case RDI_TypeKind_Handle: + return type->builtin.size; + case RDI_TypeKind_Modifier: + return rdib_size_from_type((RDIB_Type *)type->modifier.type_ref); + case RDI_TypeKind_Ptr: + case RDI_TypeKind_LRef: + case RDI_TypeKind_RRef: + return type->ptr.size; + case RDI_TypeKind_Array: + return type->array.size; + + case RDI_TypeKind_Function: + case RDI_TypeKind_Method: + case RDI_TypeKindExt_StaticMethod: { + Assert(!"check"); + return 0; + } + case RDI_TypeKind_Struct: + case RDI_TypeKind_Class: + case RDI_TypeKind_IncompleteStruct: + case RDI_TypeKind_IncompleteClass: + return type->udt.struct_type.size; + + case RDI_TypeKind_Union: + case RDI_TypeKind_IncompleteUnion: + return type->udt.union_type.size; + + case RDI_TypeKind_Alias: + Assert(!"check"); + + case RDI_TypeKind_Enum: + case RDI_TypeKind_IncompleteEnum: + return rdib_size_from_type(type->udt.enum_type.base_type); + + case RDI_TypeKind_MemberPtr: + case RDI_TypeKind_Bitfield: + case RDI_TypeKind_Variadic: + case RDI_TypeKindExt_Members: + case RDI_TypeKindExt_Params: + InvalidPath; // no size + } + } + return 0; +} + +internal RDIB_TypeRef +rdib_make_type_ref(Arena *arena, RDIB_Type *type) +{ + RDIB_Type **ref = push_array(arena, RDIB_Type *, 1); + ref[0] = type; + return ref; +} + +internal void +rdib_deref_type_refs(TP_Context *tp, RDIB_TypeChunkList *list) +{ + for (RDIB_TypeChunk *chunk = list->first; chunk != 0; chunk = chunk->next) { + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Type *type = &chunk->v[i]; + if (type->kind == RDI_TypeKind_Struct || type->kind == RDI_TypeKind_Class || + type->kind == RDI_TypeKind_IncompleteStruct || type->kind == RDI_TypeKind_IncompleteClass) { + type->udt.members = *(RDIB_Type 
**)type->udt.members; + type->udt.struct_type.derived = *(RDIB_Type **)type->udt.struct_type.derived; + type->udt.struct_type.vtshape = *(RDIB_Type **)type->udt.struct_type.vtshape; + } else if (type->kind == RDI_TypeKind_Enum || type->kind == RDI_TypeKind_IncompleteEnum) { + type->udt.members = *(RDIB_Type **)type->udt.members; + type->udt.enum_type.base_type = *(RDIB_Type **)type->udt.enum_type.base_type; + } else if (type->kind == RDI_TypeKind_Union || type->kind == RDI_TypeKind_IncompleteUnion) { + type->udt.members = *(RDIB_Type **)type->udt.members; + } else if (type->kind == RDI_TypeKind_Array) { + type->array.entry_type = *(RDIB_Type **)type->array.entry_type; + } else if (type->kind == RDI_TypeKind_Function) { + type->func.return_type = *(RDIB_Type **)type->func.return_type; + type->func.params_type = *(RDIB_Type **)type->func.params_type; + } else if (type->kind == RDI_TypeKind_Method) { + type->method.class_type = *(RDIB_Type **)type->method.class_type; + type->method.this_type = *(RDIB_Type **)type->method.this_type; + type->method.return_type = *(RDIB_Type **)type->method.return_type; + type->method.params_type = *(RDIB_Type **)type->method.params_type; + } else if (type->kind == RDI_TypeKindExt_StaticMethod) { + type->static_method.class_type = *(RDIB_Type **)type->static_method.class_type; + type->static_method.return_type = *(RDIB_Type **)type->static_method.return_type; + type->static_method.params_type = *(RDIB_Type **)type->static_method.params_type; + } else if (type->kind == RDI_TypeKind_Ptr || type->kind == RDI_TypeKind_LRef || type->kind == RDI_TypeKind_RRef) { + type->ptr.type_ref = *(RDIB_Type **)type->ptr.type_ref; + } else if (type->kind == RDI_TypeKind_Modifier) { + type->modifier.type_ref = *(RDIB_Type **)type->modifier.type_ref; + } else if (type->kind == RDI_TypeKind_Bitfield) { + type->bitfield.value_type = *(RDIB_Type **)type->bitfield.value_type; + } else if (type->kind == RDI_TypeKindExt_Params) { + for (U64 i = 0; i < 
type->params.count; ++i) { + type->params.types[i] = *(RDIB_Type **)type->params.types[i]; + } + } else if (type->kind == RDI_TypeKindExt_Members) { + for (RDIB_UDTMember *member = type->members.list.first; member != 0; member = member->next) { + switch (member->kind) { + case RDI_MemberKind_NULL: break; + case RDI_MemberKind_DataField: { + member->data_field.type_ref = *(RDIB_Type **)member->data_field.type_ref; + } break; + case RDI_MemberKind_StaticData: { + member->static_data.type_ref = *(RDIB_Type **)member->static_data.type_ref; + } break; + case RDI_MemberKind_Method: { + member->method.type_ref = *(RDIB_Type **)member->method.type_ref; + } break; + case RDI_MemberKind_NestedType: { + member->nested_type.type_ref = *(RDIB_Type **)member->nested_type.type_ref; + } break; + case RDI_MemberKind_Base: { + member->base_class.type_ref = *(RDIB_Type **)member->base_class.type_ref; + } break; + case RDI_MemberKind_VirtualBase: { + member->virtual_base_class.type_ref = *(RDIB_Type **)member->virtual_base_class.type_ref; + } break; + case RDI_MemberKindExt_MemberListPointer: { + member->member_list_pointer = *(RDIB_Type **)member->member_list_pointer; + } break; +#if 0 + case RDI_MemberKind_Enumerate: { + // no types + } break; +#endif + default: InvalidPath; + } + } + } + } + } +}
+
+// Returns the byte size of `type` after skipping any chain of Modifier types.
+// Built-ins, pointers/references, arrays, structs/classes and unions report
+// their stored size; enums and bitfields report the size of the type they
+// wrap. Any other kind asserts and yields 0. `type` must not be null.
+internal U64
+rdib_sizeof_type(RDIB_Type *type)
+{
+  U64 size = 0;
+
+  // modifiers do not carry a size of their own, look through them
+  RDIB_Type *curr = type;
+  while (curr->kind == RDI_TypeKind_Modifier) {
+    curr = curr->modifier.type_ref;
+  }
+
+  // The built-in range is inclusive on both ends (rdib_type_nodes_task tests it
+  // with <=); the previous `<` left out the last built-in kind.
+  if (RDI_TypeKind_FirstBuiltIn <= curr->kind && curr->kind <= RDI_TypeKind_LastBuiltIn) {
+    size = curr->builtin.size;
+  } else if (curr->kind == RDI_TypeKind_Ptr || curr->kind == RDI_TypeKind_LRef || curr->kind == RDI_TypeKind_RRef) {
+    size = curr->ptr.size;
+  } else if (curr->kind == RDI_TypeKind_Array) {
+    // same field rdib_size_from_type reports for arrays
+    size = curr->array.size;
+  } else if (curr->kind == RDI_TypeKind_Struct || curr->kind == RDI_TypeKind_Class ||
+             curr->kind == RDI_TypeKind_IncompleteStruct || curr->kind == RDI_TypeKind_IncompleteClass) {
+    size = curr->udt.struct_type.size;
+  } else if (curr->kind == RDI_TypeKind_Union || curr->kind == RDI_TypeKind_IncompleteUnion) {
+    size = curr->udt.union_type.size;
+  } else if (curr->kind == RDI_TypeKind_Enum || curr->kind == RDI_TypeKind_IncompleteEnum) {
+    size = rdib_sizeof_type(curr->udt.enum_type.base_type);
+  } else if (curr->kind == RDI_TypeKind_Bitfield) {
+    size = rdib_sizeof_type(curr->bitfield.value_type);
+  } else {
+    Assert(!"error: type doens't have a size");
+  }
+
+  return size;
+}
+
+// Counts the members reachable from `type`'s member list, recursing through
+// MemberListPointer entries instead of counting them.
+internal U64
+rdib_count_members_deep(RDIB_Type *type)
+{
+  U64 member_count = 0;
+  for (RDIB_UDTMember *member = type->members.list.first; member != 0; member = member->next) {
+    if (member->kind == RDI_MemberKindExt_MemberListPointer) {
+      member_count += rdib_count_members_deep(member->member_list_pointer);
+    } else {
+      member_count += 1;
+    }
+  }
+  return member_count;
+}
+
+// Per-chunk task: adds the number of Class/Struct/Union/Enum types found in
+// chunk `task_id` to type_stats->udt_counts[task_id].
+internal
+THREAD_POOL_TASK_FUNC(rdib_type_stats_task)
+{
+  ProfBeginFunction();
+
+  RDIB_TypeStatsTask *task  = raw_task;
+  RDIB_TypeChunk     *chunk = task->chunks[task_id];
+
+  for (U64 itype = 0; itype < chunk->count; ++itype) {
+    RDIB_Type *type = chunk->v + itype;
+    B32 is_udt = (type->kind == RDI_TypeKind_Class  ||
+                  type->kind == RDI_TypeKind_Struct ||
+                  type->kind == RDI_TypeKind_Union  ||
+                  type->kind == RDI_TypeKind_Enum);
+    if (is_udt) {
+      task->type_stats->udt_counts[task_id] += 1;
+    }
+  }
+
+  ProfEnd();
+}
+
+// Per-range task: for every type in the chunks of ranges[task_id], follows the
+// chain of trailing MemberListPointer entries and concatenates all pieces into
+// one member list stored on the head type. Every continuation type that was
+// followed gets its kind reset to RDI_TypeKind_NULL.
+internal
+THREAD_POOL_TASK_FUNC(rdib_concat_members_task)
+{
+  ProfBeginFunction();
+  RDIB_MembersTask *task = raw_task;
+
+  for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) {
+    RDIB_TypeChunk *chunk = task->type_chunks[chunk_idx];
+    for (U64 i = 0; i < chunk->count; ++i) {
+      RDIB_Type          *type = &chunk->v[i];
+      RDIB_UDTMemberList  acc  = {0};
+
+      RDIB_Type *curr = type;
+      for (;;) {
+        // concat members
+        rdib_udt_member_list_concat_in_place(&acc, &curr->members.list);
+
+        // does this type continue member list?
+        if (acc.count == 0 || acc.last->kind != RDI_MemberKindExt_MemberListPointer) {
+          break;
+        }
+
+        // remove member list pointer
+        // NOTE(review): `continuation` is acc.last, but SLLQueuePop is
+        // presumably a pop-from-front macro - confirm the continuation entry
+        // (and not the first member) is what gets removed here.
+        RDIB_UDTMember *continuation = acc.last;
+        SLLQueuePop(acc.first, acc.last);
+        --acc.count;
+
+        // advance to next type
+        curr = continuation->member_list_pointer;
+
+        // other types should not reference any part of member list except for head type.
+        Assert(curr->kind == RDI_TypeKindExt_Members);
+        curr->kind = RDI_TypeKind_NULL;
+      }
+
+      // update member list
+      type->members.list = acc;
+    }
+  }
+
+  ProfEnd();
+}
+
+// Per-range task: sums the member-list lengths of all types that still have
+// kind Members into counts[task_id].
+internal
+THREAD_POOL_TASK_FUNC(rdib_count_head_members_task)
+{
+  ProfBeginFunction();
+  RDIB_MembersTask *task = raw_task;
+  for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) {
+    RDIB_TypeChunk *chunk = task->type_chunks[chunk_idx];
+    for (U64 i = 0; i < chunk->count; ++i) {
+      RDIB_Type *type = &chunk->v[i];
+      if (type->kind == RDI_TypeKindExt_Members) {
+        task->counts[task_id] += type->members.list.count;
+      }
+    }
+  }
+  ProfEnd();
+}
+
+// Per-range task: starting at offsets[task_id], assigns each Members type the
+// index of its first member and advances by the list length.
+internal
+THREAD_POOL_TASK_FUNC(rdib_assign_head_member_indices_task)
+{
+  ProfBeginFunction();
+  RDIB_MembersTask *task = raw_task;
+  U64 cursor = task->offsets[task_id];
+  for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) {
+    RDIB_TypeChunk *chunk = task->type_chunks[chunk_idx];
+    for (U64 i = 0; i < chunk->count; ++i) {
+      RDIB_Type *type = &chunk->v[i];
+      if (type->kind == RDI_TypeKindExt_Members) {
+        type->members.first_member_idx = cursor;
+        cursor += type->members.list.count;
+      }
+    }
+  }
+  ProfEnd();
+}
+
+internal +THREAD_POOL_TASK_FUNC(rdib_fill_udt_members_task) +{ + ProfBeginFunction(); + RDIB_MembersTask *task = raw_task; + RDIB_TypeChunk *chunk = task->type_chunks[task_id]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Type *type = chunk->v + i; + Assert(type->kind == RDI_TypeKindExt_Members); + + U64 member_idx = 0; + for (RDIB_UDTMember *src = type->members.list.first; src != 
0; src = src->next, ++member_idx) { + U64 idx = type->members.first_member_idx + member_idx; + RDI_Member *dst = &task->udt_members_rdi[idx]; + + switch (src->kind) { + case RDI_MemberKind_NULL: { + MemoryZeroStruct(dst); + } break; + case RDI_MemberKind_DataField: { + dst->kind = RDI_MemberKind_DataField; + dst->name_string_idx = rdib_idx_from_string_map(task->string_map, src->data_field.name); + dst->type_idx = rdib_idx_from_type(src->data_field.type_ref); + dst->off = src->data_field.offset; + } break; + case RDI_MemberKind_StaticData: { + dst->kind = RDI_MemberKind_StaticData; + dst->name_string_idx = rdib_idx_from_string_map(task->string_map, src->static_data.name); + dst->type_idx = rdib_idx_from_type(src->static_data.type_ref); + } break; + case RDI_MemberKind_Method: + case RDI_MemberKind_StaticMethod: + case RDI_MemberKind_VirtualMethod: { + dst->kind = src->kind; + dst->name_string_idx = rdib_idx_from_string_map(task->string_map, src->method.name); + dst->type_idx = rdib_idx_from_type(src->method.type_ref); + dst->off = src->method.vftable_offset; + } break; + case RDI_MemberKind_Base: { + dst->kind = RDI_MemberKind_Base; + dst->name_string_idx = 0; + dst->type_idx = rdib_idx_from_type(src->base_class.type_ref); + dst->off = src->base_class.offset; + } break; + case RDI_MemberKind_VirtualBase: { + dst->kind = RDI_MemberKind_VirtualBase; + dst->name_string_idx = 0; + dst->type_idx = rdib_idx_from_type(src->virtual_base_class.type_ref); + dst->off = 0; // TODO: ??? 
+ } break; + case RDI_MemberKind_NestedType: { + dst->kind = RDI_MemberKind_NestedType; + dst->name_string_idx = rdib_idx_from_string_map(task->string_map, src->nested_type.name); + dst->type_idx = rdib_idx_from_type(src->nested_type.type_ref); + dst->off = 0; + } break; + case RDI_MemberKindExt_MemberListPointer: { + InvalidPath; + } break; + } + } + } + ProfEnd(); +}
+
+// Per-chunk task: writes the enumerators of every Members type in chunk
+// `task_id` into enum_members_rdi, starting at the type's first_member_idx.
+// Types of any other kind are skipped.
+internal
+THREAD_POOL_TASK_FUNC(rdib_fill_enum_members_task)
+{
+  ProfBeginFunction();
+  RDIB_MembersTask *task  = raw_task;
+  RDIB_TypeChunk   *chunk = task->type_chunks[task_id];
+
+  for (U64 type_idx = 0; type_idx < chunk->count; ++type_idx) {
+    RDIB_Type *type = &chunk->v[type_idx];
+    if (type->kind == RDI_TypeKindExt_Members) {
+      U64 out_idx = type->members.first_member_idx;
+      for (RDIB_UDTMember *src = type->members.list.first; src != 0; src = src->next) {
+        RDI_EnumMember *dst = &task->enum_members_rdi[out_idx];
+        dst->name_string_idx = rdib_idx_from_string_map(task->string_map, src->enumerate.name);
+        dst->val             = src->enumerate.value;
+        ++out_idx;
+      }
+    }
+  }
+
+  ProfEnd();
+}
+
+// Per-chunk task: emits one RDI_UDT for every complete user-defined type in
+// chunk `task_id`, starting at udt_base_idx[task_id], and records the assigned
+// index on the source type. The member range is copied from the type's Members
+// type; file/line/col are written as zero.
+internal
+THREAD_POOL_TASK_FUNC(rdib_fill_udts_task)
+{
+  ProfBeginFunction();
+  RDIB_UserDefinesTask *task = raw_task;
+
+  U64             ichunk     = task_id;
+  RDIB_TypeChunk *chunk      = task->type_chunks[ichunk];
+  U64             udt_cursor = task->udt_base_idx[ichunk];
+  U64             udt_cap    = task->type_stats.udt_counts[ichunk];
+
+  for (U64 i = 0; i < chunk->count; ++i) {
+    RDIB_Type *type = &chunk->v[i];
+    if (!RDI_IsCompleteUserDefinedTypeKind(type->kind)) {
+      continue;
+    }
+
+    RDIB_Type *members_type = type->udt.members;
+
+    // assign UDT idx
+    type->udt.udt_idx = udt_cursor;
+
+    // claim the next output slot; it must stay inside this chunk's share
+    Assert(udt_cursor < task->udt_base_idx[ichunk] + udt_cap);
+    RDI_UDT *udt = &task->udts[udt_cursor];
+    udt_cursor += 1;
+
+    udt->self_type_idx = rdib_idx_from_type(type);
+    if (type->kind == RDI_TypeKind_Enum) {
+      udt->flags = RDI_UDTFlag_EnumMembers;
+    } else {
+      udt->flags = 0;
+    }
+
+    // an empty member list is written as the (0, 0) range
+    B32 has_members = (members_type->members.list.count > 0);
+    udt->member_first = has_members ? members_type->members.first_member_idx : 0;
+    udt->member_count = has_members ? members_type->members.list.count : 0;
+
+    udt->file_idx = 0;
+    udt->line     = 0;
+    udt->col      = 0;
+  }
+
+  ProfEnd();
+}
+
+internal +THREAD_POOL_TASK_FUNC(rdib_type_nodes_task) +{ + Temp scratch = scratch_begin(0, 0); + + U64 ichunk = task_id; + RDIB_TypeNodesTask *task = raw_task; + RDIB_TypeChunk *chunk = task->type_chunks[ichunk]; + + for (U64 itype = 0; itype < chunk->count; ++itype) { + RDIB_Type *src = &chunk->v[itype]; + U64 dst_idx = rdib_idx_from_type(src); + RDI_TypeNode *dst = &task->type_nodes[dst_idx]; + + if (src->kind == RDI_TypeKind_NULL) { + MemoryZeroStruct(dst); + dst->kind = RDI_TypeKind_NULL; + } else if (RDI_TypeKind_FirstBuiltIn <= src->kind && src->kind <= RDI_TypeKind_LastBuiltIn) { + dst->kind = src->kind; + dst->flags = 0; + dst->byte_size = src->builtin.size; + dst->built_in.name_string_idx = rdib_idx_from_string_map(task->string_map, src->builtin.name); + } else if (src->kind == RDI_TypeKind_Modifier) { + dst->kind = RDI_TypeKind_Modifier; + dst->byte_size = rdib_sizeof_type(src->modifier.type_ref); + dst->flags = src->modifier.flags; + dst->constructed.direct_type_idx = rdib_idx_from_type(src->modifier.type_ref); + } else if (src->kind == RDI_TypeKind_Ptr || src->kind == RDI_TypeKind_LRef || src->kind == RDI_TypeKind_RRef) { + dst->kind = src->kind; + dst->byte_size = src->ptr.size; + dst->flags = 0; + dst->constructed.direct_type_idx = rdib_idx_from_type(src->ptr.type_ref); + } else if (src->kind == RDI_TypeKind_Method) { + RDIB_Type *params_type = src->method.params_type; + Assert(params_type->kind == RDI_TypeKindExt_Params); + RDIB_IndexRunBucket *param_idx_run = task->idx_run_map->buckets[src->method.param_idx_run_bucket_idx]; + + dst->kind = RDI_TypeKind_Method; + dst->flags = 0; + dst->byte_size = 0; + 
dst->constructed.direct_type_idx = rdib_idx_from_type(src->method.return_type); + dst->constructed.count = param_idx_run->indices.count; + dst->constructed.param_idx_run_first = param_idx_run->index_in_output_array; + } else if (src->kind == RDI_TypeKindExt_StaticMethod) { + RDIB_Type *params_type = src->static_method.params_type; + Assert(params_type->kind == RDI_TypeKindExt_Params); + RDIB_IndexRunBucket *param_idx_run = task->idx_run_map->buckets[src->static_method.param_idx_run_bucket_idx]; + + dst->kind = RDI_TypeKind_Method; + dst->flags = 0; + dst->byte_size = 0; + dst->constructed.direct_type_idx = rdib_idx_from_type(src->static_method.return_type); + dst->constructed.count = param_idx_run->indices.count; + dst->constructed.param_idx_run_first = param_idx_run->index_in_output_array; + } else if (src->kind == RDI_TypeKind_Function) { + RDIB_Type *params_type = src->func.params_type; + Assert(params_type->kind == RDI_TypeKindExt_Params); + RDIB_IndexRunBucket *param_idx_run = task->idx_run_map->buckets[src->func.param_idx_run_bucket_idx]; + + dst->kind = RDI_TypeKind_Function; + dst->flags = 0; + dst->byte_size = 0; + dst->constructed.direct_type_idx = rdib_idx_from_type(src->func.return_type); + dst->constructed.count = param_idx_run->indices.count; + dst->constructed.param_idx_run_first = param_idx_run->index_in_output_array; + } else if (src->kind == RDI_TypeKind_Array) { + U64 entry_size = rdib_size_from_type(src->array.entry_type); + U64 array_size = src->array.size; + U64 array_count = entry_size > 0 ? 
array_size / entry_size : 0; + + dst->kind = src->kind; + dst->flags = 0; + dst->byte_size = array_size; + dst->constructed.direct_type_idx = rdib_idx_from_type(src->array.entry_type); + dst->constructed.count = array_count; + } else if (src->kind == RDI_TypeKind_Bitfield) { + dst->kind = RDI_TypeKind_Bitfield; + dst->flags = 0; + dst->byte_size = rdib_sizeof_type(src->bitfield.value_type); + dst->bitfield.direct_type_idx = rdib_idx_from_type(src->bitfield.value_type); + dst->bitfield.off = src->bitfield.off; + dst->bitfield.size = src->bitfield.count; + } else if (src->kind == RDI_TypeKind_Struct || src->kind == RDI_TypeKind_Class || + src->kind == RDI_TypeKind_IncompleteStruct || src->kind == RDI_TypeKind_IncompleteClass) { + dst->kind = src->kind; + dst->flags = 0; + dst->byte_size = src->udt.struct_type.size; + dst->user_defined.name_string_idx = rdib_idx_from_string_map(task->string_map, src->udt.name); + dst->user_defined.udt_idx = src->udt.udt_idx; + dst->user_defined.direct_type_idx = 0; + } else if (src->kind == RDI_TypeKind_Union || src->kind == RDI_TypeKind_IncompleteUnion) { + dst->kind = src->kind; + dst->flags = 0; + dst->byte_size = src->udt.union_type.size; + dst->user_defined.name_string_idx = rdib_idx_from_string_map(task->string_map, src->udt.name); + dst->user_defined.udt_idx = src->udt.udt_idx; + dst->user_defined.direct_type_idx = 0; + } else if (src->kind == RDI_TypeKind_Enum || src->kind == RDI_TypeKind_IncompleteEnum) { + dst->kind = RDI_TypeKind_Enum; + dst->flags = 0; + dst->byte_size = rdib_size_from_type(src->udt.enum_type.base_type); + dst->user_defined.name_string_idx = rdib_idx_from_string_map(task->string_map, src->udt.name); + dst->user_defined.udt_idx = src->udt.udt_idx; + dst->user_defined.direct_type_idx = rdib_idx_from_type(src->udt.enum_type.base_type); + } else if (src->kind == RDI_TypeKind_Alias) { + // TODO + NotImplemented; + } else if (src->kind == RDI_TypeKind_MemberPtr) { + // TODO + NotImplemented; + } else if 
(src->kind == RDI_TypeKind_Variadic) { + MemoryZeroStruct(dst); + dst->kind = RDI_TypeKind_Variadic; + } else if (src->kind == RDI_TypeKindExt_VirtualTable) { + // TODO + MemoryZeroStruct(dst); + dst->kind = RDI_TypeKind_NULL; + } else { + InvalidPath; + } + } + + scratch_end(scratch); +} + +internal void +rdib_data_sections_from_types(TP_Context *tp, + Arena *arena, + RDIB_DataSectionList *sect_list, + RDI_Arch arch, + RDIB_StringMap *string_map, + RDIB_IndexRunMap *idx_run_map, + U64 udt_member_chunk_count, + RDIB_TypeChunk **udt_member_type_chunks, + U64 enum_member_chunk_count, + RDIB_TypeChunk **enum_member_type_chunks, + U64 total_type_node_count, + U64 type_chunk_count, + RDIB_TypeChunk **type_chunks, + RDIB_TypeStats type_stats) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + ProfBegin("UDT Members"); + U64 udt_member_count_rdi; + RDI_Member *udt_members_rdi; + { + RDIB_MembersTask task = {0}; + + ProfBegin("Concat"); + task.ranges = tp_divide_work(scratch.arena, udt_member_chunk_count, tp->worker_count); + task.type_chunks = udt_member_type_chunks; + tp_for_parallel(tp, 0, tp->worker_count, rdib_concat_members_task, &task); + ProfEnd(); + + ProfBegin("Count"); + task.counts = push_array(scratch.arena, U64, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_count_head_members_task, &task); + ProfEnd(); + + ProfBegin("Assign Indices"); + task.offsets = offsets_from_counts_array_u64(scratch.arena, task.counts, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_assign_head_member_indices_task, &task); + ProfEnd(); + + udt_member_count_rdi = sum_array_u64(tp->worker_count, task.counts); + udt_members_rdi = push_array_no_zero(arena, RDI_Member, udt_member_count_rdi); + + ProfBegin("Fill"); + task.string_map = string_map; + task.udt_members_rdi = udt_members_rdi; + tp_for_parallel(tp, 0, udt_member_chunk_count, rdib_fill_udt_members_task, &task); + ProfEnd(); + } + ProfEnd(); + + ProfBegin("Enum Members"); 
+ U64 enum_member_count_rdi; + RDI_EnumMember *enum_members_rdi; + { + RDIB_MembersTask task = {0}; + + ProfBegin("Concat"); + task.ranges = tp_divide_work(scratch.arena, enum_member_chunk_count, tp->worker_count); + task.type_chunks = enum_member_type_chunks; + tp_for_parallel(tp, 0, tp->worker_count, rdib_concat_members_task, &task); + ProfEnd(); + + ProfBegin("Count"); + task.counts = push_array(scratch.arena, U64, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_count_head_members_task, &task); + ProfEnd(); + + ProfBegin("Assign Indices"); + task.offsets = offsets_from_counts_array_u64(scratch.arena, task.counts, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_assign_head_member_indices_task, &task); + ProfEnd(); + + enum_member_count_rdi = sum_array_u64(tp->worker_count, task.counts); + enum_members_rdi = push_array_no_zero(arena, RDI_EnumMember, enum_member_count_rdi); + + ProfBegin("Fill"); + task.string_map = string_map; + task.enum_members_rdi = enum_members_rdi; + tp_for_parallel(tp, 0, enum_member_chunk_count, rdib_fill_enum_members_task, &task); + ProfEnd(); + } + ProfEnd(); + + ProfBegin("Sum type stats"); + U64 total_udt_count = sum_array_u64(type_chunk_count, type_stats.udt_counts); + ProfEnd(); + + ProfBegin("Up front pushes"); + RDI_UDT *udts = push_array_no_zero(arena, RDI_UDT, total_udt_count ); + RDI_TypeNode *type_nodes = push_array_no_zero(arena, RDI_TypeNode, total_type_node_count); + ProfEnd(); + + ProfBegin("Fill out UDTs"); + RDIB_UserDefinesTask udts_task = {0}; + udts_task.type_chunks = type_chunks; + udts_task.type_stats = type_stats; + udts_task.udt_base_idx = offsets_from_counts_array_u64(scratch.arena, type_stats.udt_counts, type_chunk_count); + udts_task.udts = udts; + tp_for_parallel(tp, 0, type_chunk_count, rdib_fill_udts_task, &udts_task); + ProfEnd(); + + ProfBegin("Fill out type nodes"); + RDIB_TypeNodesTask type_nodes_task = {0}; + type_nodes_task.addr_size = rdi_addr_size_from_arch(arch); 
+ type_nodes_task.string_map = string_map; + type_nodes_task.idx_run_map = idx_run_map; + type_nodes_task.type_chunks = type_chunks; + type_nodes_task.type_stats = type_stats; + type_nodes_task.type_nodes = type_nodes; + tp_for_parallel(tp, 0, type_chunk_count, rdib_type_nodes_task, &type_nodes_task); + ProfEnd(); + + RDIB_DataSection udt_member_sect = { .tag = RDI_SectionKind_Members }; + RDIB_DataSection enum_member_sect = { .tag = RDI_SectionKind_EnumMembers }; + RDIB_DataSection udt_sect = { .tag = RDI_SectionKind_UDTs }; + RDIB_DataSection type_nodes_sect = { .tag = RDI_SectionKind_TypeNodes }; + + str8_list_push(arena, &udt_member_sect.data, str8_array(udt_members_rdi, udt_member_count_rdi )); + str8_list_push(arena, &enum_member_sect.data, str8_array(enum_members_rdi, enum_member_count_rdi)); + str8_list_push(arena, &udt_sect.data, str8_array(udts, total_udt_count )); + str8_list_push(arena, &type_nodes_sect.data, str8_array(type_nodes, total_type_node_count)); + + rdib_data_section_list_push(arena, sect_list, enum_member_sect); + rdib_data_section_list_push(arena, sect_list, udt_member_sect ); + rdib_data_section_list_push(arena, sect_list, udt_sect ); + rdib_data_section_list_push(arena, sect_list, type_nodes_sect ); + + scratch_end(scratch); + ProfEnd(); +}
+
+////////////////////////////////
+
+// Allocates a path tree on `arena` with a root node and `list_count` empty
+// node lists (filled round-robin by rdib_path_tree_insert).
+internal RDIB_PathTree *
+rdib_path_tree_init(Arena *arena, U64 list_count)
+{
+  RDIB_PathTree *tree = push_array(arena, RDIB_PathTree, 1);
+  tree->root       = push_array(arena, RDIB_PathTreeNode, 1);
+  tree->list_count = list_count;
+  tree->node_lists = push_array(arena, RDIB_PathTreeNodeList, list_count);
+  return tree;
+}
+
+// Inserts `path` into `tree`, creating one node per missing path component
+// (components are compared case-sensitively after "." / ".." resolution).
+// `src_file` is attached only when the final component is newly created; an
+// already existing node keeps whatever it had.
+// NOTE(review): new nodes store n->string as-is - presumably a view into
+// `path` or scratch memory; confirm it outlives the tree.
+internal void
+rdib_path_tree_insert(Arena *arena, RDIB_PathTree *tree, String8 path, RDIB_SourceFile *src_file)
+{
+  Temp scratch = scratch_begin(&arena, 1);
+
+  RDIB_PathTreeNode *curr_sub_path = tree->root;
+  String8List        sub_paths     = str8_split_path(scratch.arena, path);
+  str8_path_list_resolve_dots_in_place(&sub_paths, path_style_from_str8(path));
+
+  for (String8Node *n = sub_paths.first; n != 0; n = n->next) {
+    // is there directory or file defined on this level?
+    RDIB_PathTreeNode *sub_child = curr_sub_path->first_child;
+    while (sub_child != 0 && !str8_match(sub_child->sub_path, n->string, 0)) {
+      sub_child = sub_child->next_sibling;
+    }
+
+    // new directory/file
+    if (sub_child == 0) {
+      sub_child = push_array(arena, RDIB_PathTreeNode, 1);
+      sub_child->node_idx = tree->node_count;
+      sub_child->parent   = curr_sub_path;
+      sub_child->sub_path = n->string;
+      sub_child->src_file = 0;
+      SLLQueuePush_N(curr_sub_path->first_child, curr_sub_path->last_child, sub_child, next_sibling);
+      ++tree->node_count;
+
+      // last node, insert file
+      if (n->next == 0) {
+        sub_child->src_file = src_file;
+      }
+
+      // HACK: setup node list per thread for serialization step
+      U64 list_idx = tree->next_list_idx % tree->list_count;
+      SLLQueuePush_N(tree->node_lists[list_idx].first, tree->node_lists[list_idx].last, sub_child, next_order);
+      ++tree->next_list_idx;
+    }
+
+    // descend to sub node
+    curr_sub_path = sub_child;
+  }
+
+  scratch_end(scratch);
+}
+
+// Walks `tree` along the components of `path` (split on '/' and '\\', compared
+// case-insensitively) and returns the node index where the walk stops.
+// NOTE(review): when a component is not found the walk stops early and the
+// index of the deepest matching ancestor (the root for no match at all) is
+// returned rather than a "not found" value - confirm callers expect this.
+// NOTE(review): rdib_path_tree_insert splits with str8_split_path, resolves
+// dots and matches case-sensitively; the two may disagree on some paths.
+internal U32
+rdib_idx_from_path_tree(RDIB_PathTree *tree, String8 path)
+{
+  Temp scratch = scratch_begin(0,0);
+
+  RDIB_PathTreeNode *curr_sub_path = tree->root;
+  String8List        sub_paths     = str8_split_by_string_chars(scratch.arena, path, str8_lit("/\\"), 0);
+  for (String8Node *n = sub_paths.first; n != 0; n = n->next) {
+    // scan children sub-path match
+    RDIB_PathTreeNode *sub_child = curr_sub_path->first_child;
+    while (sub_child != 0 && !str8_match(sub_child->sub_path, n->string, StringMatchFlag_CaseInsensitive)) {
+      sub_child = sub_child->next_sibling;
+    }
+
+    // found match?
+    if (sub_child == 0) {
+      break;
+    }
+
+    // descend
+    curr_sub_path = sub_child;
+  }
+
+  U64 idx = max_U64;
+  if (curr_sub_path != 0) {
+    idx = curr_sub_path->node_idx;
+  }
+
+  scratch_end(scratch);
+  return safe_cast_u32(idx);
+}
+
+
+////////////////////////////////
+
+// Hash used for string map bucket placement (64-bit XXH3 over the bytes).
+internal U64
+rdib_string_map_hash(String8 string)
+{
+  XXH64_hash_t hash64 = XXH3_64bits(string.str, string.size);
+  return hash64;
+}
+
+// Allocates a string map on `arena` with room for 1.3x `cap` buckets.
+// The capacity is clamped to at least 1 so that `hash % cap` in the lookup and
+// insert paths never divides by zero when `cap` is 0.
+internal RDIB_StringMap *
+rdib_init_string_map(Arena *arena, U64 cap)
+{
+  RDIB_StringMap *string_map = push_array(arena, RDIB_StringMap, 1);
+  string_map->cap = (U64)((F64)cap * 1.3);
+  if (string_map->cap == 0) {
+    string_map->cap = 1;
+  }
+  string_map->buckets = push_array(arena, RDIB_StringMapBucket *, string_map->cap);
+  return string_map;
+}
+
+// Looks `string` up by linear probing from its hash slot and returns the index
+// stored in the matching bucket. Hitting an empty slot, or probing the whole
+// table without a match, asserts and returns max_U32.
+internal U32
+rdib_idx_from_string_map(RDIB_StringMap *string_map, String8 string)
+{
+  U64 hash     = rdib_string_map_hash(string);
+  U64 best_idx = hash % string_map->cap;
+  U64 idx      = best_idx;
+
+  do {
+    RDIB_StringMapBucket *bucket = string_map->buckets[idx];
+
+    // an empty slot ends the probe sequence: the string was never inserted
+    if (bucket == 0) {
+      break;
+    }
+
+    if (str8_match(bucket->string, string, 0)) {
+      return safe_cast_u32(bucket->idx);
+    }
+
+    idx = (idx + 1) % string_map->cap;
+  } while (idx != best_idx);
+
+  Assert(!"incomplete string map");
+  return max_U32;
+}
+
+internal RDIB_StringMapBucket * +rdib_string_map_insert_or_update(RDIB_StringMapBucket **buckets, U64 cap, U64 hash, RDIB_StringMapBucket *new_bucket, RDIB_StringMapUpdateFunc *update_func) +{ + RDIB_StringMapBucket *result = 0; + B32 was_bucket_inserted_or_updated = 0; + + U64 best_idx = hash % cap; + U64 idx = best_idx; + + do { + retry:; + RDIB_StringMapBucket *curr_bucket = buckets[idx]; + + if (curr_bucket == 0) { + RDIB_StringMapBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was inserted + was_bucket_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... 
+ goto retry; + } else if (str8_match(curr_bucket->string, new_bucket->string, 0)) { + if (curr_bucket->sorter.v <= new_bucket->sorter.v) { + if (new_bucket->raw_values != 0) { + void_node_concat_atomic(&curr_bucket->raw_values, new_bucket->raw_values); + new_bucket->raw_values = 0; + } + + // recycle bucket + result = new_bucket; + + // don't need to update, more recent leaf is in the bucket + was_bucket_inserted_or_updated = 1; + + break; + } + + if (new_bucket->raw_values) { + new_bucket->raw_values->next = buckets[idx]->raw_values; + } + + RDIB_StringMapBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + + // recycle bucket + result = compare_bucket; + + // new bucket is in the hash table, exit + was_bucket_inserted_or_updated = 1; + break; + } + + if (new_bucket->raw_values) { + new_bucket->raw_values->next = 0; + } + + // another thread took the bucket... + goto retry; + } + + // advance + idx = (idx + 1) % cap; + } while (idx != best_idx); + + // are there enough free buckets? + Assert(was_bucket_inserted_or_updated); + + return result; +}
+
+// Inserts (`string`, `value`) into the task's string map on behalf of task
+// `task_id`. The bucket is taken from the task's free slot (allocated on
+// `arena` when the slot is empty) and tagged with sorter = (task_id, current
+// element index). Whatever rdib_string_map_insert_or_update hands back becomes
+// the new free slot; the element index advances only when the returned bucket
+// differs from the one passed in.
+internal void
+rdib_string_map_insert_item(Arena *arena, RDIB_CollectStringsTask *task, U64 task_id, String8 string, void *value)
+{
+  // reuse the spare bucket if there is one, otherwise make a new one
+  RDIB_StringMapBucket *candidate = task->free_buckets[task_id];
+  if (candidate == 0) {
+    candidate = push_array(arena, RDIB_StringMapBucket, 1);
+    task->free_buckets[task_id] = candidate;
+  }
+
+  // fill out bucket
+  candidate->string     = string;
+  candidate->raw_values = value;
+  candidate->sorter.hi  = safe_cast_u32(task_id);
+  candidate->sorter.lo  = safe_cast_u32(task->element_indices[task_id]);
+
+  // insert bucket into string map
+  RDIB_StringMap       *map      = task->string_map;
+  U64                   hash     = rdib_string_map_hash(string);
+  RDIB_StringMapBucket *leftover = rdib_string_map_insert_or_update(map->buckets, map->cap, hash, candidate, task->string_map_update_func);
+
+  // advance element index
+  if (leftover != candidate) {
+    task->element_indices[task_id] += 1;
+  }
+
+  // recycle bucket
+  task->free_buckets[task_id] = leftover;
+}
+
+// Per-range task: counts the non-empty buckets inside ranges[task_id] and adds
+// the total to counts[task_id].
+internal
+THREAD_POOL_TASK_FUNC(rdib_count_extant_buckets_string_map_task)
+{
+  ProfBeginFunction();
+  RDIB_GetExtantBucketsStringMapTask *task = raw_task;
+  U64 first = task->ranges[task_id].min;
+  U64 opl   = task->ranges[task_id].max;
+  for (U64 bucket_idx = first; bucket_idx < opl; ++bucket_idx) {
+    if (task->string_map->buckets[bucket_idx] != 0) {
+      task->counts[task_id] += 1;
+    }
+  }
+  ProfEnd();
+}
+
+// Per-range task: copies the non-empty buckets inside ranges[task_id] into
+// `result`, packed from offsets[task_id] onward.
+internal
+THREAD_POOL_TASK_FUNC(rdib_get_extant_buckets_string_map_task)
+{
+  ProfBeginFunction();
+  RDIB_GetExtantBucketsStringMapTask *task = raw_task;
+  U64 cursor = task->offsets[task_id];
+  U64 first  = task->ranges[task_id].min;
+  U64 opl    = task->ranges[task_id].max;
+  for (U64 bucket_idx = first; bucket_idx < opl; ++bucket_idx) {
+    RDIB_StringMapBucket *bucket = task->string_map->buckets[bucket_idx];
+    if (bucket != 0) {
+      task->result[cursor] = bucket;
+      cursor += 1;
+    }
+  }
+  ProfEnd();
+}
+
+// Returns an array (allocated on `arena`) holding every non-empty bucket of
+// `string_map` and writes its length to `bucket_count_out`. Runs two parallel
+// passes over the table: count per worker, then copy at per-worker offsets.
+internal RDIB_StringMapBucket **
+rdib_extant_buckets_from_string_map(TP_Context *tp, Arena *arena, RDIB_StringMap *string_map, U64 *bucket_count_out)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena, 1);
+
+  RDIB_GetExtantBucketsStringMapTask task = {0};
+  task.string_map = string_map;
+
+  ProfBegin("Count Extant Buckets");
+  task.counts = push_array(scratch.arena, U64, tp->worker_count);
+  task.ranges = tp_divide_work(scratch.arena, string_map->cap, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, rdib_count_extant_buckets_string_map_task, &task);
+  ProfEnd();
+
+  U64 extant_count = sum_array_u64(tp->worker_count, task.counts);
+  *bucket_count_out = extant_count;
+
+  ProfBegin("Copy Extant Buckets");
+  task.offsets = offsets_from_counts_array_u64(scratch.arena, task.counts, tp->worker_count);
+  task.result  = push_array(arena, RDIB_StringMapBucket *, extant_count);
+  tp_for_parallel(tp, 0, tp->worker_count, rdib_get_extant_buckets_string_map_task, &task);
+  ProfEnd();
+
+  scratch_end(scratch);
+  ProfEnd();
+  return task.result;
+}
+
+// Per-range task: builds a local histogram of sorter.hi (the chunk index) over
+// the buckets in ranges[task_id], then atomically adds it into the shared
+// chunk_histo.
+internal
+THREAD_POOL_TASK_FUNC(rdib_string_map_bucket_chunk_idx_histo_task)
+{
+  ProfBeginFunction();
+  RDIB_StringMapRadixSort *task = raw_task;
+  Temp scratch = scratch_begin(0,0);
+
+  U32 *range_histo = push_array(scratch.arena, U32, task->chunk_idx_opl);
+
+  // count items per sorter
+  U64 first = task->ranges[task_id].min;
+  U64 opl   = task->ranges[task_id].max;
+  for (U64 bucket_idx = first; bucket_idx < opl; ++bucket_idx) {
+    U64 chunk_idx = task->src[bucket_idx]->sorter.hi;
+    Assert(chunk_idx < task->chunk_idx_opl);
+    range_histo[chunk_idx] += 1;
+  }
+
+  // add in per thread sorter counts
+  for (U64 chunk_idx = 0; chunk_idx < task->chunk_idx_opl; ++chunk_idx) {
+    ins_atomic_u32_add_eval(&task->chunk_histo[chunk_idx], range_histo[chunk_idx]);
+  }
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+// Per-range task: scatters the buckets in ranges[task_id] from `src` into
+// `dst`, grouped by sorter.hi; each destination slot is claimed by atomically
+// incrementing that chunk's running offset.
+internal
+THREAD_POOL_TASK_FUNC(rdib_string_map_radix_sort_chunk_idx_task)
+{
+  ProfBeginFunction();
+  RDIB_StringMapRadixSort *task = raw_task;
+  U64 first = task->ranges[task_id].min;
+  U64 opl   = task->ranges[task_id].max;
+  for (U64 bucket_idx = first; bucket_idx < opl; ++bucket_idx) {
+    RDIB_StringMapBucket *bucket    = task->src[bucket_idx];
+    U32                   chunk_idx = bucket->sorter.hi;
+    // the increment returns the post-increment value, hence the -1
+    U32                   dst_idx   = ins_atomic_u32_inc_eval(&task->chunk_offsets[chunk_idx]) - 1;
+    task->dst[dst_idx] = bucket;
+  }
+  ProfEnd();
+}
+
+internal +THREAD_POOL_TASK_FUNC(rdib_string_map_radix_sort_element_idx_task) +{ + 
ProfBeginFunction(); + RDIB_StringMapRadixSort *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + U64 range_lo = task->chunk_offsets[chunk_idx]; + U64 range_hi = task->chunk_offsets[chunk_idx] + task->chunk_histo[chunk_idx]; + + ProfBegin("Zero out Histogram"); + U32 histo_bot[1 << 10]; MemoryZeroArray(histo_bot); + U32 histo_mid[1 << 11]; MemoryZeroArray(histo_mid); + U32 histo_top[1 << 11]; MemoryZeroArray(histo_top); + ProfEnd(); + + ProfBegin("Element Histogram"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_StringMapBucket *elem = task->dst[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit_bot = (elem_idx >> 0) % ArrayCount(histo_bot); + U32 digit_mid = (elem_idx >> 10) % ArrayCount(histo_mid); + U32 digit_top = (elem_idx >> 21) % ArrayCount(histo_top); + histo_bot[digit_bot] += 1; + histo_mid[digit_mid] += 1; + histo_top[digit_top] += 1; + } + ProfEnd(); + + ProfBegin("Histogram Counts -> Offsets"); + counts_to_offsets_array_u32(ArrayCount(histo_bot), &histo_bot[0]); + counts_to_offsets_array_u32(ArrayCount(histo_mid), &histo_mid[0]); + counts_to_offsets_array_u32(ArrayCount(histo_top), &histo_top[0]); + ProfEnd(); + + ProfBegin("Sort Bot"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_StringMapBucket *elem = task->dst[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit = (elem_idx >> 0) % ArrayCount(histo_bot); + U32 src_idx = range_lo + histo_bot[digit]++; + task->src[src_idx] = elem; + } + ProfEnd(); + + ProfBegin("Sort Mid"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_StringMapBucket *elem = task->src[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit = (elem_idx >> 10) % ArrayCount(histo_mid); + U32 dst_idx = range_lo + histo_mid[digit]++; + task->dst[dst_idx] = elem; + } + ProfEnd(); + + ProfBegin("Sort Top"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_StringMapBucket *elem = task->dst[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit = (elem_idx 
>> 21) % ArrayCount(histo_top); + U32 src_idx = range_lo + histo_top[digit]++; + task->src[src_idx] = elem; + } + ProfEnd(); + } + + ProfEnd(); +} + +internal void +rdib_string_map_sort_buckets(TP_Context *tp, RDIB_StringMapBucket **buckets, U64 bucket_count, U64 chunk_idx_opl) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + RDIB_StringMapRadixSort task = {0}; + task.chunk_idx_opl = chunk_idx_opl; + task.ranges = tp_divide_work(scratch.arena, bucket_count, tp->worker_count); + task.src = buckets; + task.dst = push_array_no_zero(scratch.arena, RDIB_StringMapBucket *, bucket_count); + + ProfBegin("Chunk Index Histogram"); + task.chunk_histo = push_array(scratch.arena, U32, chunk_idx_opl); + tp_for_parallel(tp, 0, tp->worker_count, rdib_string_map_bucket_chunk_idx_histo_task, &task); + ProfEnd(); + + // sort correctness check on chunk index +#if 0 + for (U64 i = 1; i < bucket_count; ++i) { + RDIB_StringMapBucket *prev = buckets[i - 1]; + RDIB_StringMapBucket *curr = buckets[i + 0]; + U32 prev_chunk_idx = prev->sorter.hi; + U32 curr_chunk_idx = curr->sorter.hi; + AssertAlways(prev_chunk_idx <= curr_chunk_idx); + } +#endif + + ProfBegin("Chunk Histo -> Offsets"); + task.chunk_offsets = offsets_from_counts_array_u32(scratch.arena, task.chunk_histo, chunk_idx_opl); + ProfEnd(); + + ProfBegin("Sort on chunk index"); + tp_for_parallel(tp, 0, tp->worker_count, rdib_string_map_radix_sort_chunk_idx_task, &task); + ProfEnd(); + + ProfBegin("Sort on element index"); + task.chunk_offsets = offsets_from_counts_array_u32(scratch.arena, task.chunk_histo, chunk_idx_opl); + task.ranges = tp_divide_work(scratch.arena, chunk_idx_opl, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_string_map_radix_sort_element_idx_task, &task); + ProfEnd(); + + // sort correctness check on element index +#if 1 + { + for (U64 i = 1; i < bucket_count; ++i) { + RDIB_StringMapBucket *prev = buckets[i - 1]; + RDIB_StringMapBucket *curr = buckets[i + 0]; + U32 
prev_chunk_idx = prev->sorter.hi; + U32 curr_chunk_idx = curr->sorter.hi; + if (prev_chunk_idx == curr_chunk_idx) { + U32 prev_elem_idx = prev->sorter.lo; + U32 curr_elem_idx = curr->sorter.lo; + AssertAlways(prev_elem_idx < curr_elem_idx); + } + } + } +#endif + + scratch_end(scratch); + ProfEnd(); +} + +internal void +rdib_string_map_assign_indices(RDIB_StringMapBucket **buckets, U64 bucket_count) +{ + ProfBeginFunction(); + for (U64 idx = 0; idx < bucket_count; ++idx) { + buckets[idx]->idx = idx; + } + ProfEnd(); +} + +// Specialized Inserts + +internal void +rdib_string_map_insert_string_table_item(Arena *arena, RDIB_CollectStringsTask *task, U64 task_id, String8 string) +{ + rdib_string_map_insert_item(arena, task, task_id, string, 0); +} + +internal void +rdib_string_map_insert_name_map_item(Arena *arena, RDIB_CollectStringsTask *task, U64 task_id, String8 string, VoidNode *node) +{ + rdib_string_map_insert_item(arena, task, task_id, string, node); +} + +RDIB_STRING_MAP_UPDATE_FUNC(rdib_string_map_update_null) +{ + // null update +} + +RDIB_STRING_MAP_UPDATE_FUNC(rdib_string_map_update_concat_void_list_atomic) +{ + node->next = ins_atomic_ptr_eval_assign(head, node); +} + +//////////////////////////////// +// String Table Tasks + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_sects_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 sect_idx = task->ranges[task_id].min; sect_idx < task->ranges[task_id].max; ++sect_idx) { + RDIB_BinarySection *sect = &task->sects[sect_idx]; + rdib_string_map_insert_string_table_item(arena, task, task_id, sect->name); + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_units_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_UnitChunk *chunk = task->units[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Unit *unit = 
&chunk->v[i]; + rdib_string_map_insert_string_table_item(arena, task, task_id, unit->unit_name); + rdib_string_map_insert_string_table_item(arena, task, task_id, unit->compiler_name); + rdib_string_map_insert_string_table_item(arena, task, task_id, unit->source_file); + rdib_string_map_insert_string_table_item(arena, task, task_id, unit->object_file); + rdib_string_map_insert_string_table_item(arena, task, task_id, unit->archive_file); + rdib_string_map_insert_string_table_item(arena, task, task_id, unit->build_path); + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_source_files_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_SourceFileChunk *chunk = task->src_file_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_SourceFile *src_file = chunk->v + i; + rdib_string_map_insert_string_table_item(arena, task, task_id, src_file->normal_full_path); + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_vars_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_VariableChunk *chunk = task->vars[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Variable *var = &chunk->v[i]; + rdib_string_map_insert_string_table_item(arena, task, task_id, var->name); + rdib_string_map_insert_string_table_item(arena, task, task_id, var->link_name); + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_procs_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_ProcedureChunk *chunk = task->procs[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Procedure *proc = 
&chunk->v[i]; + rdib_string_map_insert_string_table_item(arena, task, task_id, proc->name); + rdib_string_map_insert_string_table_item(arena, task, task_id, proc->link_name); + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_inline_sites_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_InlineSiteChunk *chunk = task->inline_sites[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_InlineSite *inline_site = &chunk->v[i]; + rdib_string_map_insert_string_table_item(arena, task, task_id, inline_site->name); + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_udt_members_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_UDTMemberChunk *chunk = task->udt_members[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_UDTMember *udt_member = &chunk->v[i]; + switch (udt_member->kind) { + case RDI_MemberKind_NULL : break; + case RDI_MemberKind_DataField : rdib_string_map_insert_string_table_item(arena, task, task_id, udt_member->data_field.name ); break; + case RDI_MemberKind_StaticData : rdib_string_map_insert_string_table_item(arena, task, task_id, udt_member->static_data.name); break; + case RDI_MemberKind_Method : rdib_string_map_insert_string_table_item(arena, task, task_id, udt_member->method.name ); break; + case RDI_MemberKind_NestedType : rdib_string_map_insert_string_table_item(arena, task, task_id, udt_member->nested_type.name); break; + case RDI_MemberKind_Base : break; + case RDI_MemberKind_VirtualBase : break; + //case RDI_MemberKind_Enumerate : rdib_string_map_insert_string_table_item(arena, task, task_id, udt_member->enumerate.name); break; + case RDI_MemberKindExt_MemberListPointer: break; + default: InvalidPath; + 
} + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_enum_members_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_UDTMemberChunk *chunk = task->enum_members[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + rdib_string_map_insert_string_table_item(arena, task, task_id, chunk->v[i].enumerate.name); + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_types_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_TypeChunk *chunk = task->types[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Type *type = &chunk->v[i]; + if (RDI_TypeKind_FirstBuiltIn <= type->kind && type->kind <= RDI_TypeKind_LastBuiltIn) { + rdib_string_map_insert_string_table_item(arena, task, task_id, type->builtin.name); + } else if (RDI_IsUserDefinedType(type->kind)) { + rdib_string_map_insert_string_table_item(arena, task, task_id, type->udt.name); + } + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_collect_strings_path_nodes_task) +{ + ProfBeginFunction(); + RDIB_CollectStringsTask *task = raw_task; + for (RDIB_PathTreeNode *n = task->path_node_lists[task_id].first; n != 0; n = n->next_order) { + rdib_string_map_insert_string_table_item(arena, task, task_id, n->sub_path); + } + ProfEnd(); +} + +//////////////////////////////// +// Name Map Tasks + +internal +THREAD_POOL_TASK_FUNC(rdib_name_map_var_task) +{ + RDIB_CollectStringsTask *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_VariableChunk *chunk = task->vars[chunk_idx]; + VoidNode *nodes = push_array(arena, VoidNode, chunk->count); + for (U64 var_idx = 0; var_idx < chunk->count; ++var_idx) { + 
RDIB_Variable *n = &chunk->v[var_idx]; + nodes[var_idx].v = n; + rdib_string_map_insert_name_map_item(arena, task, task_id, n->name, &nodes[var_idx]); + } + } +} + +internal +THREAD_POOL_TASK_FUNC(rdib_name_map_var_link_name_task) +{ + RDIB_CollectStringsTask *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_VariableChunk *chunk = task->vars[chunk_idx]; + VoidNode *nodes = push_array(arena, VoidNode, chunk->count); + for (U64 var_idx = 0; var_idx < chunk->count; ++var_idx) { + RDIB_Variable *n = &chunk->v[var_idx]; + nodes[var_idx].v = n; + rdib_string_map_insert_name_map_item(arena, task, task_id, n->link_name, &nodes[var_idx]); + } + } +} + +internal +THREAD_POOL_TASK_FUNC(rdib_name_map_procedure_task) +{ + RDIB_CollectStringsTask *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_ProcedureChunk *chunk = task->procs[chunk_idx]; + VoidNode *nodes = push_array(arena, VoidNode, chunk->count); + for (U64 proc_idx = 0; proc_idx < chunk->count; ++proc_idx) { + RDIB_Procedure *n = &chunk->v[proc_idx]; + nodes[proc_idx].v = n; + rdib_string_map_insert_name_map_item(arena, task, task_id, n->name, &nodes[proc_idx]); + } + } +} + +internal +THREAD_POOL_TASK_FUNC(rdib_name_map_procedures_link_name_task) +{ + RDIB_CollectStringsTask *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_ProcedureChunk *chunk = task->procs[chunk_idx]; + VoidNode *nodes = push_array(arena, VoidNode, chunk->count); + for (U64 proc_idx = 0; proc_idx < chunk->count; ++proc_idx) { + RDIB_Procedure *n = &chunk->v[proc_idx]; + nodes[proc_idx].v = n; + rdib_string_map_insert_name_map_item(arena, task, task_id, n->link_name, &nodes[proc_idx]); + } + } +} + +internal +THREAD_POOL_TASK_FUNC(rdib_name_map_types_task) +{ + RDIB_CollectStringsTask *task = raw_task; + + for 
(U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_TypeChunk *chunk = task->types[chunk_idx]; + VoidNode *nodes = push_array(arena, VoidNode, chunk->count); + VoidNode *node_cursor = nodes; + for (U64 type_idx = 0; type_idx < chunk->count; ++type_idx) { + RDIB_Type *type = &chunk->v[type_idx]; + node_cursor->v = type; + + if (RDI_IsUserDefinedType(type->kind)) { + rdib_string_map_insert_name_map_item(arena, task, task_id, type->udt.name, node_cursor); + ++node_cursor; + } + } + } +} + +internal +THREAD_POOL_TASK_FUNC(rdib_name_map_normal_paths_task) +{ + RDIB_CollectStringsTask *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_SourceFileChunk *chunk = task->src_file_chunks[chunk_idx]; + VoidNode *nodes = push_array(arena, VoidNode, chunk->count); + VoidNode *node_cursor = nodes; + for (U64 i = 0; i < chunk->count; ++i, ++node_cursor) { + node_cursor->v = &chunk->v[i]; + rdib_string_map_insert_name_map_item(arena, task, task_id, chunk->v[i].normal_full_path, node_cursor); + } + } +} + +//////////////////////////////// +// Index Run Map + +internal U64 +rdib_index_run_hash(U32 count, U32 *idxs) +{ + XXH64_hash_t hash64 = XXH3_64bits(idxs, count * sizeof(idxs[0])); + return hash64; +} + +internal RDIB_IndexRunMap * +rdib_init_index_run_map(Arena *arena, U64 cap) +{ + ProfBeginFunction(); + RDIB_IndexRunMap *map = push_array(arena, RDIB_IndexRunMap, 1); + map->cap = cap; + map->buckets = push_array(arena, RDIB_IndexRunBucket *, cap); + ProfEnd(); + return map; +} + +internal RDIB_IndexRunBucket * +rdib_index_run_map_insert_or_update(Arena *arena, RDIB_IndexRunBucket **buckets, U64 cap, U64 hash, RDIB_IndexRunBucket *new_bucket, U64 *bucket_idx_out) +{ + B32 was_bucket_inserted_or_updated = 0; + + RDIB_IndexRunBucket *result = 0; + + U64 best_idx = hash % cap; + U64 idx = best_idx; + + do { + retry:; + RDIB_IndexRunBucket *curr_bucket = 
buckets[idx]; + + if (curr_bucket == 0) { + RDIB_IndexRunBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + + if (compare_bucket == curr_bucket) { + // success, bucket was inserted + was_bucket_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } else if (u32_array_compare(curr_bucket->indices, new_bucket->indices)) { + if (curr_bucket->sorter.v <= new_bucket->sorter.v) { + // recycle bucket + result = new_bucket; + + // don't need to update, more recent leaf is in the bucket + was_bucket_inserted_or_updated = 1; + break; + } + + RDIB_IndexRunBucket *compare_bucket = ins_atomic_ptr_eval_cond_assign(&buckets[idx], new_bucket, curr_bucket); + if (compare_bucket == curr_bucket) { + // recycle bucket + result = compare_bucket; + + // new bucket is in the hash table, exit + was_bucket_inserted_or_updated = 1; + break; + } + + // another thread took the bucket... + goto retry; + } + + // advance + idx = (idx + 1) % cap; + } while (idx != best_idx); + + // are there enough free buckets? 
+ Assert(was_bucket_inserted_or_updated); + + // output bucket index + *bucket_idx_out = idx; + + return result; +} + +internal U32 +rdib_idx_run_from_bucket_idx(RDIB_IndexRunMap *map, U64 bucket_idx) +{ + RDIB_IndexRunBucket *bucket = map->buckets[bucket_idx]; + U32 idx_run32 = safe_cast_u32(bucket->index_in_output_array); + return idx_run32; +} + +internal +THREAD_POOL_TASK_FUNC(rdib_count_extant_buckets_index_run_map_task) +{ + ProfBeginFunction(); + RDIB_GetExtantBucketsIndexRunMapTask *task = raw_task; + for (U64 bucket_idx = task->ranges[task_id].min; bucket_idx < task->ranges[task_id].max; ++bucket_idx) { + if (task->idx_run_map->buckets[bucket_idx] != 0) { + task->counts[task_id] += 1; + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_get_extant_buckets_index_run_map_task) +{ + ProfBeginFunction(); + RDIB_GetExtantBucketsIndexRunMapTask *task = raw_task; + for (U64 bucket_idx = task->ranges[task_id].min, cursor = task->offsets[task_id]; bucket_idx < task->ranges[task_id].max; ++bucket_idx) { + RDIB_IndexRunBucket *bucket = task->idx_run_map->buckets[bucket_idx]; + if (bucket != 0) { + task->result[cursor] = bucket; + ++cursor; + } + } + ProfEnd(); +} + +internal RDIB_IndexRunBucket ** +rdib_extant_buckets_from_index_run_map(TP_Context *tp, Arena *arena, RDIB_IndexRunMap *idx_run_map, U64 *bucket_count_out) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + RDIB_GetExtantBucketsIndexRunMapTask task = {0}; + task.idx_run_map = idx_run_map; + + ProfBegin("Count Extant Buckets"); + task.counts = push_array(scratch.arena, U64, tp->worker_count); + task.ranges = tp_divide_work(scratch.arena, idx_run_map->cap, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_count_extant_buckets_index_run_map_task, &task); + ProfEnd(); + + *bucket_count_out = sum_array_u64(tp->worker_count, task.counts); + + ProfBegin("Copy Extant Buckets"); + task.offsets = offsets_from_counts_array_u64(scratch.arena, task.counts, 
tp->worker_count); + task.result = push_array(arena, RDIB_IndexRunBucket *, *bucket_count_out); + tp_for_parallel(tp, 0, tp->worker_count, rdib_get_extant_buckets_index_run_map_task, &task); + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); + return task.result; +} + +internal +THREAD_POOL_TASK_FUNC(rdib_index_run_map_bucket_chunk_idx_histo_task) +{ + ProfBeginFunction(); + RDIB_IndexRunMapRadixSort *task = raw_task; + Temp scratch = scratch_begin(0,0); + + U32 *range_histo = push_array(scratch.arena, U32, task->chunk_idx_opl); + + // count items per sorter + for (U64 bucket_idx = task->ranges[task_id].min; bucket_idx < task->ranges[task_id].max; ++bucket_idx) { + RDIB_IndexRunBucket *bucket = task->src[bucket_idx]; + U32 chunk_idx = bucket->sorter.hi; + Assert(chunk_idx < task->chunk_idx_opl); + ++range_histo[chunk_idx]; + } + + // add in per thread sorter counts + for (U64 i = 0; i < task->chunk_idx_opl; ++i) { + ins_atomic_u32_add_eval(&task->chunk_histo[i], range_histo[i]); + } + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_index_run_map_radix_sort_chunk_idx_task) +{ + ProfBeginFunction(); + RDIB_IndexRunMapRadixSort *task = raw_task; + for (U64 bucket_idx = task->ranges[task_id].min; bucket_idx < task->ranges[task_id].max; ++bucket_idx) { + RDIB_IndexRunBucket *bucket = task->src[bucket_idx]; + U32 chunk_idx = bucket->sorter.hi; + U32 dst_idx = ins_atomic_u32_inc_eval(&task->chunk_offsets[chunk_idx]) - 1; + task->dst[dst_idx] = bucket; + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_index_run_map_radix_sort_element_idx_task) +{ + ProfBeginFunction(); + RDIB_IndexRunMapRadixSort *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + U64 range_lo = task->chunk_offsets[chunk_idx]; + U64 range_hi = task->chunk_offsets[chunk_idx] + task->chunk_histo[chunk_idx]; + + ProfBegin("Zero out Histogram"); + U32 histo_bot[1 << 10]; 
MemoryZeroArray(histo_bot); + U32 histo_mid[1 << 11]; MemoryZeroArray(histo_mid); + U32 histo_top[1 << 11]; MemoryZeroArray(histo_top); + ProfEnd(); + + ProfBegin("Element Histogram"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_IndexRunBucket *elem = task->dst[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit_bot = (elem_idx >> 0) % ArrayCount(histo_bot); + U32 digit_mid = (elem_idx >> 10) % ArrayCount(histo_mid); + U32 digit_top = (elem_idx >> 21) % ArrayCount(histo_top); + histo_bot[digit_bot] += 1; + histo_mid[digit_mid] += 1; + histo_top[digit_top] += 1; + } + ProfEnd(); + + ProfBegin("Histogram Counts -> Offsets"); + counts_to_offsets_array_u32(ArrayCount(histo_bot), &histo_bot[0]); + counts_to_offsets_array_u32(ArrayCount(histo_mid), &histo_mid[0]); + counts_to_offsets_array_u32(ArrayCount(histo_top), &histo_top[0]); + ProfEnd(); + + ProfBegin("Sort Bot"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_IndexRunBucket *elem = task->dst[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit = (elem_idx >> 0) % ArrayCount(histo_bot); + U32 src_idx = range_lo + histo_bot[digit]++; + task->src[src_idx] = elem; + } + ProfEnd(); + + ProfBegin("Sort Mid"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_IndexRunBucket *elem = task->src[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit = (elem_idx >> 10) % ArrayCount(histo_mid); + U32 dst_idx = range_lo + histo_mid[digit]++; + task->dst[dst_idx] = elem; + } + ProfEnd(); + + ProfBegin("Sort Top"); + for (U64 i = range_lo; i < range_hi; ++i) { + RDIB_IndexRunBucket *elem = task->dst[i]; + U32 elem_idx = elem->sorter.lo; + U32 digit = (elem_idx >> 21) % ArrayCount(histo_top); + U32 src_idx = range_lo + histo_top[digit]++; + task->src[src_idx] = elem; + } + ProfEnd(); + } + + ProfEnd(); +} + +internal void +rdib_index_run_map_sort_buckets(TP_Context *tp, RDIB_IndexRunBucket **buckets, U64 bucket_count, U64 chunk_idx_opl) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + 
RDIB_IndexRunMapRadixSort task = {0}; + task.chunk_idx_opl = chunk_idx_opl; + task.ranges = tp_divide_work(scratch.arena, bucket_count, tp->worker_count); + task.src = buckets; + task.dst = push_array_no_zero(scratch.arena, RDIB_IndexRunBucket *, bucket_count); + + ProfBegin("Chunk Index Histogram"); + task.chunk_histo = push_array(scratch.arena, U32, chunk_idx_opl); + tp_for_parallel(tp, 0, tp->worker_count, rdib_index_run_map_bucket_chunk_idx_histo_task, &task); + ProfEnd(); + + // sort correctness check on chunk index +#if 0 + for (U64 i = 1; i < bucket_count; ++i) { + RDIB_StringMapBucket *prev = buckets[i - 1]; + RDIB_StringMapBucket *curr = buckets[i + 0]; + U32 prev_chunk_idx = RDIB_StringMap_ChunkIdx32FromSorter(prev->sorter); + U32 curr_chunk_idx = RDIB_StringMap_ChunkIdx32FromSorter(curr->sorter); + AssertAlways(prev_chunk_idx <= curr_chunk_idx); + } +#endif + + ProfBegin("Chunk Histo -> Offsets"); + task.chunk_offsets = offsets_from_counts_array_u32(scratch.arena, task.chunk_histo, chunk_idx_opl); + ProfEnd(); + + ProfBegin("Sort on chunk index"); + tp_for_parallel(tp, 0, tp->worker_count, rdib_index_run_map_radix_sort_chunk_idx_task, &task); + ProfEnd(); + + ProfBegin("Sort on element index"); + task.chunk_offsets = offsets_from_counts_array_u32(scratch.arena, task.chunk_histo, chunk_idx_opl); + task.ranges = tp_divide_work(scratch.arena, chunk_idx_opl, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_index_run_map_radix_sort_element_idx_task, &task); + ProfEnd(); + + // sort correctness check on element index +#if 0 + { + for (U64 i = 1; i < bucket_count; ++i) { + RDIB_IndexRunBucket *prev = buckets[i - 1]; + RDIB_IndexRunBucket *curr = buckets[i + 0]; + U32 prev_chunk_idx = prev->sorter.hi; + U32 curr_chunk_idx = curr->sorter.hi; + if (prev_chunk_idx == curr_chunk_idx) { + U32 prev_elem_idx = prev->sorter.lo; + U32 curr_elem_idx = curr->sorter.lo; + AssertAlways(prev_elem_idx < curr_elem_idx); + } + } + } +#endif + + 
scratch_end(scratch); + ProfEnd(); +} + +internal void +rdib_index_run_map_assign_indices(RDIB_IndexRunBucket **buckets, U64 bucket_count) +{ + ProfBeginFunction(); + for (U64 bucket_idx = 0, cursor = 0; bucket_idx < bucket_count; ++bucket_idx) { + buckets[bucket_idx]->index_in_output_array = cursor; + cursor += buckets[bucket_idx]->indices.count; + } + ProfEnd(); +} + +// index run map specialization + +internal U64 +rdib_index_run_map_insert_item(Arena *arena, RDIB_BuildIndexRunsTask *task, U64 worker_id, U64 item_idx, U64 count, U32 *idxs) +{ + Assert(item_idx < max_U32); + + // do we have a free bucket? + RDIB_IndexRunBucket *bucket = task->free_buckets[worker_id]; + if (bucket == 0) { + bucket = push_array(arena, RDIB_IndexRunBucket, 1); + } + + // fill out bucket + bucket->indices.count = count; + bucket->indices.v = idxs; + bucket->sorter.v = task->sorter_idx << 32 | (U32)item_idx; + + // insert bucket + U64 hash = rdib_index_run_hash(count, idxs); + U64 bucket_idx = max_U64; + RDIB_IndexRunBucket *free_bucket = rdib_index_run_map_insert_or_update(arena, + task->idx_run_map->buckets, + task->idx_run_map->cap, + hash, + bucket, + &bucket_idx); + Assert(bucket_idx != max_U64); + + // recycle bucket + task->free_buckets[worker_id] = free_bucket; + + return bucket_idx; +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_idx_runs_params_task) +{ + ProfBeginFunction(); + + RDIB_BuildIndexRunsTask *task = raw_task; + RDIB_TypeChunk *chunk = task->type_chunks[task_id]; + + for (RDIB_Type *type = &chunk->v[0], *opl = chunk->v + chunk->count; type < opl; ++type) { + if (type->kind == RDI_TypeKind_Function) { + RDIB_Type *params = type->func.params_type; + + // pack params + U64 type_index_count = params->params.count; + U32 *type_indices = push_array_no_zero(arena, U32, type_index_count); + for (U64 param_idx = 0; param_idx < params->params.count; ++param_idx) { + type_indices[param_idx] = rdib_idx_from_type(params->params.types[param_idx]); + } + + // insert type 
indices + U32 func_type_idx = rdib_idx_from_type(type); + type->func.param_idx_run_bucket_idx = rdib_index_run_map_insert_item(arena, task, worker_id, func_type_idx, type_index_count, type_indices); + } else if (type->kind == RDI_TypeKind_Method) { + RDIB_Type *params = type->method.params_type; + + U64 type_index_count = params->params.count + 1; + U32 *type_indices = push_array_no_zero(arena, U32, type_index_count); + U64 type_idx_cursor = 0; + + // pack 'this' type + type_indices[type_idx_cursor++] = rdib_idx_from_type(type->method.this_type); + + // pack params + for (U64 param_idx = 0; param_idx < params->params.count; ++param_idx) { + type_indices[type_idx_cursor++] = rdib_idx_from_type(params->params.types[param_idx]); + } + + // insert type indices + U32 method_type_idx = rdib_idx_from_type(type); + type->method.param_idx_run_bucket_idx = rdib_index_run_map_insert_item(arena, task, worker_id, method_type_idx, type_index_count, type_indices); + } else if (type->kind == RDI_TypeKindExt_StaticMethod) { + RDIB_Type *params = type->static_method.params_type; + + U64 type_index_count = params->params.count + 1; + U32 *type_indices = push_array_no_zero(arena, U32, type_index_count); + U64 type_idx_cursor = 0; + + // static methods don't have 'this' + type_indices[type_idx_cursor++] = 0; + + // pack params + for (U64 param_idx = 0; param_idx < params->params.count; ++param_idx) { + type_indices[type_idx_cursor++] = rdib_idx_from_type(params->params.types[param_idx]); + } + + // insert type indices + U32 static_method_type_idx = rdib_idx_from_type(type); + type->static_method.param_idx_run_bucket_idx = rdib_index_run_map_insert_item(arena, task, worker_id, static_method_type_idx, type_index_count, type_indices); + } + } + + ProfEnd(); +} + +internal U32 +rdib_idx_from_name_map_void_node(RDIB_BuildIndexRunsTask *task, VoidNode *node) +{ + U64 idx = 0; + switch (task->name_map_kind) { + case RDI_NameMapKind_NULL : break; + case RDI_NameMapKind_GlobalVariables : idx = 
rdib_idx_from_variable ((RDIB_Variable* ) node); break; + case RDI_NameMapKind_ThreadVariables : idx = rdib_idx_from_variable ((RDIB_Variable * ) node); break; + case RDI_NameMapKind_Procedures : idx = rdib_idx_from_procedure ((RDIB_Procedure * ) node); break; + case RDI_NameMapKind_Types : idx = rdib_idx_from_type ((RDIB_Type * ) node); break; + case RDI_NameMapKind_LinkNameProcedures: idx = rdib_idx_from_procedure ((RDIB_Procedure * ) node); break; + case RDI_NameMapKind_NormalSourcePaths : idx = rdib_idx_from_source_file((RDIB_SourceFile *) node); break; + default: InvalidPath; + } + U32 idx32 = safe_cast_u32(idx); + return idx32; +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_idx_runs_name_map_buckets_task) +{ + ProfBeginFunction(); + + RDIB_BuildIndexRunsTask *task = raw_task; + + for (U64 bucket_idx = task->ranges[task_id].min; bucket_idx < task->ranges[task_id].max; ++bucket_idx) { + RDIB_StringMapBucket *bucket = task->name_map_buckets[bucket_idx]; + U64 count = void_list_count_nodes(bucket->raw_values); + + if (count > 1) { + // build array of indices that point to name map respective arrays + U32 *idxs = push_array_no_zero(arena, U32, count); + { + U64 curr_idx = 0; + for (VoidNode *curr = bucket->raw_values; curr != 0; curr = curr->next, ++curr_idx) { + idxs[curr_idx] = rdib_idx_from_name_map_void_node(task, curr->v); + } + } + + // make index array deterministic + u32_array_sort(count, idxs); // TODO: we don't need to sort with one worker thread + + // :string_map_bucket_sorter_copy + U64 idx_run_bucket_idx = rdib_index_run_map_insert_item(arena, task, worker_id, bucket_idx, count, idxs); // TODO: fix `idx` leak when we insert same runs + + // fill out bucket + bucket->count = count; + bucket->idx_run_bucket_idx = idx_run_bucket_idx; + } if (count == 1) { + U32 match_idx = rdib_idx_from_name_map_void_node(task, bucket->raw_values->v); + + // fill out bucket + bucket->count = 1; + bucket->match_idx = match_idx; + } + } + + ProfEnd(); +} + 
+//////////////////////////////// + +#if 0 +internal U32 +rdib_idx_from_params(RDIB_IndexRunMap *map, RDIB_Type *params) +{ + Assert(params->kind == RDI_TypeKindExt_Params); + U32 idx = params->params.idx_run_bucket->index_in_output_array; + return idx; +} +#endif + +//////////////////////////////// +// Data Sections + +internal void +rdib_data_section_list_push_node(RDIB_DataSectionList *list, RDIB_DataSectionNode *node) +{ + SLLQueuePushCount(list, node); +} + +internal RDIB_DataSectionNode * +rdib_data_section_list_push(Arena *arena, RDIB_DataSectionList *list, RDIB_DataSection v) +{ + RDIB_DataSectionNode *node = push_array(arena, RDIB_DataSectionNode, 1); + node->v = v; + rdib_data_section_list_push_node(list, node); + return node; +} + +internal void +rdib_data_section_list_concat_in_place(RDIB_DataSectionList *list, RDIB_DataSectionList *to_concat) +{ + SLLConcatInPlace(list, to_concat); +} + +internal void +rdib_data_sections_from_top_level_info(Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_TopLevelInfo *src) +{ + ProfBeginFunction(); + + RDI_TopLevelInfo *dst = push_array(arena, RDI_TopLevelInfo, 1); + dst->arch = src->arch; + dst->exe_name_string_idx = rdib_idx_from_string_map(string_map, src->exe_name); + dst->exe_hash = src->exe_hash; + dst->voff_max = src->voff_max; + dst->producer_name_string_idx = rdib_idx_from_string_map(string_map, src->producer_string); + + RDIB_DataSection sect = { .tag = RDI_SectionKind_TopLevelInfo }; + str8_list_push(arena, §.data, str8_struct(dst)); + rdib_data_section_list_push(arena, sect_list, sect); + + ProfEnd(); +} + +internal void +rdib_data_sections_from_binary_sections(Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_BinarySection *binary_sects, U64 binary_sects_count) +{ + ProfBeginFunction(); + + RDI_BinarySection *dst_arr = push_array(arena, RDI_BinarySection, binary_sects_count); + + for (U64 sect_idx = 0; sect_idx < binary_sects_count; ++sect_idx) 
{ + RDIB_BinarySection *src = &binary_sects[sect_idx]; + RDI_BinarySection *dst = &dst_arr[sect_idx]; + + dst->name_string_idx = rdib_idx_from_string_map(string_map, src->name); + dst->flags = src->flags; + dst->voff_first = src->voff_first; + dst->voff_opl = src->voff_opl; + dst->foff_first = src->foff_first; + dst->foff_opl = src->foff_opl; + } + + RDIB_DataSection sect = { .tag = RDI_SectionKind_BinarySections }; + str8_list_push(arena, §.data, str8_array(dst_arr, binary_sects_count)); + rdib_data_section_list_push(arena, sect_list, sect); + + ProfEnd(); +} + +internal void +rdib_data_sections_from_units(Arena *arena, + RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + RDIB_PathTree *path_tree, + U64 total_unit_count, + U64 unit_chunk_count, + RDIB_UnitChunk **unit_chunks) +{ + ProfBeginFunction(); + + RDI_Unit *dst_arr = push_array(arena, RDI_Unit, total_unit_count); + for (U64 chunk_idx = 0; chunk_idx < unit_chunk_count; chunk_idx += 1) { + RDIB_UnitChunk *chunk = unit_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; i += 1) { + RDIB_Unit *src = &chunk->v[i]; + U64 unit_idx = rdib_idx_from_unit(src); + RDI_Unit *dst = &dst_arr[unit_idx]; + dst->unit_name_string_idx = rdib_idx_from_string_map(string_map, src->unit_name); + dst->compiler_name_string_idx = rdib_idx_from_string_map(string_map, src->compiler_name); + dst->source_file_path_node = rdib_idx_from_path_tree(path_tree, src->source_file); + dst->object_file_path_node = rdib_idx_from_path_tree(path_tree, src->object_file); + dst->archive_file_path_node = rdib_idx_from_path_tree(path_tree, src->archive_file); + dst->build_path_node = rdib_idx_from_path_tree(path_tree, src->build_path); + dst->language = src->language; + dst->line_table_idx = src->line_table->output_array_idx; + } + } + + RDIB_DataSection sect = { .tag = RDI_SectionKind_Units }; + str8_list_push(arena, §.data, str8_array(dst_arr, total_unit_count)); + rdib_data_section_list_push(arena, sect_list, sect); + + ProfEnd(); +} 
+ +internal +THREAD_POOL_TASK_FUNC(rdib_vmap_count_ranges_unit_task) +{ + ProfBeginFunction(); + RDIB_VMapBuilderTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_UnitChunk *chunk = task->unit_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Unit *unit = &chunk->v[i]; + task->counts[task_id] += unit->virt_range_count; + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_vmap_count_ranges_gvar_task) +{ + ProfBeginFunction(); + RDIB_VMapBuilderTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_VariableChunk *chunk = task->gvar_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Variable *var = &chunk->v[i]; + for (RDIB_LocationNode *loc_n = var->locations.first; loc_n != 0; loc_n = loc_n->next) { + task->counts[task_id] += loc_n->v.ranges.count; + } + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_vmap_count_ranges_scope_task) +{ + ProfBeginFunction(); + RDIB_VMapBuilderTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_ScopeChunk *chunk = task->scope_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + task->counts[task_id] += chunk->v[i].ranges.count; + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_fill_vmap_entries_unit_task) +{ + ProfBeginFunction(); + RDIB_VMapBuilderTask *task = raw_task; + U64 range_cursor = task->offsets[task_id]; + U64 range_cursor_opl = task->offsets[task_id] + task->counts[task_id]; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_UnitChunk *chunk = task->unit_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Unit *unit = &chunk->v[i]; + for (Rng1U64 *range_ptr = unit->virt_ranges, *range_opl = unit->virt_ranges + 
unit->virt_range_count; + range_ptr < range_opl; ++range_ptr) { + Assert(range_cursor < range_cursor_opl); + Assert(range_ptr->min <= range_ptr->max); + + RDIB_VMapRange *vmap_range = task->vmap + range_cursor; + vmap_range->voff = range_ptr->min; + vmap_range->size = range_ptr->max - range_ptr->min; + vmap_range->idx = rdib_idx_from_unit(unit); + range_cursor += 1; + } + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_fill_vmap_entries_gvar_task) +{ + ProfBeginFunction(); + RDIB_VMapBuilderTask *task = raw_task; + + U64 range_cursor = task->offsets[task_id]; + U64 range_cursor_opl = task->offsets[task_id] + task->counts[task_id]; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_VariableChunk *chunk = task->gvar_chunks[chunk_idx]; + for (U64 var_idx = 0; var_idx < chunk->count; ++var_idx) { + RDIB_Variable *var = &chunk->v[var_idx]; + for (RDIB_LocationNode *loc_n = var->locations.first; loc_n != 0; loc_n = loc_n->next) { + for (Rng1U64Node *range_n = loc_n->v.ranges.first; range_n != 0; range_n = range_n->next) { + Assert(range_cursor < range_cursor_opl); + Assert(range_n->v.min <= range_n->v.max); + U64 size = range_n->v.max - range_n->v.min; + + RDIB_VMapRange *vmap_range = task->vmap + range_cursor; + vmap_range->voff = range_n->v.min; + vmap_range->size = size; + vmap_range->idx = rdib_idx_from_variable(var); + range_cursor += 1; + } + } + } + } + + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_fill_vmap_entries_scope_task) +{ + ProfBeginFunction(); + RDIB_VMapBuilderTask *task = raw_task; + + U64 range_cursor = task->offsets[task_id]; + U64 range_cursor_opl = task->offsets[task_id] + task->counts[task_id]; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_ScopeChunk *chunk = task->scope_chunks[chunk_idx]; + for (U64 scope_idx = 0; scope_idx < chunk->count; ++scope_idx) { + RDIB_Scope *scope = 
&chunk->v[scope_idx]; + for (Rng1U64Node *range_n = scope->ranges.first; range_n != 0; range_n = range_n->next) { + Assert(range_cursor < range_cursor_opl); + Assert(range_n->v.min <= range_n->v.max); + + RDIB_VMapRange *vmap_range = task->vmap + range_cursor; + vmap_range->voff = range_n->v.min; + vmap_range->size = range_n->v.max - range_n->v.min; + vmap_range->idx = rdib_idx_from_scope(scope); + range_cursor += 1; + } + } + } + + ProfEnd(); +} + +internal void +rdib_sort_procs_radix_32(RDIB_Procedure **v, U64 count) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + + RDIB_Procedure **temp = push_array_no_zero(scratch.arena, RDIB_Procedure *, count); + RDIB_Procedure **src = v; + RDIB_Procedure **dst = temp; + + ProfBegin("Count Memzero"); + U32 count_8lo[256]; MemoryZeroArray(count_8lo); + U32 count_8hi[256]; MemoryZeroArray(count_8hi); + U32 count_16[1 << 16]; MemoryZeroArray(count_16); + ProfEnd(); + + ProfBegin("Histogram"); + for (U64 i = 0; i < count; i += 1) { + RDIB_Procedure *p = src[i]; + + U64 digit_8lo = (p->scope->ranges.first->v.min >> 0) % ArrayCount(count_8lo); + U64 digit_8hi = (p->scope->ranges.first->v.min >> 8) % ArrayCount(count_8hi); + U64 digit_16 = (p->scope->ranges.first->v.min >> 16) % ArrayCount(count_16); + + count_8lo[digit_8lo] += 1; + count_8hi[digit_8hi] += 1; + count_16[digit_16] += 1; + } + ProfEnd(); + + ProfBegin("Counts -> Offsets"); + counts_to_offsets_array_u32(ArrayCount(count_8lo), count_8lo); + counts_to_offsets_array_u32(ArrayCount(count_8hi), count_8hi); + counts_to_offsets_array_u32(ArrayCount(count_16), count_16 ); + ProfEnd(); + + ProfBegin("Order 8 Lo"); + for (U64 i = 0; i < count; i += 1) { + RDIB_Procedure *p = src[i]; + U64 digit = (p->scope->ranges.first->v.min >> 0) % ArrayCount(count_8lo); + dst[count_8lo[digit]++] = p; + } + ProfEnd(); + + ProfBegin("Order 8 Hi"); + for (U64 i = 0; i < count; i += 1) { + RDIB_Procedure *p = dst[i]; + U64 digit = (p->scope->ranges.first->v.min >> 8) % 
ArrayCount(count_8hi); + src[count_8hi[digit]++] = p; + } + ProfEnd(); + + ProfBegin("Order 16"); + for (U64 i = 0; i < count; i += 1) { + RDIB_Procedure *p = src[i]; + U64 digit = (p->scope->ranges.first->v.min >> 16) % ArrayCount(count_16); + dst[count_16[digit]++] = p; + } + ProfEnd(); + + MemoryCopyTyped(src, dst, count); + + scratch_end(scratch); + ProfEnd(); +} + +internal String8List +rdib_data_from_vmap(Arena *arena, U64 range_count, RDIB_VMapRange *ranges) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + const U64 size_bit_count0 = 8; + const U64 size_bit_count1 = 8; + const U64 size_bit_count2 = 16; + + const U64 voff_bit_count0 = 11; + const U64 voff_bit_count1 = 11; + const U64 voff_bit_count2 = 10; + + ProfBegin("Push shared buffer"); + U64 radix_memory_size = sizeof(RDIB_VMapRange) * range_count + + sizeof(U32) * ((1 << size_bit_count0) + (1 << size_bit_count1) + (1 << size_bit_count2)) + + sizeof(U32) * ((1 << voff_bit_count0) + (1 << voff_bit_count1) + (1 << voff_bit_count2)); + U8 *radix_memory = push_array_no_zero(arena, U8, radix_memory_size); + ProfEnd(); + + // TODO: windows caps images at 4GiB so we use 32-bit radix sort, but on linux + // images can have > 4GiB and we need to detect when vmap uses upper 32bits + // in voffs do a 64-bit radix sort. 
+ ProfBegin("Sort"); + { + RDIB_VMapRange *src = ranges; + RDIB_VMapRange *dst = (RDIB_VMapRange *)radix_memory; + + U32 *size_count0 = (U32 *)(dst + range_count); + U32 *size_count1 = size_count0 + (1 << size_bit_count0); + U32 *size_count2 = size_count1 + (1 << size_bit_count1); + + U32 *voff_count0 = size_count2 + (1 << size_bit_count2); + U32 *voff_count1 = voff_count0 + (1 << voff_bit_count0); + U32 *voff_count2 = voff_count1 + (1 << voff_bit_count1); + + // + // Build histogram + // + + MemoryZeroTyped(size_count0, 1 << size_bit_count0); + MemoryZeroTyped(size_count1, 1 << size_bit_count1); + MemoryZeroTyped(size_count2, 1 << size_bit_count2); + + MemoryZeroTyped(voff_count0, 1 << voff_bit_count0); + MemoryZeroTyped(voff_count1, 1 << voff_bit_count1); + MemoryZeroTyped(voff_count2, 1 << voff_bit_count2); + + for (U64 i = 0; i < range_count; ++i) { + RDIB_VMapRange *r = src+i; + + U32 size_digit0 = (-r->size >> 0) % (1 << size_bit_count0); + U32 size_digit1 = (-r->size >> size_bit_count0) % (1 << size_bit_count1); + U32 size_digit2 = (-r->size >> (size_bit_count0 + size_bit_count1)) % (1 << size_bit_count2); + + U64 voff_digit0 = (r->voff >> 0) % (1 << voff_bit_count0); + U64 voff_digit1 = (r->voff >> voff_bit_count0) % (1 << voff_bit_count1); + U64 voff_digit2 = (r->voff >> (voff_bit_count0 + voff_bit_count1)) % (1 << voff_bit_count2); + + ++size_count0[size_digit0]; + ++size_count1[size_digit1]; + ++size_count2[size_digit2]; + + ++voff_count0[voff_digit0]; + ++voff_count1[voff_digit1]; + ++voff_count2[voff_digit2]; + } + + counts_to_offsets_array_u32((1 << size_bit_count0), size_count0); + counts_to_offsets_array_u32((1 << size_bit_count1), size_count1); + counts_to_offsets_array_u32((1 << size_bit_count2), size_count2); + + counts_to_offsets_array_u32((1 << voff_bit_count0), voff_count0); + counts_to_offsets_array_u32((1 << voff_bit_count1), voff_count1); + counts_to_offsets_array_u32((1 << voff_bit_count2), voff_count2); + + // + // Sort on range size 
(high to low) + // + + for (U64 i = 0; i < range_count; ++i) { + RDIB_VMapRange r = src[i]; + U32 digit = (-r.size >> 0) % (1 << size_bit_count0); + dst[size_count0[digit]++] = r; + } + + for (U64 i = 0; i < range_count; ++i) { + RDIB_VMapRange r = dst[i]; + U32 digit = (-r.size >> size_bit_count0) % (1 << size_bit_count1); + src[size_count1[digit]++] = r; + } + + for (U64 i = 0; i < range_count; ++i) { + RDIB_VMapRange r = src[i]; + U32 digit = (-r.size >> (size_bit_count0 + size_bit_count1)) % (1 << size_bit_count2); + dst[size_count2[digit]++] = r; + } + + // + // Sort on range voff (low to high) + // + + for (U64 i = 0; i < range_count; ++i) { + RDIB_VMapRange r = dst[i]; + U32 digit = (r.voff >> 0) % (1 << voff_bit_count0); + src[voff_count0[digit]++] = r; + } + + for (U64 i = 0; i < range_count; ++i) { + RDIB_VMapRange r = src[i]; + U32 digit = (r.voff >> voff_bit_count0) % (1 << voff_bit_count1); + dst[voff_count1[digit]++] = r; + } + + for (U64 i = 0; i < range_count; ++i) { + RDIB_VMapRange r = dst[i]; + U32 digit = (r.voff >> (voff_bit_count0 + voff_bit_count1)) % (1 << voff_bit_count2); + src[voff_count2[digit]++] = r; + } + } + ProfEnd(); + + ProfBegin("Layout virtual map"); + String8List raw_vmap = {0}; + { + U64 default_vme_cap = 4096; + U64 vme_block_cap = radix_memory_size / sizeof(RDI_VMapEntry); + U64 vme_block_size = 0; + RDI_VMapEntry *vme_block = (RDI_VMapEntry *)radix_memory; + + // Recycle radix sort memory + str8_list_push(arena, &raw_vmap, str8_array(vme_block, vme_block_cap)); + +#define push_vme() (vme_block_size < raw_vmap.last->string.size/sizeof(vme_block[0])) ? 
&vme_block[vme_block_size++] : \ + (vme_block = push_array(arena, RDI_VMapEntry, vme_block_cap), \ + vme_block_cap = default_vme_cap, \ + vme_block_size = 0, \ + str8_list_push(arena, &raw_vmap, str8_array(vme_block, vme_block_cap)), \ + &vme_block[vme_block_size++]) + + struct Stack { + RDIB_VMapRange *range; + struct Stack *next; + }; + struct Stack *stack = 0; + struct Stack *free_stack = 0; + stack = push_array(scratch.arena, struct Stack, 1); + stack->range = &ranges[0]; + + for (U64 range_idx = 1; range_idx < range_count; ++range_idx) { + RDIB_VMapRange *r = ranges+range_idx; + RDIB_VMapRange *last_bot_range = stack->range; + RDIB_VMapRange *last_pop_range = 0; + while (stack->range->idx != 0) { + if (r->voff < stack->range->voff + stack->range->size) { + // Current range is a subset, keep building stack + break; + } + + struct Stack *frame = stack; + SLLStackPop(stack); + + // Did we reach bottom most range? + if (last_pop_range == 0) { + // Don't push VME for index with adjacent ranges + if (vme_block_size > 0 && vme_block[vme_block_size-1].idx != frame->range->idx) { + RDI_VMapEntry *vme = push_vme(); + vme->voff = frame->range->voff; + vme->idx = frame->range->idx; + } + } + + // Reopen parent range + // + // Does parent range extend past child range? + if (stack->range->voff + stack->range->size != frame->range->voff + frame->range->size && + // Does next range open on where stack range ends? 
+ r->voff != frame->range->voff + frame->range->size) { + RDI_VMapEntry *vme = push_vme(); + vme->idx = stack->range->idx; + vme->voff = frame->range->voff + frame->range->size; + } + + last_pop_range = stack->range; + + // Recycle stack frame + SLLStackPush(free_stack, frame); + } + + // Prefix + if (last_pop_range == 0 && last_bot_range->voff != r->voff) { + RDI_VMapEntry* vme = push_vme(); + vme->voff = last_bot_range->voff; + vme->idx = last_bot_range->idx; + } + + struct Stack *frame; + if (free_stack == 0) { + frame = push_array(scratch.arena, struct Stack, 1); + } else { + frame = free_stack; + SLLStackPop(free_stack); + } + frame->range = r; + SLLStackPush(stack, frame); + } + + // Empty stack + { + RDIB_VMapRange *last_pop_range = 0; + while (stack->range->idx != 0) { + struct Stack *frame = stack; + SLLStackPop(stack); + + if (last_pop_range == 0) { + if (vme_block_size > 0 && vme_block[vme_block_size-1].idx != frame->range->idx) { + RDI_VMapEntry *vme = push_vme(); + vme->voff = frame->range->voff; + vme->idx = frame->range->idx; + } + } + + if (stack->range->voff + stack->range->size != frame->range->voff + frame->range->size) { + RDI_VMapEntry *vme = push_vme(); + vme->voff = frame->range->voff + frame->range->size; + vme->idx = stack->range->idx; + } + + last_pop_range = stack->range; + } + } + + // Subtract unsued vmap entries + U64 last_vme_unused = raw_vmap.last->string.size - sizeof(vme_block[0]) * vme_block_size; + raw_vmap.last->string.size -= last_vme_unused; + raw_vmap.total_size -= last_vme_unused; + +#undef push_vme + } + ProfEnd(); + + + // duplicate voff check +#if 0 + U64 prev = max_U64; + for (String8Node *node = raw_vmap.first; node != 0; node = node->next) { + RDI_VMapEntry *e = (RDI_VMapEntry*)node->string.str; + for (U64 i = 0, c = node->string.size / sizeof(RDI_VMapEntry); i < c; ++i) { + Assert(e[i].voff != prev); + prev = e[i].voff; + } + } +#endif + + scratch_end(scratch); + ProfEnd(); + return raw_vmap; +} + 
+THREAD_POOL_TASK_FUNC(rdib_fill_scope_vmaps_task) +{ + ProfBeginFunction(); + RDIB_VMapBuilderTask *task = raw_task; + task->raw_vmaps[task_id] = rdib_data_from_vmap(arena, task->vmap_counts[task_id], task->vmaps[task_id]); + ProfEnd(); +} + +internal void +rdib_data_sections_from_unit_gvar_scope_vmaps(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + U64 unit_chunk_count, RDIB_UnitChunk **unit_chunks, + U64 gvar_chunk_count, RDIB_VariableChunk **gvar_chunks, + U64 scope_chunk_count, RDIB_ScopeChunk **scope_chunks) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + RDIB_VMapBuilderTask task = {0}; + task.counts = push_array(scratch.arena, U64, tp->worker_count); + task.ranges = tp_divide_work(scratch.arena, unit_chunk_count, tp->worker_count); + + ProfBegin("Unit VMap"); + U64 unit_vmap_count; + RDIB_VMapRange *unit_vmaps; + { + ProfBegin("Count Ranges"); + MemoryZeroTyped(task.counts, tp->worker_count); + task.unit_chunks = unit_chunks; + task.ranges = tp_divide_work(scratch.arena, unit_chunk_count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_vmap_count_ranges_unit_task, &task); + ProfEnd(); + + ProfBegin("Push"); + unit_vmap_count = sum_array_u64(tp->worker_count, task.counts); + unit_vmaps = push_array_no_zero(scratch.arena, RDIB_VMapRange, unit_vmap_count); + ProfEnd(); + + ProfBegin("Fill"); + task.vmap = unit_vmaps; + task.unit_chunks = unit_chunks; + task.offsets = offsets_from_counts_array_u64(scratch.arena, task.counts, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_fill_vmap_entries_unit_task, &task); + ProfEnd(); + } + ProfEnd(); + + ProfBegin("Global Variables"); + U64 gvar_vmap_count; + RDIB_VMapRange *gvar_vmaps; + { + ProfBegin("Count"); + MemoryZeroTyped(task.counts, tp->worker_count); + task.ranges = tp_divide_work(scratch.arena, gvar_chunk_count, tp->worker_count); + task.gvar_chunks = gvar_chunks; + tp_for_parallel(tp, 0, tp->worker_count, 
rdib_vmap_count_ranges_gvar_task, &task); + ProfEnd(); + + ProfBegin("Push"); + gvar_vmap_count = sum_array_u64(tp->worker_count, task.counts); + gvar_vmaps = push_array_no_zero(scratch.arena, RDIB_VMapRange, gvar_vmap_count); + ProfEnd(); + + ProfBegin("Fill"); + task.vmap = gvar_vmaps; + task.gvar_chunks = gvar_chunks; + task.offsets = offsets_from_counts_array_u64(scratch.arena, task.counts, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_fill_vmap_entries_gvar_task, &task); + ProfEnd(); + } + ProfEnd(); + + ProfBegin("Scopes"); + U64 scope_vmap_count; + RDIB_VMapRange *scope_vmaps; + { + ProfBegin("Count"); + MemoryZeroTyped(task.counts, tp->worker_count); + task.ranges = tp_divide_work(scratch.arena, scope_chunk_count, tp->worker_count); + task.scope_chunks = scope_chunks; + tp_for_parallel(tp, 0, tp->worker_count, rdib_vmap_count_ranges_scope_task, &task); + ProfEnd(); + + ProfBegin("Push"); + scope_vmap_count = sum_array_u64(tp->worker_count, task.counts); + scope_vmaps = push_array_no_zero(scratch.arena, RDIB_VMapRange, scope_vmap_count); + ProfEnd(); + + ProfBegin("Fill"); + task.vmap = scope_vmaps; + task.scope_chunks = scope_chunks; + task.offsets = offsets_from_counts_array_u64(scratch.arena, task.counts, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_fill_vmap_entries_scope_task, &task); + ProfEnd(); + } + ProfEnd(); + + task.vmap_counts[0] = unit_vmap_count; + task.vmap_counts[1] = gvar_vmap_count; + task.vmap_counts[2] = scope_vmap_count; + task.vmaps[0] = unit_vmaps; + task.vmaps[1] = gvar_vmaps; + task.vmaps[2] = scope_vmaps; + + ProfBegin("Fill RDI VMaps"); + MemoryZeroArray(task.raw_vmaps); + tp_for_parallel(tp, arena, 3, rdib_fill_scope_vmaps_task, &task); + ProfEnd(); + + RDIB_DataSection unit_vmap_sect = { .tag = RDI_SectionKind_UnitVMap, .data = task.raw_vmaps[0] }; + RDIB_DataSection gvar_vmap_sect = { .tag = RDI_SectionKind_GlobalVMap, .data = task.raw_vmaps[1] }; + RDIB_DataSection scope_vmap_sect = { 
.tag = RDI_SectionKind_ScopeVMap, .data = task.raw_vmaps[2] }; + rdib_data_section_list_push(arena->v[0], sect_list, unit_vmap_sect ); + rdib_data_section_list_push(arena->v[0], sect_list, gvar_vmap_sect ); + rdib_data_section_list_push(arena->v[0], sect_list, scope_vmap_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_copy_string_data_task) +{ + RDIB_CopyStringDataTask *task = raw_task; + for (U64 bucket_idx = task->ranges[task_id].min; bucket_idx < task->ranges[task_id].max; ++bucket_idx) { + RDIB_StringMapBucket *bucket = task->buckets[bucket_idx]; + U64 string_table_offset = task->string_table[bucket_idx]; + Assert(string_table_offset + bucket->string.size <= task->string_data_size); + MemoryCopy(task->string_data + string_table_offset, bucket->string.str, bucket->string.size); + } +} + +internal void +rdib_data_sections_from_string_map(TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMapBucket **buckets, U64 bucket_count) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + // assign string table offset for each bucket + U64 cursor = 0; + U32 *string_table = push_array_no_zero(arena, U32, bucket_count); + for (U64 bucket_idx = 0; bucket_idx < bucket_count; ++bucket_idx) { + string_table[bucket_idx] = cursor; + cursor += buckets[bucket_idx]->string.size; + } + + // populate string data buffer with bucket strings + RDIB_CopyStringDataTask task = {0}; + task.string_table = string_table; + task.string_data_size = cursor; + task.string_data = push_array_no_zero(arena, U8, task.string_data_size); + task.buckets = buckets; + task.ranges = tp_divide_work(scratch.arena, bucket_count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_copy_string_data_task, &task); + + // fill out string table section + RDIB_DataSection string_table_sect = {0}; + string_table_sect.tag = RDI_SectionKind_StringTable; + str8_list_push(arena, &string_table_sect.data, str8((U8 
*)task.string_table, sizeof(task.string_table[0]) * bucket_count)); + + // fill out string data section + RDIB_DataSection string_data_sect = { .tag = RDI_SectionKind_StringData }; + str8_list_push(arena, &string_data_sect.data, str8(task.string_data, task.string_data_size)); + + // push sections to list + rdib_data_section_list_push(arena, sect_list, string_table_sect); + rdib_data_section_list_push(arena, sect_list, string_data_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_idx_run_copy_task) +{ + RDIB_IdxRunCopyTask *task = raw_task; + for (U64 bucket_idx = task->ranges[task_id].min; bucket_idx < task->ranges[task_id].max; ++bucket_idx) { + RDIB_IndexRunBucket *bucket = task->buckets[bucket_idx]; + MemoryCopyTyped(&task->output_array[bucket->index_in_output_array], bucket->indices.v, bucket->indices.count); + } +} + +internal void +rdib_data_sections_from_index_runs(TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_IndexRunBucket **buckets, U64 bucket_count) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + ProfBegin("Count Indices"); + U64 total_index_count = 0; + for (U64 bucket_idx = 0; bucket_idx < bucket_count; ++bucket_idx) { + total_index_count += buckets[bucket_idx]->indices.count; + } + ProfEnd(); + + U32 *output_array = push_array_no_zero(arena, U32, total_index_count); + + RDIB_IdxRunCopyTask task = {0}; + task.buckets = buckets; + task.ranges = tp_divide_work(scratch.arena, bucket_count, tp->worker_count); + task.output_array = output_array; + tp_for_parallel(tp, 0, tp->worker_count, rdib_idx_run_copy_task, &task); + + RDIB_DataSection data_sect = { .tag = RDI_SectionKind_IndexRuns }; + str8_list_push(arena, &data_sect.data, str8_array(output_array, total_index_count)); + + rdib_data_section_list_push(arena, sect_list, data_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_file_path_nodes_task) +{ + ProfBeginFunction(); + 
RDIB_BuildFilePathNodesTask *task = raw_task; + RDIB_StringMap *string_map = task->string_map; + RDIB_PathTree *path_tree = task->path_tree; + for (RDIB_PathTreeNode *n = path_tree->node_lists[task_id].first; n != 0; n = n->next_order) { + RDI_FilePathNode *dst = task->nodes_dst + n->node_idx; + dst->name_string_idx = rdib_idx_from_string_map(string_map, n->sub_path); + + B32 is_source_file_node = (n->first_child == 0); + if (is_source_file_node) { + dst->source_file_idx = rdib_idx_from_source_file(n->src_file); + } else { + // directories don't have a source file + Assert(n->src_file == 0); + dst->source_file_idx = 0; + } + + if(n->parent) { + dst->parent_path_node = n->parent->node_idx; + } + if (n->first_child) { + dst->first_child = n->first_child->node_idx; + } + if (n->next_sibling) { + dst->next_sibling = n->next_sibling->node_idx; + } + } + ProfEnd(); +} + +internal void +rdib_data_sections_from_path_tree(TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_PathTree *path_tree) +{ + ProfBeginFunction(); + RDI_FilePathNode *nodes_dst = push_array_no_zero(arena, RDI_FilePathNode, path_tree->node_count); + + RDIB_BuildFilePathNodesTask task = {0}; + task.path_tree = path_tree; + task.string_map = string_map; + task.nodes_dst = nodes_dst; + tp_for_parallel(tp, 0, path_tree->list_count, rdib_build_file_path_nodes_task, &task); + + RDIB_DataSection data_sect = { .tag = RDI_SectionKind_FilePathNodes }; + str8_list_push(arena, &data_sect.data, str8_array(nodes_dst, path_tree->node_count)); + + rdib_data_section_list_push(arena, sect_list, data_sect); + + ProfEnd(); +} + +internal RDIB_PathTree * +rdib_build_path_tree(Arena *arena, + U64 worker_count, + RDIB_SourceFile *null_src_file, + U64 unit_chunk_count, + RDIB_UnitChunk **unit_chunks, + U64 src_file_chunk_count, + RDIB_SourceFileChunk **src_file_chunks) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + RDIB_PathTree *tree = 
rdib_path_tree_init(arena, worker_count); + + rdib_path_tree_insert(arena, tree, str8_lit(""), null_src_file); + + ProfBegin("Units"); + for (U64 ichunk = 0; ichunk < unit_chunk_count; ++ichunk) { + RDIB_UnitChunk *chunk = unit_chunks[ichunk]; + for (U64 iunit = 0; iunit < chunk->count; ++iunit) { + RDIB_Unit *unit = &chunk->v[iunit]; + rdib_path_tree_insert(arena, tree, unit->source_file, null_src_file); + rdib_path_tree_insert(arena, tree, unit->object_file, null_src_file); + rdib_path_tree_insert(arena, tree, unit->archive_file, null_src_file); + rdib_path_tree_insert(arena, tree, unit->build_path, null_src_file); + } + } + ProfEnd(); + + ProfBegin("Source Files"); + for (U64 chunk_idx = 0; chunk_idx < src_file_chunk_count; ++chunk_idx) { + RDIB_SourceFileChunk *chunk = src_file_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_SourceFile *src_file = chunk->v + i; + rdib_path_tree_insert(arena, tree, src_file->normal_full_path, src_file); + } + } + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); + return tree; +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_var_section_task) +{ + ProfBeginDynamic("Global Variables Task %llu", task_id); + RDIB_BuildSymbolSectionTask *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_VariableChunk *chunk = task->gvars_rdib[chunk_idx]; + RDI_GlobalVariable *vars = push_array_no_zero(arena, RDI_GlobalVariable, chunk->count); + + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Variable *src = &chunk->v[i]; + RDI_GlobalVariable *dst = &vars[i]; + + // TODO: temporary hack while we don't have bytecode eval in RDI_GlobalVariable + U64 voff = 0; + if (src->locations.first != 0) { + if (src->locations.first->v.kind == RDI_LocationKind_AddrBytecodeStream && src->locations.first->v.bytecode.first->op == RDI_EvalOp_ModuleOff) { + voff = src->locations.first->v.bytecode.first->p; + } + } + + dst->name_string_idx = 
rdib_idx_from_string_map(task->string_map, src->name); + dst->voff = voff; + dst->type_idx = rdib_idx_from_type(src->type); + dst->link_flags = src->link_flags; + + if (src->container_type != 0) { + Assert(!src->container_proc); + dst->link_flags |= RDI_LinkFlag_TypeScoped; + dst->container_idx = rdib_idx_from_udt_type(src->container_type); + } + if (src->container_proc != 0) { + Assert(!src->container_type); + dst->link_flags |= RDI_LinkFlag_ProcScoped; + dst->container_idx = rdib_idx_from_procedure(src->container_proc); + } + } + + str8_list_push(arena, &task->gvars_out[task_id], str8_array(vars, chunk->count)); + } + + ProfEnd(); +} + +internal void +rdib_data_sections_from_global_variables(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + U64 total_count, + U64 chunk_count, + RDIB_VariableChunk **chunks) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + ProfBegin("Build"); + RDIB_BuildSymbolSectionTask task = {0}; + task.string_map = string_map; + task.ranges = tp_divide_work(scratch.arena, chunk_count, tp->worker_count); + task.gvars_rdib = chunks; + task.gvars_out = push_array(scratch.arena, String8List, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, rdib_build_var_section_task, &task); + ProfEnd(); + + RDIB_DataSection gvars_sect = { .tag = RDI_SectionKind_GlobalVariables }; + str8_list_concat_in_place_array(&gvars_sect.data, task.gvars_out, tp->worker_count); + rdib_data_section_list_push(arena->v[0], sect_list, gvars_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_tvar_section_task) +{ + RDIB_BuildSymbolSectionTask *task = raw_task; + ProfBeginDynamic("Thread Variables Task [Chunk Count: %llu]", task->ranges[task_id].max - task->ranges[task_id].min); + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_VariableChunk *chunk = 
task->tvars_rdib[chunk_idx]; + RDI_ThreadVariable *vars = push_array_no_zero(arena, RDI_ThreadVariable, chunk->count); + + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Variable *src = &chunk->v[i]; + RDI_ThreadVariable *dst = &vars[i]; + + U32 tls_off = 0; + if (src->locations.first != 0) { + if (src->locations.first->v.kind == RDI_LocationKind_AddrBytecodeStream && src->locations.first->v.bytecode.first->op == RDI_EvalOp_TLSOff) { + tls_off = src->locations.first->v.bytecode.first->p; + } + } + + dst->name_string_idx = rdib_idx_from_string_map(task->string_map, src->name); + dst->tls_off = tls_off; + dst->type_idx = rdib_idx_from_type(src->type); + dst->link_flags = src->link_flags; // vars is not zeroed (push_array_no_zero): seed the flags before the '|=' below + if (src->container_type != 0) { + Assert(!src->container_proc); + dst->link_flags |= RDI_LinkFlag_TypeScoped; + dst->container_idx = rdib_idx_from_udt_type(src->container_type); + } + if (src->container_proc != 0) { + Assert(!src->container_type); + dst->link_flags |= RDI_LinkFlag_ProcScoped; + dst->container_idx = rdib_idx_from_procedure(src->container_proc); + } + } + + str8_list_push(arena, &task->tvars_out[task_id], str8_array(vars, chunk->count)); + } + + ProfEnd(); +} +// Converts thread-variable chunks to RDI_ThreadVariable arrays on the thread pool and pushes them to sect_list as one ThreadVariables section. +internal void +rdib_data_sections_from_thread_variables(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + U64 total_count, + U64 chunk_count, + RDIB_VariableChunk **chunks) +{ + ProfBeginDynamic("Thread Variables [Chunk Count: %llu, Total Count %llu]", chunk_count, total_count); + Temp scratch = scratch_begin(arena->v, arena->count); + + ProfBegin("Build"); + RDIB_BuildSymbolSectionTask task = {0}; + task.string_map = string_map; + task.ranges = tp_divide_work(scratch.arena, chunk_count, tp->worker_count); + task.tvars_rdib = chunks; + task.tvars_out = push_array(scratch.arena, String8List, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, rdib_build_tvar_section_task, &task); + ProfEnd(); + + RDIB_DataSection tvars_sect = { .tag = RDI_SectionKind_ThreadVariables }; + 
str8_list_concat_in_place_array(&tvars_sect.data, task.tvars_out, tp->worker_count); + rdib_data_section_list_push(arena->v[0], sect_list, tvars_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_procs_section_task) +{ + RDIB_BuildSymbolSectionTask *task = raw_task; + ProfBeginDynamic("Procedures Task [Chunk Count: %llu]", task->ranges[task_id].max - task->ranges[task_id].min); + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_ProcedureChunk *chunk = task->procs_rdib[chunk_idx]; + RDI_Procedure *procs = push_array_no_zero(arena, RDI_Procedure, chunk->count); + + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_Procedure *src = &chunk->v[i]; + RDI_Procedure *dst = &procs[i]; + + dst->name_string_idx = rdib_idx_from_string_map(task->string_map, src->name); + dst->link_name_string_idx = rdib_idx_from_string_map(task->string_map, src->link_name); + dst->link_flags = src->link_flags; + dst->type_idx = rdib_idx_from_type(src->type); + dst->root_scope_idx = rdib_idx_from_scope(src->scope); + + if (src->container_type != 0) { + AssertAlways(!src->container_proc); + dst->link_flags |= RDI_LinkFlag_TypeScoped; + dst->container_idx = rdib_idx_from_udt_type(src->container_type); + } + + if (src->container_proc != 0) { + AssertAlways(!src->container_type); + dst->link_flags |= RDI_LinkFlag_ProcScoped; + dst->container_idx = rdib_idx_from_procedure(0); Assert(!"TODO"); // src->container_proc + } + } + + str8_list_push(arena, &task->procs_out[task_id], str8_array(procs, chunk->count)); + } + + ProfEnd(); +} + +internal void +rdib_data_sections_from_procedures(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + U64 total_count, + U64 chunk_count, + RDIB_ProcedureChunk **chunks) +{ + ProfBeginDynamic("Procedures [Total Count: %llu, Chunk Count: %llu]", total_count, chunk_count); + Temp scratch = scratch_begin(arena->v, 
arena->count); + + ProfBegin("Build"); + RDIB_BuildSymbolSectionTask task = {0}; + task.string_map = string_map; + task.ranges = tp_divide_work(scratch.arena, chunk_count, tp->worker_count); + task.procs_rdib = chunks; + task.procs_out = push_array(scratch.arena, String8List, tp->worker_count); + tp_for_parallel(tp, arena, tp->worker_count, rdib_build_procs_section_task, &task); + ProfEnd(); + + RDIB_DataSection procs_sect = { .tag = RDI_SectionKind_Procedures }; + str8_list_concat_in_place_array(&procs_sect.data, task.procs_out, tp->worker_count); + rdib_data_section_list_push(arena->v[0], sect_list, procs_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_count_scopes_task) +{ + ProfBeginFunction(); + RDIB_BuildSymbolSectionTask *task = raw_task; + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_ScopeChunk *chunk = task->scopes_rdib[chunk_idx]; + for (U64 scope_i = 0; scope_i < chunk->count; ++scope_i) { + RDIB_Scope *scope = &chunk->v[scope_i]; + + task->scope_voff_counts[task_id] += scope->ranges.count * 2; + task->local_counts[task_id] += scope->local_count; + + for (RDIB_Variable *var = scope->local_first; var != 0; var = var->next) { + for (RDIB_LocationNode *loc_n = var->locations.first; loc_n != 0; loc_n = loc_n->next) { + switch (loc_n->v.kind) { + case RDI_LocationKind_NULL: break; + case RDI_LocationKind_AddrBytecodeStream: + case RDI_LocationKind_ValBytecodeStream: { + task->loc_data_sizes[task_id] += loc_n->v.bytecode.size + /* stream ender: */ 1; + } break; + case RDI_LocationKind_ValReg: { + task->loc_data_sizes[task_id] += sizeof(RDI_LocationReg); + } break; + case RDI_LocationKind_AddrRegPlusU16: + case RDI_LocationKind_AddrAddrRegPlusU16: { + task->loc_data_sizes[task_id] += sizeof(RDI_LocationRegPlusU16); + } break; + default: InvalidPath; + } + + task->loc_block_counts[task_id] += loc_n->v.ranges.count; + task->loc_data_sizes[task_id] += 
AlignPadPow2(task->loc_data_sizes[task_id], 8); + } + } + } + } + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_scopes_task) +{ + RDIB_BuildSymbolSectionTask *task = raw_task; + ProfBeginDynamic("Scopes [Chunk Count: %llu]", task->ranges[task_id].max - task->ranges[task_id].min); + + // scope voff fill info + U64 scope_voff_cursor = task->scope_voff_offsets[task_id]; + U64 scope_voff_max = task->scope_voff_offsets[task_id] + task->scope_voff_counts[task_id]; + U64 *scope_voff_ptr = task->scope_voffs_rdi; + + // local fill info + U64 local_cursor = task->local_offsets[task_id]; + U64 local_max = task->local_offsets[task_id] + task->local_counts[task_id]; + RDI_Local *locals = task->locals_rdi; + + // location data fill info + U64 loc_data_max = task->loc_data_offsets[task_id] + task->loc_data_sizes[task_id]; + U64 loc_data_cursor = task->loc_data_offsets[task_id]; + U8 *loc_data = task->loc_data_rdi; + + // location block fill info + U64 loc_block_cursor = task->loc_block_offsets[task_id]; + U64 loc_block_max = task->loc_block_offsets[task_id] + task->loc_block_counts[task_id]; + RDI_LocationBlock *loc_blocks = task->loc_blocks_rdi; + + for (U64 ichunk = task->ranges[task_id].min; ichunk < task->ranges[task_id].max; ++ichunk) { + RDIB_ScopeChunk *chunk = task->scopes_rdib[ichunk]; + for (U64 iscope = 0; iscope < chunk->count; ++iscope) { + RDIB_Scope *scope_src = &chunk->v[iscope]; + U64 scope_idx = rdib_idx_from_scope(scope_src); + RDI_Scope *scope_dst = &task->scopes_rdi[scope_idx]; + + scope_dst->proc_idx = rdib_idx_from_procedure(scope_src->container_proc); + scope_dst->parent_scope_idx = rdib_idx_from_scope(scope_src->parent); + scope_dst->first_child_scope_idx = rdib_idx_from_scope(scope_src->first_child); + scope_dst->next_sibling_scope_idx = rdib_idx_from_scope(scope_src->next_sibling); + scope_dst->voff_range_first = scope_voff_cursor; + scope_dst->voff_range_opl = scope_voff_cursor + scope_src->ranges.count * 2; + scope_dst->local_count = 
scope_src->local_count; + if (scope_src->local_count > 0) { + scope_dst->local_first = local_cursor; + } else { + scope_dst->local_first = 0; + } + // TODO: static locals can be exported as local variables + //scope_dst->static_local_idx_run_first = ???; + //scope_dst->static_local_count = ???; + scope_dst->inline_site_idx = rdib_idx_from_inline_site(scope_src->inline_site); + + // fill out scope voffs + for (Rng1U64Node *range_n = scope_src->ranges.first; range_n != 0; range_n = range_n->next) { + Assert(scope_voff_cursor + 2 <= scope_voff_max); + scope_voff_ptr[scope_voff_cursor + 0] = range_n->v.min; + scope_voff_ptr[scope_voff_cursor + 1] = range_n->v.max; + scope_voff_cursor += 2; + } + + // fill out locals & locations + for (RDIB_Variable *local_src = scope_src->local_first; local_src != 0; local_src = local_src->next, ++local_cursor) { + U64 loc_block_first = loc_block_cursor; + + for (RDIB_LocationNode *loc_n = local_src->locations.first; loc_n != 0; loc_n = loc_n->next) { + RDIB_Location *loc = &loc_n->v; + + // fill out location data + U64 location_data_off = loc_data_cursor; + switch (loc->kind) { + case RDI_LocationKind_NULL: break; + case RDI_LocationKind_AddrBytecodeStream: + case RDI_LocationKind_ValBytecodeStream: { + // write opcodes & operands + for (RDIB_EvalBytecodeOp *op_node = loc->bytecode.first; op_node != 0; op_node = op_node->next) { + // opcode + Assert(loc_data_cursor + sizeof(op_node->op) <= loc_data_max); + MemoryCopy(loc_data + loc_data_cursor, &op_node->op, sizeof(op_node->op)); + loc_data_cursor += sizeof(op_node->op); + + // operand + Assert(loc_data_cursor + op_node->p_size <= loc_data_max); + MemoryCopy(loc_data + loc_data_cursor, &op_node->p, op_node->p_size); + loc_data_cursor += op_node->p_size; + } + + // stream ender + Assert(loc_data_cursor + 1 <= loc_data_max); + loc_data[loc_data_cursor] = 0; + loc_data_cursor += 1; + } break; + case RDI_LocationKind_AddrRegPlusU16: + case RDI_LocationKind_AddrAddrRegPlusU16: { + 
Assert(loc_data_cursor + sizeof(RDI_LocationRegPlusU16) <= loc_data_max); + RDI_LocationRegPlusU16 *dst = (RDI_LocationRegPlusU16 *) (loc_data + loc_data_cursor); + dst->kind = loc->kind; + dst->reg_code = loc->reg_code; + dst->offset = loc->offset; + + loc_data_cursor += sizeof(*dst); + } break; + case RDI_LocationKind_ValReg: { + Assert(loc_data_cursor + sizeof(RDI_LocationReg) <= loc_data_max); + RDI_LocationReg *dst = (RDI_LocationReg *) (loc_data + loc_data_cursor); + dst->kind = loc->kind; + dst->reg_code = loc->reg_code; + loc_data_cursor += sizeof(*dst); + } break; + default: InvalidPath; + } + + // zero out align bytes + U64 align_size = AlignPadPow2(loc_data_cursor, 8); + Assert(loc_data_cursor + align_size <= loc_data_max); + MemorySet(loc_data + loc_data_cursor, 0, align_size); + loc_data_cursor += align_size; + + // fill out location block + for (Rng1U64Node *range_n = loc->ranges.first; range_n != 0; range_n = range_n->next, ++loc_block_cursor) { + Assert(loc_block_cursor < loc_block_max); + RDI_LocationBlock *loc_block_dst = &loc_blocks[loc_block_cursor]; + loc_block_dst->scope_off_first = range_n->v.min; + loc_block_dst->scope_off_opl = range_n->v.max; + loc_block_dst->location_data_off = location_data_off; + } + } + + Assert(local_cursor < local_max); // strict bound: locals[local_cursor] is written right below + RDI_Local *local_dst = &locals[local_cursor]; + local_dst->kind = local_src->kind; + local_dst->name_string_idx = rdib_idx_from_string_map(task->string_map, local_src->name); + local_dst->type_idx = rdib_idx_from_type(local_src->type); + if (local_src->locations.count > 0) { + local_dst->location_first = loc_block_first; + local_dst->location_opl = loc_block_cursor; + } else { + local_dst->location_first = 0; + local_dst->location_opl = 0; + } + } + } + } + + Assert(scope_voff_cursor == scope_voff_max); + Assert(local_cursor == local_max); + Assert(loc_data_cursor == loc_data_max); + Assert(loc_block_cursor == loc_block_max); // every cursor must land exactly on the end of this task's slice + ProfEnd(); +} + +internal void +rdib_data_sections_from_scopes(TP_Context *tp, + TP_Arena *arena, + 
RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + U64 total_scope_count, + U64 chunk_count, + RDIB_ScopeChunk **scopes) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + RDIB_BuildSymbolSectionTask task = {0}; + task.string_map = string_map; + task.ranges = tp_divide_work(scratch.arena, chunk_count, tp->worker_count); + task.scopes_rdib = scopes; + + ProfBegin("Count Locals & Locations"); + task.scope_voff_counts = push_array(scratch.arena, U64, tp->worker_count); + task.local_counts = push_array(scratch.arena, U64, tp->worker_count); + task.loc_block_counts = push_array(scratch.arena, U64, tp->worker_count); + task.loc_data_sizes = push_array(scratch.arena, U64, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_count_scopes_task, &task); + ProfEnd(); + + U64 total_scope_voff_count = sum_array_u64(tp->worker_count, task.scope_voff_counts); + U64 total_local_count = sum_array_u64(tp->worker_count, task.local_counts ); + U64 total_loc_block_count = sum_array_u64(tp->worker_count, task.loc_block_counts ); + U64 total_loc_data_size = sum_array_u64(tp->worker_count, task.loc_data_sizes ); + + ProfBegin("Fill out scopes, locals, location blocks, and location data"); + task.scope_voff_offsets = offsets_from_counts_array_u64(scratch.arena, task.scope_voff_counts, tp->worker_count); + task.local_offsets = offsets_from_counts_array_u64(scratch.arena, task.local_counts, tp->worker_count); + task.loc_block_offsets = offsets_from_counts_array_u64(scratch.arena, task.loc_block_counts, tp->worker_count); + task.loc_data_offsets = offsets_from_counts_array_u64(scratch.arena, task.loc_data_sizes, tp->worker_count); + + ProfBegin("Push"); + task.scope_voffs_rdi = push_array_no_zero(arena->v[0], U64, total_scope_voff_count); + task.scopes_rdi = push_array_no_zero(arena->v[0], RDI_Scope, total_scope_count ); + task.locals_rdi = push_array_no_zero(arena->v[0], RDI_Local, total_local_count ); + task.loc_blocks_rdi = 
push_array_no_zero(arena->v[0], RDI_LocationBlock, total_loc_block_count ); + task.loc_data_rdi = push_array_no_zero(arena->v[0], U8, total_loc_data_size ); + ProfEnd(); + + tp_for_parallel(tp, 0, tp->worker_count, rdib_build_scopes_task, &task); + ProfEnd(); + + RDIB_DataSection scopes_sect = { .tag = RDI_SectionKind_Scopes }; + RDIB_DataSection scope_voffs_sect = { .tag = RDI_SectionKind_ScopeVOffData }; + RDIB_DataSection locals_sect = { .tag = RDI_SectionKind_Locals }; + RDIB_DataSection loc_blocks_sect = { .tag = RDI_SectionKind_LocationBlocks }; + RDIB_DataSection loc_data_sect = { .tag = RDI_SectionKind_LocationData }; + + str8_list_push(arena->v[0], &scopes_sect.data, str8_array(task.scopes_rdi, total_scope_count )); + str8_list_push(arena->v[0], &scope_voffs_sect.data, str8_array(task.scope_voffs_rdi, total_scope_voff_count)); + str8_list_push(arena->v[0], &locals_sect.data, str8_array(task.locals_rdi, total_local_count )); + str8_list_push(arena->v[0], &loc_blocks_sect.data, str8_array(task.loc_blocks_rdi, total_loc_block_count )); + str8_list_push(arena->v[0], &loc_data_sect.data, str8_array(task.loc_data_rdi, total_loc_data_size )); + + rdib_data_section_list_push(arena->v[0], sect_list, scopes_sect ); + rdib_data_section_list_push(arena->v[0], sect_list, scope_voffs_sect); + rdib_data_section_list_push(arena->v[0], sect_list, locals_sect ); + rdib_data_section_list_push(arena->v[0], sect_list, loc_blocks_sect ); + rdib_data_section_list_push(arena->v[0], sect_list, loc_data_sect ); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_name_map_task) +{ + ProfBeginFunction("Build Name Map"); + Temp scratch = scratch_begin(&arena, 1); + + RDIB_NameMapBuilderTask *task = raw_task; + RDI_NameMapKind name_map_idx = (RDI_NameMapKind)task_id; + + U64 out_node_count = task->in_bucket_counts[name_map_idx]; + U64 load_factor = 4; + U64 out_bucket_count = CeilIntegerDiv(out_node_count, load_factor); + + ProfBegin("Build temp hash 
map"); + struct Node { + struct Node *next; + RDIB_StringMapBucket *name; + }; + struct NodeList { + struct Node *first; + struct Node *last; + U64 node_count; + }; + struct NodeList *temp_map = push_array(scratch.arena, struct NodeList, out_bucket_count); + struct Node *temp_nodes = push_array_no_zero(scratch.arena, struct Node, out_node_count); + for (U64 i = 0; i < task->in_bucket_counts[name_map_idx]; ++i) { + RDIB_StringMapBucket *src_bucket = task->in_buckets[name_map_idx][i]; + + U64 hash = rdi_hash(src_bucket->string.str, src_bucket->string.size); + U64 bucket_idx = hash % out_bucket_count; + + struct Node *node = temp_nodes + i; + node->next = 0; + node->name = src_bucket; + + SLLQueuePush(temp_map[bucket_idx].first, temp_map[bucket_idx].last, node); + ++temp_map[bucket_idx].node_count; + } + ProfEnd(); + + ProfBegin("Push buckets and nodes"); + RDI_NameMapBucket *out_buckets = push_array_no_zero(arena, RDI_NameMapBucket, out_bucket_count); + RDI_NameMapNode *out_nodes = push_array_no_zero(arena, RDI_NameMapNode, out_node_count); + ProfEnd(); + + ProfBegin("Fill out buckets"); + for (U64 bucket_idx = 0, node_cursor = 0; bucket_idx < out_bucket_count; ++bucket_idx) { + struct NodeList *src_bucket = &temp_map[bucket_idx]; + RDI_NameMapBucket *dst_bucket = &out_buckets[bucket_idx]; + + if (src_bucket->node_count == 0) { + dst_bucket->first_node = 0; + dst_bucket->node_count = 0; + continue; + } + + dst_bucket->first_node = safe_cast_u32(node_cursor); + dst_bucket->node_count = src_bucket->node_count; + + for (struct Node *n = src_bucket->first; n != 0; n = n->next, ++node_cursor) { + RDIB_StringMapBucket *src_name = n->name; + + RDI_NameMapNode *dst_node = &out_nodes[node_cursor]; + dst_node->string_idx = rdib_idx_from_string_map(task->string_map, src_name->string); + dst_node->match_count = src_name->count; + if (src_name->count > 1) { + dst_node->match_idx_or_idx_run_first = task->idx_run_map->buckets[src_name->idx_run_bucket_idx]->index_in_output_array; + 
} else { + dst_node->match_idx_or_idx_run_first = src_name->match_idx; + } + } + } + ProfEnd(); + + // fill out output + task->out_buckets[name_map_idx] = out_buckets; + task->out_nodes[name_map_idx] = out_nodes; + task->out_bucket_counts[name_map_idx] = out_bucket_count; + task->out_node_counts[name_map_idx] = out_node_count; + + scratch_end(scratch); + ProfEnd(); +} + +internal void +rdib_data_sections_from_name_maps(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + RDIB_IndexRunMap *idx_run_map, + RDIB_StringMapBucket **src_name_maps[RDI_NameMapKind_COUNT], + U64 src_name_map_counts[RDI_NameMapKind_COUNT]) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + ProfBegin("Build Name Maps"); + RDIB_NameMapBuilderTask task = {0}; + task.string_map = string_map; + task.idx_run_map = idx_run_map; + task.in_bucket_counts = src_name_map_counts; + task.in_buckets = src_name_maps; + task.out_buckets = push_array(scratch.arena, RDI_NameMapBucket *, RDI_NameMapKind_COUNT); + task.out_nodes = push_array(scratch.arena, RDI_NameMapNode *, RDI_NameMapKind_COUNT); + task.out_bucket_counts = push_array(scratch.arena, U64, RDI_NameMapKind_COUNT); + task.out_node_counts = push_array(scratch.arena, U64, RDI_NameMapKind_COUNT); + tp_for_parallel(tp, arena, RDI_NameMapKind_COUNT, rdib_build_name_map_task, &task); + ProfEnd(); + + U64 *bucket_offsets = offsets_from_counts_array_u64(scratch.arena, task.out_bucket_counts, RDI_NameMapKind_COUNT); + U64 *node_offsets = offsets_from_counts_array_u64(scratch.arena, task.out_node_counts, RDI_NameMapKind_COUNT); + + String8List raw_name_maps = {0}, raw_name_map_buckets = {0}, raw_name_map_nodes = {0}; + for (U64 i = 0; i < RDI_NameMapKind_COUNT; ++i) { + RDI_NameMap *dst_name_map = push_array(arena->v[0], RDI_NameMap, 1); + dst_name_map->bucket_base_idx = bucket_offsets[i]; + dst_name_map->node_base_idx = node_offsets[i]; + dst_name_map->bucket_count = 
task.out_bucket_counts[i]; + dst_name_map->node_count = task.out_node_counts[i]; + + str8_list_push(arena->v[0], &raw_name_maps, str8_struct(dst_name_map)); + str8_list_push(arena->v[0], &raw_name_map_buckets, str8_array(task.out_buckets[i], task.out_bucket_counts[i])); + str8_list_push(arena->v[0], &raw_name_map_nodes, str8_array(task.out_nodes[i], task.out_node_counts[i])); + } + + RDIB_DataSection name_maps_sect = { .tag = RDI_SectionKind_NameMaps, .data = raw_name_maps }; + RDIB_DataSection name_map_buckets_sect = { .tag = RDI_SectionKind_NameMapBuckets, .data = raw_name_map_buckets }; + RDIB_DataSection name_map_nodes_sect = { .tag = RDI_SectionKind_NameMapNodes, .data = raw_name_map_nodes }; + rdib_data_section_list_push(arena->v[0], sect_list, name_maps_sect); + rdib_data_section_list_push(arena->v[0], sect_list, name_map_buckets_sect); + rdib_data_section_list_push(arena->v[0], sect_list, name_map_nodes_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_src_line_map_task) +{ + // Line tables are ordered to perform 'virtual offset -> line number' lookups, + // and thus multiple virtual offsets can potentially map to the same line number + // (e.g. in C/C++, if a for-loop header is on one line, its init-statement, condition, + // and iteration expression all map to that line number). To make things easy on the debugger, + // we merge duplicate line numbers in the source line map and reorient the mapping to 'line number -> virtual offsets'; + // this way the debugger can quickly compute virtual offsets when placing a breakpoint on a source line. 
+ + Temp scratch = scratch_begin(&arena, 1); + RDIB_SrcLineMapsTask *task = raw_task; + + RDIB_SourceFile *src_file = task->src_file_arr[task_id]; + U64 src_file_idx = rdib_idx_from_source_file(src_file); + + //ProfBeginDynamic("Build Source Line Map [%.*s]", str8_varg(src_file->file_path)); + ProfBegin("Build Source Line Map"); + + ProfBegin("Count lines/virt offsets"); + U64 ln_voff_count = 0; + for (RDIB_LineTableFragment *frag = src_file->line_table_frags; frag != 0; frag = frag->next_src_file) { + ln_voff_count += frag->line_count; + } + ProfEnd(); + + ProfBegin("Push ln_voff_arr"); + PairU32 *ln_voff_arr = push_array_no_zero(scratch.arena, PairU32, ln_voff_count); + ProfEnd(); + + ProfBegin("Fill out ln_voff_arr"); + { + U64 cursor = 0; + for (RDIB_LineTableFragment *frag = src_file->line_table_frags; frag != 0; frag = frag->next_src_file) { + for (U64 line_idx = 0; line_idx < frag->line_count; ++line_idx) { + ln_voff_arr[cursor].v0 = frag->line_nums[line_idx]; + ln_voff_arr[cursor].v1 = frag->voffs[line_idx]; + ++cursor; + } + } + } + ProfEnd(); + + // sort on line number + ProfBegin("Sort"); + if (ln_voff_count < 512) { + // TODO: radsort is buggy and loops infinitely when sorting pairs of U64. + // Check in with Jeff on Monday about a bugfix. For now, work around + // the bug with pairs of U32. There is no virtual offset larger + // than 4GiB in the line table anyway. 
+ radsort(ln_voff_arr, ln_voff_count, pair_u32_is_before_v0); + } else { + u32_pair_radix_sort(ln_voff_count, ln_voff_arr); + } + ProfEnd(); + + // TODO: leak, precompute unique line number count and push exact array lengths + U32 *line_nums = push_array_no_zero(arena, U32, ln_voff_count); + U32 *line_ranges = push_array_no_zero(arena, U32, ln_voff_count + 1); + U64 *voffs = push_array_no_zero(arena, U64, ln_voff_count); + + U64 voff_cursor = 0; + U64 line_num_cursor = 0; + if (ln_voff_count > 0) { + line_nums[line_num_cursor] = ln_voff_arr[0].v0; + voffs[voff_cursor] = ln_voff_arr[0].v1; + line_ranges[line_num_cursor] = voff_cursor; + + ++voff_cursor; + ++line_num_cursor; + + ProfBegin("Fill out output array"); + for (U64 i = 1; i < ln_voff_count; ++i) { + // does this voff belong to next line number? + if (ln_voff_arr[i].v0 != line_nums[line_num_cursor-1]) { + line_nums[line_num_cursor] = ln_voff_arr[i].v0; + line_ranges[line_num_cursor] = (U32)voff_cursor; + ++line_num_cursor; + } + voffs[voff_cursor++] = ln_voff_arr[i].v1; + } + ProfEnd(); + + // did we fill out voff array correctly? 
+ Assert(voff_cursor == ln_voff_count); + + // close last line range + line_ranges[line_num_cursor] = voff_cursor; + } + + // fill out result + task->out_line_counts[src_file_idx] = line_num_cursor; + task->out_voff_counts[src_file_idx] = safe_cast_u32(voff_cursor); + task->out_line_nums[src_file_idx] = line_nums; + task->out_line_ranges[src_file_idx] = line_ranges; + task->out_voffs[src_file_idx] = voffs; + + scratch_end(scratch); + ProfEnd(); +} + +internal void +rdib_data_sections_from_source_line_maps(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + U64 total_src_file_count, + U64 src_file_chunk_count, + RDIB_SourceFileChunk **src_file_chunks) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + ProfBegin("Prepare Source File Array"); + RDIB_SourceFile **src_file_arr = push_array_no_zero(scratch.arena, RDIB_SourceFile *, total_src_file_count); + for (U64 chunk_idx = 0, cursor = 0; chunk_idx < src_file_chunk_count; ++chunk_idx) { + RDIB_SourceFileChunk *chunk = src_file_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + src_file_arr[cursor++] = &chunk->v[i]; + } + } + ProfEnd(); + + ProfBegin("Init Task Context"); + RDIB_SrcLineMapsTask task = {0}; + task.src_file_arr = src_file_arr; + task.out_line_counts = push_array_no_zero(scratch.arena, U32, total_src_file_count); + task.out_voff_counts = push_array_no_zero(scratch.arena, U32, total_src_file_count); + task.out_line_nums = push_array_no_zero(scratch.arena, U32 *, total_src_file_count); + task.out_line_ranges = push_array_no_zero(scratch.arena, U32 *, total_src_file_count); + task.out_voffs = push_array_no_zero(scratch.arena, U64 *, total_src_file_count); + ProfEnd(); + + ProfBegin("Build Source Line Maps"); + tp_for_parallel(tp, arena, total_src_file_count, rdib_build_src_line_map_task, &task); + ProfEnd(); + + ProfBegin("Fill out Source Line Maps"); + RDIB_DataSection src_line_maps_sect = { .tag = RDI_SectionKind_SourceLineMaps }; + 
RDIB_DataSection src_line_nums_sect = { .tag = RDI_SectionKind_SourceLineMapNumbers }; + RDIB_DataSection src_line_ranges_sect = { .tag = RDI_SectionKind_SourceLineMapRanges }; + RDIB_DataSection src_line_voffs_sect = { .tag = RDI_SectionKind_SourceLineMapVOffs }; + + ProfBegin("Push"); + RDI_SourceLineMap *src_line_maps = push_array_no_zero(arena->v[0], RDI_SourceLineMap, total_src_file_count + 1); + ProfEnd(); + + U64 src_line_map_cursor = 0; + U64 line_num_cursor = 0; + U64 line_range_cursor = 0; + U64 voff_cursor = 0; + + // zero-out null source line map + MemoryZeroStruct(&src_line_maps[src_line_map_cursor]); + ++src_line_map_cursor; + + for (U64 chunk_idx = 0; chunk_idx < src_file_chunk_count; ++chunk_idx) { + RDIB_SourceFileChunk *chunk = src_file_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_SourceFile *src_file = chunk->v + i; + U64 src_file_idx = rdib_idx_from_source_file(src_file); + + if (task.out_line_counts[src_file_idx] > 0) { + src_file->src_line_map_idx = src_line_map_cursor; + + RDI_SourceLineMap *sm = src_line_maps + src_line_map_cursor++; + sm->line_count = task.out_line_counts[src_file_idx]; + sm->voff_count = task.out_voff_counts[src_file_idx]; + sm->line_map_nums_base_idx = line_num_cursor; + sm->line_map_range_base_idx = line_range_cursor; + sm->line_map_voff_base_idx = voff_cursor; + + str8_list_push(arena->v[0], &src_line_nums_sect.data, str8_array(task.out_line_nums[src_file_idx], task.out_line_counts[src_file_idx])); + str8_list_push(arena->v[0], &src_line_ranges_sect.data, str8_array(task.out_line_ranges[src_file_idx], task.out_line_counts[src_file_idx] + 1)); + str8_list_push(arena->v[0], &src_line_voffs_sect.data, str8_array(task.out_voffs[src_file_idx], task.out_voff_counts[src_file_idx])); + + line_num_cursor += task.out_line_counts[src_file_idx]; + line_range_cursor += task.out_line_counts[src_file_idx] + 1; + voff_cursor += task.out_voff_counts[src_file_idx]; + } else { + src_file->src_line_map_idx = 0; + } 
+ } + } + ProfEnd(); + + str8_list_push(arena->v[0], &src_line_maps_sect.data, str8_array(src_line_maps, src_line_map_cursor)); + + rdib_data_section_list_push(arena->v[0], sect_list, src_line_maps_sect); + rdib_data_section_list_push(arena->v[0], sect_list, src_line_nums_sect); + rdib_data_section_list_push(arena->v[0], sect_list, src_line_ranges_sect); + rdib_data_section_list_push(arena->v[0], sect_list, src_line_voffs_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_build_line_tables_task) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + RDIB_BuildLineTablesTask *task = raw_task; + Rng1U64 range = task->ranges[task_id]; + + for (U64 chunk_idx = range.min; chunk_idx < range.max; ++chunk_idx) { + RDIB_LineTableChunk *chunk = task->chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_LineTable *line_table = &chunk->v[i]; + U64 line_table_idx = chunk->base + i; + + U64 total_line_count = 0; + for (RDIB_LineTableFragment *frag = line_table->first; frag != 0; frag = frag->next_line_table) { + total_line_count += frag->line_count + /* range terminator */ 1; + } + + if (total_line_count > 0) { + struct Value { + U32 src_file_idx; + U32 line_num; + U16 col_first; + U16 col_opl; + }; + KeyValuePair *pairs = push_array_no_zero(scratch.arena, KeyValuePair, total_line_count); + struct Value *values = push_array_no_zero(scratch.arena, struct Value, total_line_count); + U64 pair_cursor = 0; + + for (RDIB_LineTableFragment *frag = line_table->first; frag != 0; frag = frag->next_line_table) { + for (U64 line_idx = 0; line_idx < frag->line_count; ++line_idx, ++pair_cursor) { + struct Value *value = &values[pair_cursor]; + KeyValuePair *pair = &pairs[pair_cursor]; + + value->src_file_idx = rdib_idx_from_source_file(frag->src_file); + value->line_num = frag->line_nums[line_idx]; + if (frag->col_count > 0) { + value->col_first = frag->col_nums[line_idx*2]; + value->col_opl = frag->col_nums[line_idx*2 + 
1]; + } else { + value->col_first = 0; + value->col_opl = 0; + } + + pair->key_u64 = frag->voffs[line_idx]; + pair->value_raw = value; + } + + // emit terminator + { + KeyValuePair *pair = &pairs[pair_cursor]; + struct Value *value = &values[pair_cursor]; + pair_cursor += 1; + + value->src_file_idx = 0; + value->line_num = 0; + value->col_first = 0; + value->col_opl = 0; + + pair->key_u64 = frag->voffs[frag->line_count]; + pair->value_raw = value; + } + } + + // sort on virtual offset + sort_key_value_pairs_as_u64(pairs, pair_cursor); + + // fill out RDI_Line output + U64 line_count = pair_cursor + 1; + U64 *voffs = push_array_no_zero(arena, U64, line_count); + RDI_Line *lines = push_array_no_zero(arena, RDI_Line, line_count); + + U64 line_cursor = 0; + for (U64 line_idx = 0; line_idx < pair_cursor; ++line_idx) { + // remove terminator if there is a real line number + if (line_idx + 1 < pair_cursor && pairs[line_idx].key_u64 == pairs[line_idx+1].key_u64) { + continue; + } + struct Value *value = pairs[line_idx].value_raw; + voffs[line_cursor] = pairs[line_idx].key_u64; + lines[line_cursor].file_idx = value->src_file_idx; + lines[line_cursor].line_num = value->line_num; + line_cursor += 1; + } + + // fill out terminators + voffs[line_cursor] = ~0llu; + MemoryZeroStruct(&lines[line_cursor]); + line_cursor += 1; + + // fill out line table output + task->out_line_table_counts[line_table_idx] = line_cursor; + task->out_line_table_voffs[line_table_idx] = voffs; + task->out_line_table_lines[line_table_idx] = lines; + } else { + task->out_line_table_counts[line_table_idx] = 0; + task->out_line_table_voffs[line_table_idx] = 0; + task->out_line_table_lines[line_table_idx] = 0; + } + } + } + + scratch_end(scratch); + ProfEnd(); +} + +internal void +rdib_data_sections_from_line_tables(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + U64 total_line_table_count, + U64 chunk_count, + RDIB_LineTableChunk **chunks) +{ + ProfBeginFunction(); + Temp scratch = 
scratch_begin(0,0); + + ProfBegin("Build Line Tables"); + RDIB_BuildLineTablesTask task = {0}; + task.chunks = chunks; + task.ranges = tp_divide_work(scratch.arena, chunk_count, tp->worker_count); + task.out_line_table_counts = push_array_no_zero(scratch.arena, U64, total_line_table_count); + task.out_line_table_voffs = push_array_no_zero(scratch.arena, U64 *, total_line_table_count); + task.out_line_table_lines = push_array_no_zero(scratch.arena, RDI_Line *, total_line_table_count); + tp_for_parallel(tp, arena, tp->worker_count, rdib_build_line_tables_task, &task); + ProfEnd(); + + RDIB_DataSection line_tables_sect = { .tag = RDI_SectionKind_LineTables }; + RDIB_DataSection line_table_voffs_sect = { .tag = RDI_SectionKind_LineInfoVOffs }; + RDIB_DataSection line_table_lines_sect = { .tag = RDI_SectionKind_LineInfoLines }; + RDIB_DataSection line_table_cols_sect = { .tag = RDI_SectionKind_LineInfoColumns }; + + ProfBegin("Fill out Line Tables"); + + ProfBegin("Push"); + RDI_LineTable *line_tables_rdi = push_array_no_zero(arena->v[0], RDI_LineTable, total_line_table_count); + ProfEnd(); + + U64 line_table_cursor = 0; + U64 line_table_voff_cursor = 0; + U64 line_table_line_cursor = 0; + //U64 line_table_col_cursor = 0; + + // fill out null line table + MemoryZeroStruct(&line_tables_rdi[line_table_cursor]); + ++line_table_cursor; + + for (U64 chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + RDIB_LineTableChunk *chunk = chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_LineTable *src = &chunk->v[i]; + U64 src_idx = rdib_idx_from_line_table(src); + if (task.out_line_table_counts[src_idx] > 0) { + RDI_LineTable *dst = &line_tables_rdi[line_table_cursor]; + + src->output_array_idx = line_table_cursor; + + dst->voffs_base_idx = line_table_voff_cursor; + dst->lines_base_idx = line_table_line_cursor; + dst->cols_base_idx = 0; //line_table_cols_cursor; + dst->lines_count = task.out_line_table_counts[src_idx] - 1; + dst->cols_count = 0; 
//src->col_count; + + str8_list_push(arena->v[0], &line_table_voffs_sect.data, str8_array(task.out_line_table_voffs[src_idx], task.out_line_table_counts[src_idx])); + str8_list_push(arena->v[0], &line_table_lines_sect.data, str8_array(task.out_line_table_lines[src_idx], task.out_line_table_counts[src_idx])); + //str8_list_push(arena->v[0], &line_table_cols_sect.data, str8_array(task.out_line_table_cols[src_idx], task.out_line_table_col_counts[src_idx])); + + line_table_voff_cursor += task.out_line_table_counts[src_idx]; + line_table_line_cursor += task.out_line_table_counts[src_idx]; + //line_table_col_cursor += task.out_line_table_col_counts[src_idx]; + + line_table_cursor += 1; + } else { + src->output_array_idx = 0; + } + } + } + + str8_list_push(arena->v[0], &line_tables_sect.data, str8_array(line_tables_rdi, line_table_cursor)); + + ProfEnd(); + + rdib_data_section_list_push(arena->v[0], sect_list, line_tables_sect); + rdib_data_section_list_push(arena->v[0], sect_list, line_table_voffs_sect); + rdib_data_section_list_push(arena->v[0], sect_list, line_table_lines_sect); + rdib_data_section_list_push(arena->v[0], sect_list, line_table_cols_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal +THREAD_POOL_TASK_FUNC(rdib_fill_src_files_task) +{ + RDIB_FillSourceFilesTask *task = raw_task; + + for (U64 chunk_idx = task->ranges[task_id].min; chunk_idx < task->ranges[task_id].max; ++chunk_idx) { + RDIB_SourceFileChunk *chunk = task->src_file_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_SourceFile *src = chunk->v + i; + U32 src_file_idx = rdib_idx_from_source_file(src); + RDI_SourceFile *dst = task->src_files_dst + src_file_idx; + + dst->file_path_node_idx = rdib_idx_from_path_tree(task->path_tree, src->file_path); + dst->normal_full_path_string_idx = rdib_idx_from_string_map(task->string_map, src->normal_full_path); + dst->source_line_map_idx = src->src_line_map_idx; + } + } +} + +internal void 
+rdib_data_sections_from_source_files(TP_Context *tp, + TP_Arena *arena, + RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + RDIB_PathTree *path_tree, + U64 total_src_file_count, + U64 src_file_chunk_count, + RDIB_SourceFileChunk **src_file_chunks) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + RDIB_FillSourceFilesTask task = {0}; + task.ranges = tp_divide_work(scratch.arena, src_file_chunk_count, tp->worker_count); + task.string_map = string_map; + task.path_tree = path_tree; + task.src_file_chunks = src_file_chunks; + task.src_files_dst = push_array_no_zero(arena->v[0], RDI_SourceFile, total_src_file_count); + tp_for_parallel(tp, 0, tp->worker_count, rdib_fill_src_files_task, &task); + + RDIB_DataSection src_files_sect = { .tag = RDI_SectionKind_SourceFiles }; + str8_list_push(arena->v[0], &src_files_sect.data, str8_array(task.src_files_dst, total_src_file_count)); + rdib_data_section_list_push(arena->v[0], sect_list, src_files_sect); + + scratch_end(scratch); + ProfEnd(); +} + +internal void +rdib_data_sections_from_inline_sites(TP_Context *tp, + Arena *arena, + RDIB_DataSectionList *sect_list, + RDIB_StringMap *string_map, + U64 total_inline_site_count, + U64 inline_site_chunk_count, + RDIB_InlineSiteChunk **inline_site_chunks) +{ + ProfBeginFunction(); + + RDI_InlineSite *dst_arr = push_array(arena, RDI_InlineSite, total_inline_site_count); + + for (U64 chunk_idx = 0; chunk_idx < inline_site_chunk_count; ++chunk_idx) { + RDIB_InlineSiteChunk *chunk = inline_site_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + RDIB_InlineSite *src = &chunk->v[i]; + U64 idx = rdib_idx_from_inline_site(src); + RDI_InlineSite *dst = &dst_arr[idx]; + + dst->name_string_idx = rdib_idx_from_string_map(string_map, src->name); + dst->type_idx = rdib_idx_from_type(src->type); + dst->owner_type_idx = rdib_idx_from_type(src->owner); + dst->line_table_idx = src->line_table->output_array_idx; + } + } + + RDIB_DataSection 
inline_site_sect = { .tag = RDI_SectionKind_InlineSites }; + str8_list_push(arena, &inline_site_sect.data, str8_array(dst_arr, total_inline_site_count)); + rdib_data_section_list_push(arena, sect_list, inline_site_sect); + + ProfEnd(); +} + +internal void +rdib_data_sections_from_checksums(TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list) +{ + NotImplemented; +} + +//////////////////////////////// + +internal RDIB_Input +rdib_init_input(Arena *arena) +{ + ProfBeginFunction(); + + RDIB_Input input = {0}; + input.unit_chunk_cap = 128; + input.src_file_chunk_cap = 4096; + input.symbol_chunk_cap = 4096; + input.line_table_cap = 4096; + input.inline_site_cap = 4096; + input.type_cap = 1024; + input.udt_cap = 4096; + + RDIB_SourceFile *null_src_file = rdib_source_file_chunk_list_push_zero(arena, &input.src_files, 1); + RDIB_LineTable *null_line_table = rdib_line_table_chunk_list_push_zero (arena, &input.line_tables, 1); + RDIB_LineTableFragment *null_frag = rdib_line_table_push (arena, null_line_table); + RDIB_Type *null_type = rdib_type_chunk_list_push_zero (arena, &input.types, 1); + RDIB_Scope *null_scope = rdib_scope_chunk_list_push_zero (arena, &input.scopes, 1); + RDIB_Unit *null_unit = rdib_unit_chunk_list_push_zero (arena, &input.units, 1); + RDIB_Procedure *null_proc = rdib_procedure_chunk_list_push_zero (arena, &input.procs, 1); + RDIB_Variable *null_local = rdib_variable_chunk_list_push_zero (arena, &input.locals, 1); + RDIB_Variable *null_gvar = rdib_variable_chunk_list_push_zero (arena, &input.gvars, 1); + RDIB_Variable *null_tvar = rdib_variable_chunk_list_push_zero (arena, &input.tvars, 1); + RDIB_UDTMember *null_udt_member = rdib_udt_member_chunk_list_push_zero (arena, &input.udt_members, 1); + RDIB_UDTMember *null_enum_member = rdib_udt_member_chunk_list_push_zero (arena, &input.enum_members, 1); + RDIB_InlineSite *null_inline_site = rdib_inline_site_chunk_list_push_zero(arena, &input.inline_sites, 1); + { + // Line Table Fragment + 
null_frag->src_file = null_src_file; + null_frag->voffs = push_array(arena, U64, 1); + + // Source File + null_src_file->line_table_frags = null_frag; + + // Unit + null_unit->arch = RDI_Arch_NULL; + null_unit->unit_name = str8_lit(""); + null_unit->compiler_name = str8_lit(""); + null_unit->source_file = str8_lit(""); + null_unit->object_file = str8_lit(""); + null_unit->archive_file = str8_lit(""); + null_unit->build_path = str8_lit(""); + null_unit->virt_range_count = 1; + null_unit->virt_ranges = push_array(arena, Rng1U64, 1); + null_unit->virt_ranges[0] = rng_1u64(0,0); + null_unit->line_table = null_line_table; + + // Scope + rng_1u64_list_push(arena, &null_scope->ranges, rng_1u64(0,max_U32)); + + // Location + RDIB_Location null_loc = {0}; + rng_1u64_list_push(arena, &null_loc.ranges, rng_1u64(0,0)); + RDIB_LocationList null_loc_list = {0}; + rdib_location_list_push(arena, &null_loc_list, null_loc); + + // Proc + null_proc->type = null_type; + null_proc->scope = null_scope; + + // Global Var + null_gvar->link_flags = RDI_LinkFlag_External; + null_gvar->type = null_type; + null_gvar->locations = null_loc_list; + + // Thread Var + null_tvar->link_flags = RDI_LinkFlag_External; + null_tvar->type = null_type; + null_tvar->locations = null_loc_list; + + // Local Var + null_local->type = null_type; + null_local->locations = null_loc_list; + + // Inline Site + null_inline_site->type = null_type; + null_inline_site->owner = 0; + null_inline_site->line_table = null_line_table; + } + + input.null_src_file = null_src_file; + input.null_line_table = null_line_table; + input.null_frag = null_frag; + input.null_type = null_type; + input.null_scope = null_scope; + input.null_unit = null_unit; + input.null_proc = null_proc; + input.null_local = null_local; + input.null_gvar = null_gvar; + input.null_tvar = null_tvar; + input.null_udt_member = null_udt_member; + input.null_enum_member = null_enum_member; + input.null_inline_site = null_inline_site; + + input.variadic_type = 
rdib_type_chunk_list_push(arena, &input.types, 1); + input.variadic_type->kind = RDI_TypeKind_Variadic; + + ProfEnd(); + return input; +} + +internal String8List +rdib_finish(TP_Context *tp, TP_Arena *arena, RDIB_Input *input) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(arena->v, arena->count); + + RDIB_UnitChunkList all_units = {0}; + RDIB_SourceFileChunkList all_src_files = {0}; + RDIB_LineTableChunkList all_line_tables = {0}; + RDIB_VariableChunkList all_locals = {0}; + RDIB_VariableChunkList all_tvars = {0}; + RDIB_VariableChunkList all_gvars = {0}; + RDIB_ProcedureChunkList all_procs = {0}; + RDIB_ScopeChunkList all_scopes = {0}; + RDIB_InlineSiteChunkList all_inline_sites = {0}; + RDIB_TypeChunkList all_types = {0}; + RDIB_TypeChunkList all_param_types = {0}; + RDIB_TypeChunkList all_udt_member_types = {0}; + RDIB_TypeChunkList all_enum_member_types = {0}; + RDIB_UDTMemberChunkList all_udt_members = {0}; + RDIB_UDTMemberChunkList all_enum_members = {0}; + + //U64 type_chunk_count = types.count; + //U64 struct_chunk_count = struct_list.count; + //U64 union_chunk_count = union_list.count; + //U64 enum_chunk_count = enum_list.count; + //U64 total_struct_count = rdib_type_chunk_list_total_count(struct_list); + //U64 total_union_count = rdib_type_chunk_list_total_count(union_list); + //U64 total_enum_count = rdib_type_chunk_list_total_count(enum_list); + //U64 extern_gvar_chunk_count = extern_gvars.count; + //U64 extern_tvar_chunk_count = extern_tvars.count; + //U64 extern_proc_chunk_count = extern_procs.count; + //U64 static_gvar_chunk_count = static_gvars.count; + //U64 static_tvar_chunk_count = static_tvars.count; + //U64 static_proc_chunk_count = static_procs.count; + //U64 total_extern_gvar_count = rdib_variable_chunk_list_total_count (extern_gvars); + //U64 total_extern_tvar_count = rdib_variable_chunk_list_total_count (extern_tvars); + //U64 total_extern_proc_count = rdib_procedure_chunk_list_total_count(extern_procs); + + ProfBegin("Concat 
Chunk Lists"); + rdib_unit_chunk_list_concat_in_place (&all_units, &input->units ); + rdib_source_file_chunk_list_concat_in_place(&all_src_files, &input->src_files ); + rdib_line_table_chunk_list_concat_in_place (&all_line_tables, &input->line_tables ); + rdib_scope_chunk_list_concat_in_place (&all_scopes, &input->scopes ); + rdib_variable_chunk_list_concat_in_place (&all_locals, &input->locals ); + rdib_variable_chunk_list_concat_in_place (&all_tvars, &input->tvars ); + rdib_variable_chunk_list_concat_in_place (&all_tvars, &input->extern_tvars ); + rdib_variable_chunk_list_concat_in_place (&all_tvars, &input->static_tvars ); + rdib_variable_chunk_list_concat_in_place (&all_gvars, &input->gvars ); + rdib_variable_chunk_list_concat_in_place (&all_gvars, &input->extern_gvars ); + rdib_variable_chunk_list_concat_in_place (&all_gvars, &input->static_gvars ); + rdib_procedure_chunk_list_concat_in_place (&all_procs, &input->procs ); + rdib_procedure_chunk_list_concat_in_place (&all_procs, &input->extern_procs ); + rdib_procedure_chunk_list_concat_in_place (&all_procs, &input->static_procs ); + rdib_inline_site_chunk_list_concat_in_place(&all_inline_sites, &input->inline_sites ); + rdib_type_chunk_list_concat_in_place (&all_types, &input->types ); + rdib_type_chunk_list_concat_in_place (&all_types, &input->struct_list ); + rdib_type_chunk_list_concat_in_place (&all_types, &input->union_list ); + rdib_type_chunk_list_concat_in_place (&all_types, &input->enum_list ); + rdib_type_chunk_list_concat_in_place (&all_param_types, &input->param_types ); + rdib_type_chunk_list_concat_in_place (&all_udt_member_types, &input->member_types ); + rdib_type_chunk_list_concat_in_place (&all_enum_member_types, &input->enum_types ); + rdib_udt_member_chunk_list_concat_in_place (&all_udt_members, &input->udt_members ); + rdib_udt_member_chunk_list_concat_in_place (&all_enum_members, &input->enum_members ); + ProfEnd(); + + ProfBegin("Chunk Lists -> Chunk Arrays"); + RDIB_UnitChunk 
**all_unit_chunks = rdib_array_from_unit_chunk_list (scratch.arena, all_units ); + RDIB_SourceFileChunk **all_src_file_chunks = rdib_array_from_source_file_chunk_list(scratch.arena, all_src_files ); + RDIB_LineTableChunk **all_line_table_chunks = rdib_array_from_line_table_chunk_list (scratch.arena, all_line_tables ); + RDIB_ScopeChunk **all_scope_chunks = rdib_array_from_scope_chunk_list (scratch.arena, all_scopes ); + RDIB_VariableChunk **all_local_chunks = rdib_array_from_variable_chunk_list (scratch.arena, all_locals ); + RDIB_VariableChunk **all_gvar_chunks = rdib_array_from_variable_chunk_list (scratch.arena, all_gvars ); + RDIB_VariableChunk **all_tvar_chunks = rdib_array_from_variable_chunk_list (scratch.arena, all_tvars ); + RDIB_ProcedureChunk **all_proc_chunks = rdib_array_from_procedure_chunk_list (scratch.arena, all_procs ); + RDIB_InlineSiteChunk **all_inline_site_chunks = rdib_array_from_inline_site_chunk_list(scratch.arena, all_inline_sites ); + RDIB_TypeChunk **all_type_chunks = rdib_array_from_type_chunk_list (scratch.arena, all_types ); + //RDIB_TypeChunk **all_param_type_chunks = rdib_array_from_type_chunk_list (scratch.arena, all_param_types ); + RDIB_TypeChunk **all_udt_member_type_chunks = rdib_array_from_type_chunk_list (scratch.arena, all_udt_member_types ); + RDIB_TypeChunk **all_enum_member_type_chunks = rdib_array_from_type_chunk_list (scratch.arena, all_enum_member_types); + RDIB_UDTMemberChunk **all_udt_member_chunks = rdib_array_from_udt_member_chunk_list (scratch.arena, all_udt_members ); + RDIB_UDTMemberChunk **all_enum_member_chunks = rdib_array_from_udt_member_chunk_list (scratch.arena, all_enum_members ); + ProfEnd(); + + ProfBegin("Count Symbols, Types, and etc."); + U64 total_unit_count = rdib_unit_chunk_list_total_count (all_units ); + U64 total_src_file_count = rdib_source_file_chunk_list_total_count(all_src_files ); + U64 total_line_table_count = rdib_line_table_chunk_list_total_count (all_line_tables ); + U64 
total_scope_count = rdib_scope_chunk_list_total_count (all_scopes ); + U64 total_local_count = rdib_variable_chunk_list_total_count (all_locals ); + U64 total_inline_site_count = rdib_inline_site_chunk_list_total_count(all_inline_sites ); + U64 total_udt_member_count = rdib_udt_member_chunk_list_total_count (all_udt_members ); + U64 total_enum_member_count = rdib_udt_member_chunk_list_total_count (all_enum_members ); + U64 total_type_count = rdib_type_chunk_list_total_count (all_types ); + U64 total_param_type_count = rdib_type_chunk_list_total_count (all_param_types ); + //U64 total_udt_member_type_count = rdib_type_chunk_list_total_count (all_udt_member_types ); + //U64 total_enum_member_type_count = rdib_type_chunk_list_total_count (all_enum_member_types); + U64 total_tvar_count = rdib_variable_chunk_list_total_count (all_tvars ); + U64 total_gvar_count = rdib_variable_chunk_list_total_count (all_gvars ); + U64 total_proc_count = rdib_procedure_chunk_list_total_count (all_procs ); + ProfEnd(); + + // +1 to skip nulls + //RDIB_VariableChunk **extern_gvar_chunks = all_gvar_chunks + 1; + //RDIB_VariableChunk **extern_tvar_chunks = all_tvar_chunks + 1; + //RDIB_ProcedureChunk **extern_proc_chunks = all_proc_chunks + 1; + //RDIB_VariableChunk **static_gvar_chunks = extern_gvar_chunks + extern_gvar_chunk_count; + //RDIB_VariableChunk **static_tvar_chunks = extern_tvar_chunks + extern_tvar_chunk_count; + //RDIB_ProcedureChunk **static_proc_chunks = extern_proc_chunks + extern_proc_chunk_count; + //RDIB_TypeChunk **type_chunks = all_type_chunks + 1; + //RDIB_TypeChunk **struct_chunks = type_chunks + type_chunk_count; + //RDIB_TypeChunk **union_chunks = struct_chunks + struct_chunk_count; + //RDIB_TypeChunk **enum_chunks = union_chunks + union_chunk_count; + //RDIB_TypeChunk **udt_chunks = struct_chunks; + //U64 udt_chunk_count = struct_chunk_count + union_chunk_count + enum_chunk_count; + + ProfBegin("Assign Type Indices"); + U64 total_type_node_count = 0; + { + struct 
TypeNode { + struct TypeNode *next; + RDIB_Type *type; + }; + struct TypeNode *stack = 0; + struct TypeNode *free_nodes = 0; +#define push_node(t) do { \ +if (((RDIB_Type*)(t))->kind == RDI_TypeKindExt_VirtualTable) break; \ + struct TypeNode *n; \ + if (free_nodes == 0) { \ + n = push_array(scratch.arena, struct TypeNode, 1); \ + } else { \ + n = free_nodes; \ + SLLStackPop(free_nodes); \ + } \ + Assert(t); \ + n->type = t; \ + SLLStackPush(stack, n); \ +} while (0) + + for (U64 chunk_idx = 0; chunk_idx < all_types.count; ++chunk_idx) { + RDIB_TypeChunk *chunk = all_type_chunks[chunk_idx]; + for (U64 i = 0; i < chunk->count; ++i) { + push_node(&chunk->v[i]); + + for (struct TypeNode *cursor = stack; cursor != 0; cursor = cursor->next) { + if (cursor->type->kind == RDI_TypeKind_NULL){ + // no type refs + } else if (cursor->type->kind == RDI_TypeKind_Variadic) { + // no type refs + } else if (cursor->type->kind == RDI_TypeKind_Union) { + // no type refs + } else if (RDI_IsBuiltinType(cursor->type->kind)) { + // no type refs + } else if (cursor->type->kind == RDI_TypeKind_IncompleteStruct) { + // no type refs + } else if (cursor->type->kind == RDI_TypeKind_IncompleteUnion) { + // no type refs + } else if (cursor->type->kind == RDI_TypeKind_IncompleteClass) { + // no type refs + } else if (cursor->type->kind == RDI_TypeKind_IncompleteEnum) { + push_node(cursor->type->udt.enum_type.base_type); + } else if (cursor->type->kind == RDI_TypeKind_Modifier) { + push_node(cursor->type->modifier.type_ref); + } else if (RDI_IsPtrType(cursor->type->kind)) { + push_node(cursor->type->ptr.type_ref); + } else if (cursor->type->kind == RDI_TypeKind_Function) { + push_node(cursor->type->func.return_type); + push_node(cursor->type->func.params_type); + RDIB_Type *params = cursor->type->func.params_type; + for (U64 i = 0; i < params->params.count; ++i) { + push_node(params->params.types[i]); + } + } else if (cursor->type->kind == RDI_TypeKind_Method) { + 
push_node(cursor->type->method.class_type); + push_node(cursor->type->method.this_type); + push_node(cursor->type->method.return_type); + RDIB_Type *params = cursor->type->method.params_type; + for (U64 i = 0; i < params->params.count; ++i) { + push_node(params->params.types[i]); + } + } else if (cursor->type->kind == RDI_TypeKindExt_StaticMethod) { + push_node(cursor->type->static_method.class_type); + push_node(cursor->type->static_method.return_type); + RDIB_Type *params = cursor->type->static_method.params_type; + for (U64 i = 0; i < params->params.count; ++i) { + push_node(params->params.types[i]); + } + } else if (cursor->type->kind == RDI_TypeKind_Bitfield) { + push_node(cursor->type->bitfield.value_type); + } else if (cursor->type->kind == RDI_TypeKind_Array) { + push_node(cursor->type->array.entry_type); + } else if (cursor->type->kind == RDI_TypeKind_Struct || cursor->type->kind == RDI_TypeKind_Class) { + if (cursor->type->udt.struct_type.derived != 0) { + push_node(cursor->type->udt.struct_type.derived); + } + //push_node(cursor->type->udt.struct_type.vtshape); + } else if (cursor->type->kind == RDI_TypeKind_Enum) { + push_node(cursor->type->udt.enum_type.base_type); + } else if (cursor->type->kind > RDI_TypeKindExt_Lo) { + InvalidPath; + } else { + InvalidPath; + } + } + + for (struct TypeNode *cursor = stack; cursor != 0; cursor = cursor->next) { + // was this type visited? (final_idx still 0 means no index has been assigned yet) 
+ if (cursor->type->final_idx == 0) { + cursor->type->final_idx = total_type_node_count; + ++total_type_node_count; + } + } + + free_nodes = stack; + stack = 0; + } + } +#undef push_node + } + ProfEnd(); + + ProfBegin("Type Stats"); + RDIB_TypeStats type_stats = {0}; + { + type_stats.udt_counts = push_array(scratch.arena, U64, all_types.count); + RDIB_TypeStatsTask task = { .chunks = all_type_chunks, .type_stats = &type_stats }; + tp_for_parallel(tp, 0, all_types.count, rdib_type_stats_task, &task); + } + ProfEnd(); + + RDIB_PathTree *path_tree = rdib_build_path_tree(arena->v[0], + tp->worker_count, + input->null_src_file, + all_units.count, + all_unit_chunks, + all_src_files.count, + all_src_file_chunks); + + // loop over structs and build a map with every possible string + ProfBegin("String Map"); + RDIB_StringMap *string_map; + { + U64 top_level_string_count = 2; + U64 sect_string_count = 1; + U64 src_file_string_count = 1; + U64 unit_string_count = 6; + U64 variable_string_count = 2; + U64 procedure_string_count = 2; + U64 scope_string_count = 0; + U64 inline_site_string_count = 0; + U64 member_string_count = 2; + U64 type_string_count = 3; + U64 path_tree_node_count = 1; + + U64 total_string_count = 1 /* :string_map_null */ + + 1 * top_level_string_count + + input->sect_count * sect_string_count + + total_src_file_count * src_file_string_count + + total_unit_count * unit_string_count + + total_local_count * variable_string_count + + total_gvar_count * variable_string_count + + total_tvar_count * variable_string_count + + total_proc_count * procedure_string_count + + total_inline_site_count * inline_site_string_count + + total_udt_member_count * member_string_count + + total_enum_member_count * member_string_count + + total_type_count * type_string_count + + path_tree->node_count * path_tree_node_count + + total_scope_count * scope_string_count; + + string_map = rdib_init_string_map(arena->v[0], total_string_count); + + RDIB_CollectStringsTask task = {0}; + 
task.string_map = string_map; + task.string_map_update_func = rdib_string_map_update_null; + task.free_buckets = push_array(scratch.arena, RDIB_StringMapBucket *, tp->worker_count); + task.element_indices = push_array(scratch.arena, U64, tp->worker_count); + + // :string_map_null + rdib_string_map_insert_string_table_item(arena->v[0], &task, 0, str8_lit("")); + + // top level info + rdib_string_map_insert_string_table_item(arena->v[0], &task, 0, input->top_level_info.exe_name); + rdib_string_map_insert_string_table_item(arena->v[0], &task, 0, input->top_level_info.producer_string); + + ProfBegin("Sections"); + task.ranges = tp_divide_work(scratch.arena, input->sect_count, tp->worker_count); + task.sects = input->sections; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_sects_task, &task); + ProfEnd(); + + ProfBegin("Units"); + task.ranges = tp_divide_work(scratch.arena, all_units.count, tp->worker_count); + task.units = all_unit_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_units_task, &task); + ProfEnd(); + + ProfBegin("Source Files"); + task.ranges = tp_divide_work(scratch.arena, all_src_files.count, tp->worker_count); + task.src_file_chunks = all_src_file_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_source_files_task, &task); + ProfEnd(); + + ProfBegin("Locals"); + task.ranges = tp_divide_work(scratch.arena, all_locals.count, tp->worker_count); + task.vars = all_local_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_vars_task, &task); + ProfEnd(); + + ProfBegin("Global Variables"); + task.ranges = tp_divide_work(scratch.arena, all_gvars.count, tp->worker_count); + task.vars = all_gvar_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_vars_task, &task); + ProfEnd(); + + ProfBegin("Thread Variables"); + task.ranges = tp_divide_work(scratch.arena, all_tvars.count, tp->worker_count); + task.vars = all_tvar_chunks; + 
tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_vars_task, &task); + ProfEnd(); + + ProfBegin("Procedures"); + task.ranges = tp_divide_work(scratch.arena, all_procs.count, tp->worker_count); + task.procs = all_proc_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_procs_task, &task); + ProfEnd(); + + ProfBegin("Inline Sites"); + task.ranges = tp_divide_work(scratch.arena, all_inline_sites.count, tp->worker_count); + task.inline_sites = all_inline_site_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_inline_sites_task, &task); + ProfEnd(); + + ProfBegin("UDT Members"); + task.ranges = tp_divide_work(scratch.arena, all_udt_members.count, tp->worker_count); + task.udt_members = all_udt_member_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_udt_members_task, &task); + ProfEnd(); + + ProfBegin("Enum Members"); + task.ranges = tp_divide_work(scratch.arena, all_enum_members.count, tp->worker_count); + task.udt_members = all_enum_member_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_enum_members_task, &task); + ProfEnd(); + + ProfBegin("Types"); + task.ranges = tp_divide_work(scratch.arena, all_types.count, tp->worker_count); + task.types = all_type_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_types_task, &task); + ProfEnd(); + + ProfBegin("Path Tree"); + task.ranges = tp_divide_work(scratch.arena, path_tree->list_count, tp->worker_count); + task.path_node_lists = path_tree->node_lists; + tp_for_parallel(tp, arena, tp->worker_count, rdib_collect_strings_path_nodes_task, &task); + ProfEnd(); + } + ProfEnd(); + + ProfBegin("Name Maps"); + RDIB_StringMap *name_maps[RDI_NameMapKind_COUNT] = {0}; + { + name_maps[RDI_NameMapKind_NULL ] = rdib_init_string_map(scratch.arena, 1 ); + name_maps[RDI_NameMapKind_GlobalVariables ] = rdib_init_string_map(scratch.arena, total_gvar_count ); + name_maps[RDI_NameMapKind_ThreadVariables 
] = rdib_init_string_map(scratch.arena, total_tvar_count ); + name_maps[RDI_NameMapKind_Procedures ] = rdib_init_string_map(scratch.arena, total_proc_count ); + name_maps[RDI_NameMapKind_Types ] = rdib_init_string_map(scratch.arena, total_type_count ); + name_maps[RDI_NameMapKind_LinkNameProcedures] = rdib_init_string_map(scratch.arena, total_proc_count ); + name_maps[RDI_NameMapKind_NormalSourcePaths ] = rdib_init_string_map(scratch.arena, total_src_file_count); + + RDIB_CollectStringsTask task = {0}; + task.string_map = 0; + task.string_map_update_func = rdib_string_map_update_concat_void_list_atomic; + task.free_buckets = push_array(scratch.arena, RDIB_StringMapBucket *, tp->worker_count); + task.element_indices = push_array(scratch.arena, U64, tp->worker_count); + + ProfBegin("Global Variables"); + task.string_map = name_maps[RDI_NameMapKind_GlobalVariables]; + task.ranges = tp_divide_work(scratch.arena, all_gvars.count, tp->worker_count); + task.vars = all_gvar_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_name_map_var_task, &task); + ProfEnd(); + + ProfBegin("Thread Variables"); + task.string_map = name_maps[RDI_NameMapKind_ThreadVariables]; + task.ranges = tp_divide_work(scratch.arena, all_tvars.count, tp->worker_count); + task.vars = all_tvar_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_name_map_var_task, &task); + ProfEnd(); + + ProfBegin("Procedure Names"); + task.string_map = name_maps[RDI_NameMapKind_Procedures]; + task.ranges = tp_divide_work(scratch.arena, all_procs.count, tp->worker_count); + task.procs = all_proc_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_name_map_procedure_task, &task); + ProfEnd(); + + ProfBegin("Types"); + task.string_map = name_maps[RDI_NameMapKind_Types]; + task.ranges = tp_divide_work(scratch.arena, all_types.count, tp->worker_count); + task.types = all_type_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_name_map_types_task, &task); + ProfEnd(); + + ProfBegin("Normal 
Source Paths"); + task.string_map = name_maps[RDI_NameMapKind_NormalSourcePaths]; + task.ranges = tp_divide_work(scratch.arena, all_src_files.count, tp->worker_count); + task.src_file_chunks = all_src_file_chunks; + tp_for_parallel(tp, arena, tp->worker_count, rdib_name_map_normal_paths_task, &task); + ProfEnd(); + } + ProfEnd(); + + ProfBeginDynamic("Extract String Table Buckets [Cap: %llu]", string_map->cap); + U64 string_map_bucket_count; + RDIB_StringMapBucket **string_map_buckets = rdib_extant_buckets_from_string_map(tp, scratch.arena, string_map, &string_map_bucket_count); + rdib_string_map_sort_buckets(tp, string_map_buckets, string_map_bucket_count, tp->worker_count); + rdib_string_map_assign_indices(string_map_buckets, string_map_bucket_count); + ProfEnd(); + + ProfBegin("Extract Name Maps Buckets"); + RDIB_StringMapBucket **name_map_buckets[RDI_NameMapKind_COUNT]; + U64 name_map_bucket_counts[RDI_NameMapKind_COUNT]; + for (U64 i = 0; i < ArrayCount(name_map_buckets); ++i) { + ProfBeginDynamic("Name Map: %.*s", str8_varg(rdi_string_from_name_map_kind(i))); + name_map_buckets[i] = rdib_extant_buckets_from_string_map(tp, scratch.arena, name_maps[i], &name_map_bucket_counts[i]); + rdib_string_map_sort_buckets(tp, name_map_buckets[i], name_map_bucket_counts[i], tp->worker_count); + rdib_string_map_assign_indices(name_map_buckets[i], name_map_bucket_counts[i]); + ProfEnd(); + } + ProfEnd(); + + ProfBegin("Index Run Map"); + RDIB_IndexRunMap *idx_run_map; + RDIB_IndexRunBucket **idx_run_buckets; + U64 idx_run_bucket_count; + { + // TODO: we over-allocate for name map index runs since not every bucket has > 1 value + U64 total_name_map_value_count = 0; + for (U64 i = 0; i < ArrayCount(name_map_bucket_counts); ++i) { + total_name_map_value_count += name_map_bucket_counts[i]; + } + + // rough bucket estimate + U64 idx_run_cap = (total_param_type_count + total_name_map_value_count) * 2; + idx_run_map = rdib_init_index_run_map(arena->v[0], idx_run_cap); + + // setup 
task context + RDIB_BuildIndexRunsTask task = {0}; + task.idx_run_map = idx_run_map; + task.free_buckets = push_array(scratch.arena, RDIB_IndexRunBucket *, tp->worker_count); + + ProfBegin("Type Params Pass"); + task.type_chunks = all_type_chunks; + tp_for_parallel(tp, arena, all_types.count, rdib_build_idx_runs_params_task, &task); + task.sorter_idx += 1; + ProfEnd(); + + ProfBegin("Name Maps Pass - Build Index Runs"); + for (U64 name_map_kind = 0; name_map_kind < ArrayCount(name_maps); ++name_map_kind) { + ProfBeginDynamic("Name Map: %.*s", str8_varg(rdi_string_from_name_map_kind(name_map_kind))); + task.name_map_kind = name_map_kind; + task.ranges = tp_divide_work(scratch.arena, name_map_bucket_counts[name_map_kind], tp->worker_count); + task.name_map_buckets = name_map_buckets[name_map_kind]; + tp_for_parallel(tp, arena, tp->worker_count, rdib_build_idx_runs_name_map_buckets_task, &task); + task.sorter_idx += 1; + ProfEnd(); + } + ProfEnd(); + + idx_run_buckets = rdib_extant_buckets_from_index_run_map(tp, arena->v[0], idx_run_map, &idx_run_bucket_count); + rdib_index_run_map_sort_buckets(tp, idx_run_buckets, idx_run_bucket_count, task.sorter_idx); + rdib_index_run_map_assign_indices(idx_run_buckets, idx_run_bucket_count); + } + ProfEnd(); + + ProfBegin("Serialize Data Sections"); + RDIB_DataSectionList sections = {0}; // passed by address to each serializer below; grouped by tag when the RDI image is assembled + rdib_data_sections_from_top_level_info(arena->v[0], &sections, string_map, &input->top_level_info); + rdib_data_sections_from_binary_sections(arena->v[0], &sections, string_map, input->sections, input->sect_count); + rdib_data_sections_from_path_tree(tp, arena->v[0], &sections, string_map, path_tree); + rdib_data_sections_from_string_map(tp, arena->v[0], &sections, string_map_buckets, string_map_bucket_count); + rdib_data_sections_from_index_runs(tp, arena->v[0], &sections, idx_run_buckets, idx_run_bucket_count); + rdib_data_sections_from_name_maps(tp, arena, &sections, string_map, idx_run_map, name_map_buckets, name_map_bucket_counts); + 
rdib_data_sections_from_types(tp, arena->v[0], &sections, input->top_level_info.arch, string_map, idx_run_map, all_udt_member_types.count, all_udt_member_type_chunks, all_enum_member_types.count, all_enum_member_type_chunks, total_type_node_count, all_types.count, all_type_chunks, type_stats); + rdib_data_sections_from_line_tables(tp, arena, &sections, total_line_table_count, all_line_tables.count, all_line_table_chunks); + rdib_data_sections_from_source_line_maps(tp, arena, &sections, total_src_file_count, all_src_files.count, all_src_file_chunks); + rdib_data_sections_from_source_files(tp, arena, &sections, string_map, path_tree, total_src_file_count, all_src_files.count, all_src_file_chunks); + rdib_data_sections_from_units(arena->v[0], &sections, string_map, path_tree, total_unit_count, all_units.count, all_unit_chunks); + rdib_data_sections_from_global_variables(tp, arena, &sections, string_map, total_gvar_count, all_gvars.count, all_gvar_chunks); + rdib_data_sections_from_thread_variables(tp, arena, &sections, string_map, total_tvar_count, all_tvars.count, all_tvar_chunks); + rdib_data_sections_from_procedures(tp, arena, &sections, string_map, total_proc_count, all_procs.count, all_proc_chunks); + rdib_data_sections_from_scopes(tp, arena, &sections, string_map, total_scope_count, all_scopes.count, all_scope_chunks); + rdib_data_sections_from_unit_gvar_scope_vmaps(tp, arena, &sections, all_units.count, all_unit_chunks, all_gvars.count, all_gvar_chunks, all_scopes.count, all_scope_chunks); + rdib_data_sections_from_inline_sites(tp, arena->v[0], &sections, string_map, total_inline_site_count, all_inline_sites.count, all_inline_site_chunks); + //rdib_data_sections_from_checksums(tp, arena->v[0], &sections); + ProfEnd(); + + ProfBegin("Make RDI header and sections"); + String8List rdi_data = {0}; + { + // concat section datas + String8List raw_section_datas[RDI_SectionKind_COUNT] = {0}; + for (RDIB_DataSectionNode *n = sections.first; n != 0; n = n->next) { + 
str8_list_concat_in_place(&raw_section_datas[n->v.tag], &n->v.data); + } + + RDI_Header *rdi_header = push_array(arena->v[0], RDI_Header, 1); + RDI_Section *rdi_sections = push_array(arena->v[0], RDI_Section, RDI_SectionKind_COUNT); + + rdi_header->magic = RDI_MAGIC_CONSTANT; + rdi_header->encoding_version = RDI_ENCODING_VERSION; + rdi_header->data_section_off = sizeof(*rdi_header); + rdi_header->data_section_count = RDI_SectionKind_COUNT; + + str8_list_push(arena->v[0], &rdi_data, str8_struct(rdi_header)); + str8_list_push(arena->v[0], &rdi_data, str8_array(rdi_sections, RDI_SectionKind_COUNT)); + + for (U64 sect_idx = 0; sect_idx < RDI_SectionKind_COUNT; ++sect_idx) { + RDI_Section *dst = &rdi_sections[sect_idx]; + dst->encoding = RDI_SectionEncoding_Unpacked; + dst->pad = 0; + dst->off = 0; + dst->encoded_size = 0; + dst->unpacked_size = 0; + + if (raw_section_datas[sect_idx].total_size > 0) { + str8_list_push_aligner(arena->v[0], &rdi_data, 0, 8); + + dst->off = rdi_data.total_size; + dst->encoded_size = raw_section_datas[sect_idx].total_size; + dst->unpacked_size = raw_section_datas[sect_idx].total_size; + + str8_list_concat_in_place(&rdi_data, &raw_section_datas[sect_idx]); + +#if BUILD_DEBUG + { + U64 expected_total_size = 0; + for (String8Node *n = rdi_data.first; n != 0; n = n->next) { + expected_total_size += n->string.size; + } + Assert(expected_total_size == rdi_data.total_size); + } +#endif + } + } + } + ProfEnd(); + + scratch_end(scratch); + ProfEnd(); + return rdi_data; +} + diff --git a/src/linker/rdi/rdi_builder.h b/src/linker/rdi/rdi_builder.h new file mode 100644 index 00000000..8d7cf927 --- /dev/null +++ b/src/linker/rdi/rdi_builder.h @@ -0,0 +1,1241 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +StaticAssert(sizeof(RDI_Header) == AlignPow2(sizeof(RDI_Header), 8), g_rdi_header_align_check); + +//////////////////////////////// +// TODO: move to rdi_format.h + 
+// Type-kind classification helpers.
+// Every macro below expands `x` more than once, so pass only side-effect-free
+// expressions (no `i++`, no function calls with side effects).
+
+// True when `x` is one of the Class/Struct/Union/Enum kinds.
+#define RDI_IsCompleteUserDefinedTypeKind(x) ((x) == RDI_TypeKind_Class || (x) == RDI_TypeKind_Struct || (x) == RDI_TypeKind_Union || (x) == RDI_TypeKind_Enum)
+// True when `x` lies in [RDI_TypeKind_FirstIncomplete, RDI_TypeKind_LastIncomplete].
+// NOTE(review): "Used" in the name looks like a typo for "User"; the spelling
+// is part of the interface, so it is left as-is here.
+#define RDI_IsIncompleteUsedDefinedTypeKind(x) (RDI_TypeKind_FirstIncomplete <= (x) && (x) <= RDI_TypeKind_LastIncomplete)
+// True when `x` is a complete or an incomplete user-defined type kind.
+#define RDI_IsUserDefinedType(x) (RDI_IsCompleteUserDefinedTypeKind(x) || RDI_IsIncompleteUsedDefinedTypeKind(x))
+// True when `x` lies in [RDI_TypeKind_FirstBuiltIn, RDI_TypeKind_LastBuiltIn].
+#define RDI_IsBuiltinType(x) (RDI_TypeKind_FirstBuiltIn <= (x) && (x) <= RDI_TypeKind_LastBuiltIn)
+// True when `x` is a pointer, l-value reference or r-value reference kind.
+#define RDI_IsPtrType(x) ((x) == RDI_TypeKind_Ptr || (x) == RDI_TypeKind_LRef || (x) == RDI_TypeKind_RRef)
+
+// Kind tag for a checksum; stored in RDIB_SourceFile.checksum_kind next to
+// the raw checksum bytes.
+typedef enum
+{
+  RDI_Checksum_Null,
+  RDI_Checksum_MD5,
+  RDI_Checksum_SHA1,
+  RDI_Checksum_SHA256,
+  RDI_Checksum_TimeStamp
+} RDI_ChecksumKind;
+
+////////////////////////////////
+
+// Data model identifiers; consumed by the rdib_*_type_from_data_model helpers
+// declared further down in this header.
+// NOTE(review): names presumably follow the conventional ILP32/LLP64/LP64/
+// ILP64/SILP64 integer-width models -- confirm against rdib_infer_data_model.
+typedef enum
+{
+  RDIB_DataModel_Null,
+  RDIB_DataModel_ILP32,
+  RDIB_DataModel_LLP64,
+  RDIB_DataModel_LP64,
+  RDIB_DataModel_ILP64,
+  RDIB_DataModel_SILP64
+} RDIB_DataModel;
+
+////////////////////////////////
+
+// Opaque handle used wherever one type refers to another.
+// NOTE(review): presumably produced by rdib_make_type_ref and resolved by
+// rdib_deref_type_refs (both declared later in this header) -- confirm.
+typedef void * RDIB_TypeRef;
+
+// One eval bytecode op, linked through `next`: opcode `op` with operand `p`.
+// NOTE(review): `p_size` is presumably the operand's encoded size -- confirm.
+typedef struct RDIB_EvalBytecodeOp
+{
+  struct RDIB_EvalBytecodeOp *next;
+  RDI_EvalOp op;
+  U64 p_size;
+  U64 p;
+} RDIB_EvalBytecodeOp;
+
+// Singly linked list of RDIB_EvalBytecodeOp (first/last) with its op `count`.
+// NOTE(review): `size` is presumably the total encoded size -- confirm.
+typedef struct RDIB_EvalBytecode
+{
+  U64 count;
+  U64 size;
+  RDIB_EvalBytecodeOp *first;
+  RDIB_EvalBytecodeOp *last;
+} RDIB_EvalBytecode;
+
+// Location of a variable over a list of ranges. The payload is an anonymous
+// union: either a register code plus offset, or a bytecode stream.
+// NOTE(review): presumably `kind` selects which union arm is valid -- confirm.
+typedef struct RDIB_Location
+{
+  RDI_LocationKind kind;
+  Rng1U64List ranges;
+  union {
+    struct {
+      RDI_RegCode reg_code;
+      U64 offset;
+    };
+    RDIB_EvalBytecode bytecode;
+  };
+
+  // used by RDI builder
+  U64 data_offset;
+} RDIB_Location;
+
+// List node that stores an RDIB_Location by value.
+typedef struct RDIB_LocationNode
+{
+  struct RDIB_LocationNode *next;
+  RDIB_Location v;
+} RDIB_LocationNode;
+
+// Singly linked list of RDIB_LocationNode with element count.
+typedef struct RDIB_LocationList
+{
+  U64 count;
+  RDIB_LocationNode *first;
+  RDIB_LocationNode *last;
+} RDIB_LocationList;
+
+
+// A variable record: names, link flags, local kind, type, containing
+// type/procedure and its location list. `next` chains variables (RDIB_Scope
+// keeps local_first/local_last pointers of this type).
+// NOTE(review): `chunk` presumably points back at the owning chunk -- confirm.
+typedef struct RDIB_Variable
+{
+  RDI_LinkFlags link_flags;
+  String8 name;
+  String8 link_name;
+  RDI_LocalKind kind;
+  struct RDIB_Type *type;
+  struct RDIB_Type *container_type;
+  struct RDIB_Procedure *container_proc;
+  RDIB_LocationList locations;
+  struct RDIB_VariableChunk *chunk;
+  struct RDIB_Variable *next;
+} RDIB_Variable;
+
+// List node that stores an RDIB_Variable by value.
+typedef struct RDIB_VariableNode
+{
+  struct RDIB_VariableNode *next;
+  RDIB_Variable v;
+} RDIB_VariableNode;
+
+// Singly linked list of RDIB_VariableNode with element count.
+typedef struct RDIB_VariableList
+{
+  U64 count;
+  RDIB_VariableNode *first;
+  RDIB_VariableNode *last;
+} RDIB_VariableList;
+
+////////////////////////////////
+
+// Top-level facts about the image being described (architecture, hash,
+// names). Held by value in RDIB_Input.top_level_info.
+// NOTE(review): `voff_max` is presumably the highest virtual offset -- confirm.
+typedef struct
+{
+  RDI_Arch arch;
+  U64 exe_hash;
+  U64 voff_max;
+  String8 exe_name;
+  String8 producer_string;
+} RDIB_TopLevelInfo;
+
+// One binary section: name, flags, and first/opl bounds for both the `voff`
+// and `foff` ranges.
+// NOTE(review): presumably voff = virtual offset, foff = file offset and
+// "opl" = one past last (half-open ranges) -- confirm.
+typedef struct
+{
+  String8 name;
+  RDI_BinarySectionFlags flags;
+  U64 voff_first;
+  U64 voff_opl;
+  U64 foff_first;
+  U64 foff_opl;
+} RDIB_BinarySection;
+
+// A fragment of a line table tied to one source file. Holds `voffs`,
+// `line_nums` and `col_nums` arrays and is threaded on two separate lists:
+// `next_src_file` and `next_line_table`.
+// NOTE(review): presumably `voffs`/`line_nums` are sized by `line_count` and
+// `col_nums` by `col_count` -- confirm.
+typedef struct RDIB_LineTableFragment
+{
+  struct RDIB_SourceFile *src_file;
+  PairU32 ln_voff;
+  U64 *voffs;
+  U32 *line_nums;
+  U16 *col_nums;
+  U64 line_count;
+  U64 col_count;
+  struct RDIB_LineTableFragment *next_src_file;
+  struct RDIB_LineTableFragment *next_line_table;
+  struct RDIB_LineTableFragmentChunk *chunk;
+} RDIB_LineTableFragment;
+
+// Chunk of RDIB_LineTableFragment items, linked through `next`.
+// NOTE(review): the chunk types in this header share the base/count/cap/v
+// shape; presumably `v` is an array of `cap` items with `count` in use and
+// `base` the global index of v[0] -- confirm.
+typedef struct RDIB_LineTableFragmentChunk
+{
+  struct RDIB_LineTableFragmentChunk *next;
+  U64 base;
+  U64 count;
+  U64 cap;
+  RDIB_LineTableFragment *v;
+} RDIB_LineTableFragmentChunk;
+
+// Linked list of RDIB_LineTableFragmentChunk with chunk count.
+typedef struct RDIB_LineTableFragmentChunkList
+{
+  U64 count;
+  RDIB_LineTableFragmentChunk *first;
+  RDIB_LineTableFragmentChunk *last;
+} RDIB_LineTableFragmentChunkList;
+
+// A line table: a list of fragments (first/last/count).
+// NOTE(review): `output_array_idx` is presumably the table's index in the
+// serialized output -- confirm.
+typedef struct RDIB_LineTable
+{
+  struct RDIB_LineTableChunk *chunk;
+  U64 count;
+  RDIB_LineTableFragment *first;
+  RDIB_LineTableFragment *last;
+  U32 output_array_idx;
+} RDIB_LineTable;
+
+// Chunk of RDIB_LineTable items, linked through `next`.
+typedef struct RDIB_LineTableChunk
+{
+  struct RDIB_LineTableChunk *next;
+  U64 base;
+  U64 count;
+  U64 cap;
+  RDIB_LineTable *v;
+} RDIB_LineTableChunk;
+
+// Linked list of RDIB_LineTableChunk with chunk count.
+typedef struct RDIB_LineTableChunkList
+{
+  U64 count;
+  RDIB_LineTableChunk *first;
+  RDIB_LineTableChunk *last;
+} RDIB_LineTableChunkList;
+
+// A source file: its path strings, checksum (kind + bytes) and the head of
+// its line table fragment list.
+// NOTE(review): `src_line_map_idx`/`line_table_idx` are presumably assigned
+// during serialization -- confirm.
+typedef struct RDIB_SourceFile
+{
+  String8 file_path;
+  String8 normal_full_path;
+  RDI_ChecksumKind checksum_kind;
+  String8 checksum;
+  RDIB_LineTableFragment *line_table_frags;
+
+  U64 src_line_map_idx;
+  U64 line_table_idx;
+
+  struct RDIB_SourceFileChunk *chunk;
+} RDIB_SourceFile;
+
+// Chunk of RDIB_SourceFile items, linked through `next`.
+typedef struct RDIB_SourceFileChunk
+{
+  struct RDIB_SourceFileChunk *next;
+  U64 base;
+  U64 count;
+  U64 cap;
+  RDIB_SourceFile *v;
+} RDIB_SourceFileChunk;
+
+// Linked list of RDIB_SourceFileChunk with chunk count.
+typedef struct RDIB_SourceFileChunkList
+{
+  U64 count;
+  RDIB_SourceFileChunk *first;
+  RDIB_SourceFileChunk *last;
+} RDIB_SourceFileChunkList;
+
+// A procedure record: names, link flags, type, containing type/procedure,
+// a `scope` pointer and a list of scope nodes (scope_first/scope_last/
+// scope_count).
+typedef struct RDIB_Procedure
+{
+  RDI_LinkFlags link_flags;
+  String8 name;
+  String8 link_name;
+  struct RDIB_Type *type;
+  struct RDIB_Type *container_type;
+  struct RDIB_Procedure *container_proc;
+
+  struct RDIB_Scope *scope;
+  struct RDIB_ScopeNode *scope_first;
+  struct RDIB_ScopeNode *scope_last;
+
+  U64 scope_count;
+  struct RDIB_ProcedureChunk *chunk;
+} RDIB_Procedure;
+
+// A scope in a tree (parent / first_child / last_child / next_sibling) with
+// its ranges, its list of local variables and an optional inline site pointer.
+typedef struct RDIB_Scope
+{
+  struct RDIB_Scope *parent;
+  struct RDIB_Scope *first_child;
+  struct RDIB_Scope *last_child;
+  struct RDIB_Scope *next_sibling;
+  struct RDIB_Procedure *container_proc;
+  Rng1U64List ranges;
+  RDIB_Variable *local_first;
+  RDIB_Variable *local_last;
+  U64 local_count;
+  struct RDIB_InlineSite *inline_site;
+
+  // used by RDI builder
+  U64 local_first_idx;
+
+  struct RDIB_ScopeChunk *chunk;
+} RDIB_Scope;
+
+// An inline site: name, type and owner type. The trailing anonymous union
+// overlays a line table pointer with a three-field `convert_ref` record.
+// NOTE(review): presumably `convert_ref` is a converter-side placeholder that
+// is later replaced by `line_table` -- confirm against the producer.
+typedef struct RDIB_InlineSite
+{
+  String8 name;
+  struct RDIB_Type *type;
+  struct RDIB_Type *owner;
+  struct RDIB_InlineSiteChunk *chunk;
+  union {
+    struct RDIB_LineTable *line_table;
+    struct {
+      void *ud0;
+      U64 ud1;
+      U64 ud2;
+    } convert_ref;
+  };
+} RDIB_InlineSite;
+
+// Builder-side extension of RDI_MemberKind. RDI_MemberKind_COUNT is defined
+// equal to RDI_MemberKind_NestedType, so RDI_MemberKindExt_MemberListPointer
+// takes the value RDI_MemberKind_NestedType + 1.
+typedef RDI_MemberKind RDI_MemberKindExt;
+enum
+{
+  RDI_MemberKind_COUNT = RDI_MemberKind_NestedType,
+  RDI_MemberKindExt_MemberListPointer // NOTE: must always be last in the list!
+};
+
+// One member of a user-defined type. The anonymous union holds one payload
+// per member flavor (data field, static data, member list pointer, method,
+// nested type, base class, virtual base class, enumerate).
+// NOTE(review): presumably `kind` selects the valid union arm -- confirm.
+typedef struct RDIB_UDTMember
+{
+  RDI_MemberKindExt kind;
+  union {
+    struct {
+      String8 name;
+      U64 offset;
+      RDIB_TypeRef type_ref;
+    } data_field;
+    struct {
+      String8 name;
+      RDIB_TypeRef type_ref;
+    } static_data;
+    RDIB_TypeRef member_list_pointer;
+    struct {
+      RDI_MemberKind kind;
+      String8 name;
+      RDIB_TypeRef type_ref;
+      U64 vftable_offset;
+    } method;
+    struct {
+      String8 name;
+      RDIB_TypeRef type_ref;
+    } nested_type;
+    struct {
+      RDIB_TypeRef type_ref;
+      U64 offset;
+    } base_class;
+    struct {
+      RDIB_TypeRef type_ref;
+      U64 vbptr_off;
+      U64 vtable_off;
+    } virtual_base_class;
+    struct {
+      String8 name;
+      U64 value;
+    } enumerate;
+  };
+  struct RDIB_UDTMember *next;
+  struct RDIB_UDTMemberChunk *chunk;
+} RDIB_UDTMember;
+
+// Intrusive list of RDIB_UDTMember (linked through RDIB_UDTMember.next).
+typedef struct RDIB_UDTMemberList
+{
+  U64 count;
+  RDIB_UDTMember *first;
+  RDIB_UDTMember *last;
+} RDIB_UDTMemberList;
+
+// A user-defined type entry: the type itself, its members type and the
+// declaration site (source file, line, column).
+typedef struct RDIB_UDT
+{
+  struct RDIB_Type *self_type;
+  struct RDIB_Type *members;
+  RDIB_SourceFile *decl_src_file;
+  U32 decl_line_num;
+  U32 decl_col_num;
+} RDIB_UDT;
+
+// Builder-side extension of RDI_TypeKind: the extra kinds are numbered
+// starting at RDI_TypeKind_Count (RDI_TypeKindExt_Lo).
+enum
+{
+  RDI_TypeKindExt_Lo = RDI_TypeKind_Count,
+  RDI_TypeKindExt_VirtualTable,
+  RDI_TypeKindExt_StaticMethod,
+  RDI_TypeKindExt_Members,
+  RDI_TypeKindExt_Params,
+  RDI_TypeKindExt_Count,
+};
+typedef RDI_TypeKind RDI_TypeKindExt;
+
+// A type node. The anonymous union holds one payload per type flavor; note
+// that `members` and `enum_members` are two union members declared with the
+// same anonymous struct type.
+// NOTE(review): presumably `kind` selects the valid union arm, and
+// `final_idx`/`itype` are output and input type indices -- confirm.
+typedef struct RDIB_Type
+{
+  RDI_TypeKindExt kind;
+  U64 final_idx;
+  U64 itype;
+  union {
+    struct {
+      String8 name;
+      U64 size;
+    } builtin;
+    struct {
+      RDI_TypeModifierFlags flags;
+      RDIB_TypeRef type_ref;
+    } modifier;
+    struct {
+      RDIB_TypeRef type_ref;
+      U64 size;
+    } ptr;
+    struct {
+      RDIB_TypeRef return_type;
+      RDIB_TypeRef params_type;
+
+      U64 param_idx_run_bucket_idx;
+    } func;
+    struct {
+      RDIB_TypeRef class_type;
+      RDIB_TypeRef this_type;
+      RDIB_TypeRef return_type;
+      RDIB_TypeRef params_type;
+
+      U64 param_idx_run_bucket_idx;
+    } method;
+    struct {
+      RDIB_TypeRef class_type;
+      RDIB_TypeRef return_type;
+      RDIB_TypeRef params_type;
+
+      U64 param_idx_run_bucket_idx;
+    } static_method;
+    struct {
+      U64 off;
+      U64 count;
+      RDIB_TypeRef value_type;
+    } bitfield;
+    struct {
+      RDIB_TypeRef entry_type;
+      U64 size;
+    } array;
+    struct {
+      String8 name;
+      String8 link_name;
+      RDIB_TypeRef members;
+      // assigned in UDT build step
+      U64 udt_idx;
+      union {
+        struct {
+          U64 size;
+          RDIB_TypeRef derived;
+          RDIB_TypeRef vtshape;
+        } struct_type;
+        struct {
+          U64 size;
+        } union_type;
+        struct {
+          String8 name;
+          RDIB_TypeRef base_type;
+        } enum_type;
+      };
+    } udt;
+    struct {
+      U64 count;
+      RDIB_TypeRef *types;
+    } params;
+    struct {
+      RDIB_UDTMemberList list;
+
+      // assigned in member build step
+      B32 is_head;
+      U64 first_member_idx;
+    } members, enum_members;
+  };
+  struct RDIB_TypeChunk *chunk;
+} RDIB_Type;
+
+// A compilation unit: names and paths, language, line table pointer and an
+// array of virtual ranges (`virt_ranges`, with `virt_range_count`).
+typedef struct RDIB_Unit
+{
+  RDI_Arch arch;
+  String8 unit_name;
+  String8 compiler_name;
+  String8 source_file;
+  String8 object_file;
+  String8 archive_file;
+  String8 build_path;
+  RDI_Language language;
+  RDIB_LineTable *line_table;
+  U64 virt_range_count;
+  Rng1U64 *virt_ranges;
+
+  struct RDIB_UnitChunk *chunk;
+} RDIB_Unit;
+
+// Serialized bytes destined for one RDI section: `tag` is the section kind,
+// `data` the list of byte strings.
+typedef struct RDIB_DataSection
+{
+  RDI_SectionKind tag;
+  String8List data;
+} RDIB_DataSection;
+
+// List node that stores an RDIB_DataSection by value.
+typedef struct RDIB_DataSectionNode
+{
+  struct RDIB_DataSectionNode *next;
+  RDIB_DataSection v;
+} RDIB_DataSectionNode;
+
+// Singly linked list of RDIB_DataSectionNode with element count.
+typedef struct RDIB_DataSectionList
+{
+  U64 count;
+  RDIB_DataSectionNode *first;
+  RDIB_DataSectionNode *last;
+} RDIB_DataSectionList;
+
+
+// Chunk of RDIB_Unit items. Same fields as the other chunk types, but `next`
+// is declared last here rather than first.
+typedef struct RDIB_UnitChunk
+{
+  U64 base;
+  U64 count;
+  U64 cap;
+  RDIB_Unit *v;
+  struct RDIB_UnitChunk *next;
+} RDIB_UnitChunk;
+
+// Linked list of RDIB_UnitChunk with chunk count.
+typedef struct RDIB_UnitChunkList
+{
+  U64 count;
+  RDIB_UnitChunk *first;
+  RDIB_UnitChunk *last;
+} RDIB_UnitChunkList;
+
+// Chunk of RDIB_Variable items, linked through `next`.
+typedef struct RDIB_VariableChunk
+{
+  struct RDIB_VariableChunk *next;
+  U64 base;
+  U64 count;
+  U64 cap;
+  RDIB_Variable *v;
+} RDIB_VariableChunk;
+
+// Linked list of RDIB_VariableChunk with chunk count.
+typedef struct RDIB_VariableChunkList
+{
+  U64 count;
+  RDIB_VariableChunk *first;
+  RDIB_VariableChunk *last;
+} RDIB_VariableChunkList;
+
+typedef struct RDIB_ProcedureChunk +{ + struct RDIB_ProcedureChunk *next; + U64 base; + U64 count; + U64 cap; + RDIB_Procedure *v; +} RDIB_ProcedureChunk; + +typedef struct RDIB_ProcedureChunkList +{ + U64 count; + RDIB_ProcedureChunk *first; + RDIB_ProcedureChunk *last; +} RDIB_ProcedureChunkList; + +typedef struct RDIB_ScopeChunk +{ + struct RDIB_ScopeChunk *next; + U64 base; + U64 count; + U64 cap; + RDIB_Scope *v; +} RDIB_ScopeChunk; + +typedef struct RDIB_ScopeChunkList +{ + U64 count; + RDIB_ScopeChunk *first; + RDIB_ScopeChunk *last; +} RDIB_ScopeChunkList; + +typedef struct RDIB_ScopeNode +{ + struct RDIB_ScopeNode *next; + RDIB_Scope *v; +} RDIB_ScopeNode; + +typedef struct RDIB_ScopeList +{ + U64 count; + RDIB_Scope *first; + RDIB_Scope *last; +} RDIB_ScopeList; + +typedef struct RDIB_InlineSiteChunk +{ + struct RDIB_InlineSiteChunk *next; + U64 base; + U64 count; + U64 cap; + RDIB_InlineSite *v; +} RDIB_InlineSiteChunk; + +typedef struct RDIB_InlineSiteChunkList +{ + U64 count; + RDIB_InlineSiteChunk *first; + RDIB_InlineSiteChunk *last; +} RDIB_InlineSiteChunkList; + +typedef struct RDIB_TypeChunk +{ + struct RDIB_TypeChunk *next; + U64 base; + U64 count; + U64 cap; + RDIB_Type *v; +} RDIB_TypeChunk; + +typedef struct +{ + U64 count; + RDIB_TypeChunk *first; + RDIB_TypeChunk *last; +} RDIB_TypeChunkList; + +typedef struct RDIB_UDTMemberChunk +{ + struct RDIB_UDTMemberChunk *next; + U64 base; + U64 count; + U64 cap; + RDIB_UDTMember *v; +} RDIB_UDTMemberChunk; + +typedef struct RDIB_UDTMemberChunkList +{ + U64 count; + RDIB_UDTMemberChunk *first; + RDIB_UDTMemberChunk *last; +} RDIB_UDTMemberChunkList; + +//////////////////////////////// +// UDT Forward Ref Map + +typedef struct +{ + struct RDIB_Type *type; + U64 idx; +} RDIB_UDTFwdrefBucket; + +//////////////////////////////// +// String Map + +typedef struct RDIB_StringMapBucket +{ + String8 string; + + union { + // to get deterministic output we assign each bucket a unique index + union { + struct { 
+ U32 lo; + U32 hi; + }; + U64 v; + } sorter; + + // after buckets are sorted we replace 'sorter' with indices into output array + U64 idx; + }; + + union { + // depending on the usage context sotres: pointers to variables, procedures, and etc. + VoidNode *raw_values; + + // during index-run-map build step 'raw_values' are replaced with index-run bucket index + struct { + U32 count; + // if we have single index - store it in the bucket + union { + U64 idx_run_bucket_idx; + U32 match_idx; + }; + }; + }; +} RDIB_StringMapBucket; + +typedef struct RDIB_StringMap +{ + U64 cap; + RDIB_StringMapBucket **buckets; +} RDIB_StringMap; + +#define RDIB_STRING_MAP_UPDATE_FUNC(name) void name(VoidNode **head, VoidNode *node) +typedef RDIB_STRING_MAP_UPDATE_FUNC(RDIB_StringMapUpdateFunc); + +typedef struct +{ + RDIB_StringMap *string_map; + Rng1U64 *ranges; + U64 *counts; + U64 *offsets; + RDIB_StringMapBucket **result; +} RDIB_GetExtantBucketsStringMapTask; + +typedef struct +{ + U32 *string_table; + U64 string_data_size; + U8 *string_data; + RDIB_StringMapBucket **buckets; + Rng1U64 *ranges; +} RDIB_CopyStringDataTask; + +typedef struct +{ + U64 chunk_idx_opl; + Rng1U64 *ranges; + RDIB_StringMapBucket **src; + RDIB_StringMapBucket **dst; + U32 *chunk_histo; + U32 *chunk_offsets; +} RDIB_StringMapRadixSort; + +//////////////////////////////// +// Index Run Map + +typedef struct RDIB_IndexRunBucket +{ + union { + struct { + U32 lo; + U32 hi; + }; + U64 v; + } sorter; + U32Array indices; + U64 index_in_output_array; +} RDIB_IndexRunBucket; + +typedef struct RDIB_IndexRunMap +{ + U64 cap; + RDIB_IndexRunBucket **buckets; +} RDIB_IndexRunMap; + +//////////////////////////////// + +typedef struct +{ + U64 voff; + U32 size; + U32 idx; +} RDIB_VMapRange; + +//////////////////////////////// + +typedef struct RDIB_PathTreeNode +{ + struct RDIB_PathTreeNode *parent; + struct RDIB_PathTreeNode *next_order; + struct RDIB_PathTreeNode *next_sibling; + struct RDIB_PathTreeNode *first_child; + 
struct RDIB_PathTreeNode *last_child; + U64 node_idx; + String8 sub_path; + RDIB_SourceFile *src_file; +} RDIB_PathTreeNode; + +typedef struct RDIB_PathTreeNodeList +{ + U64 count; + RDIB_PathTreeNode *first; + RDIB_PathTreeNode *last; +} RDIB_PathTreeNodeList; + +typedef struct RDIB_PathTree +{ + RDIB_PathTreeNode *root; + U64 node_count; + U64 next_list_idx; + U64 list_count; + RDIB_PathTreeNodeList *node_lists; +} RDIB_PathTree; + +//////////////////////////////// + +typedef struct RDIB_Input +{ + U64 unit_chunk_cap; + U64 src_file_chunk_cap; + U64 symbol_chunk_cap; + U64 line_table_cap; + U64 inline_site_cap; + U64 type_cap; + U64 udt_cap; + + RDIB_TopLevelInfo top_level_info; + U64 sect_count; + RDIB_BinarySection *sections; + RDIB_UnitChunkList units; + RDIB_SourceFileChunkList src_files; + RDIB_LineTableChunkList line_tables; + RDIB_ScopeChunkList scopes; + RDIB_VariableChunkList locals; + RDIB_VariableChunkList gvars; + RDIB_VariableChunkList extern_gvars; + RDIB_VariableChunkList static_gvars; + RDIB_VariableChunkList tvars; + RDIB_VariableChunkList extern_tvars; + RDIB_VariableChunkList static_tvars; + RDIB_ProcedureChunkList procs; + RDIB_ProcedureChunkList extern_procs; + RDIB_ProcedureChunkList static_procs; + RDIB_InlineSiteChunkList inline_sites; + RDIB_TypeChunkList types; + RDIB_TypeChunkList struct_list; + RDIB_TypeChunkList union_list; + RDIB_TypeChunkList enum_list; + RDIB_TypeChunkList param_types; + RDIB_TypeChunkList member_types; + RDIB_TypeChunkList enum_types; + RDIB_UDTMemberChunkList udt_members; + RDIB_UDTMemberChunkList enum_members; + + RDIB_SourceFile *null_src_file; + RDIB_LineTable *null_line_table; + RDIB_LineTableFragment *null_frag; + RDIB_Type *null_type; + RDIB_Scope *null_scope; + RDIB_Unit *null_unit; + RDIB_Procedure *null_proc; + RDIB_Variable *null_local; + RDIB_Variable *null_gvar; + RDIB_Variable *null_tvar; + RDIB_UDTMember *null_udt_member; + RDIB_UDTMember *null_enum_member; + RDIB_InlineSite *null_inline_site; + + 
RDIB_Type *variadic_type; + + //RDIB_TypeChunkList struct_list; + //RDIB_TypeChunkList union_list; + //RDIB_TypeChunkList enum_list; + //RDIB_TypeChunkList param_types; +} RDIB_Input; + +//////////////////////////////// +// Parallel For Tasks + +typedef struct +{ + U64 *udt_counts; +} RDIB_TypeStats; + +typedef struct +{ + RDIB_TypeChunk **chunks; + RDIB_TypeStats *type_stats; +} RDIB_TypeStatsTask; + +typedef struct +{ + Rng1U64 *ranges; + U64 *counts; + U64 *offsets; + RDIB_TypeChunk **type_chunks; + RDIB_StringMap *string_map; + union { + RDI_Member *udt_members_rdi; + RDI_EnumMember *enum_members_rdi; + }; +} RDIB_MembersTask; + +typedef struct +{ + RDIB_TypeChunk **type_chunks; + RDIB_TypeStats type_stats; + U64 *udt_base_idx; + RDI_UDT *udts; +} RDIB_UserDefinesTask; + +typedef struct +{ + U64 addr_size; + RDIB_StringMap *string_map; + RDIB_IndexRunMap *idx_run_map; + RDIB_TypeChunk **type_chunks; + RDIB_TypeStats type_stats; + RDI_TypeNode *type_nodes; +} RDIB_TypeNodesTask; + +typedef struct +{ + RDIB_StringMap *string_map; + Rng1U64 *ranges; + RDIB_StringMapUpdateFunc *string_map_update_func; + RDIB_StringMapBucket **free_buckets; + U64 *insert_counts; + U64 *element_indices; + union + { + RDIB_UnitChunk **units; + RDIB_BinarySection *sects; + RDIB_SourceFileChunk **src_file_chunks; + RDIB_VariableChunk **vars; + RDIB_ProcedureChunk **procs; + RDIB_InlineSiteChunk **inline_sites; + RDIB_UDTMemberChunk **udt_members; + RDIB_UDTMemberChunk **enum_members; + RDIB_TypeChunk **types; + RDIB_PathTreeNodeList *path_node_lists; + }; +} RDIB_CollectStringsTask; + +typedef struct +{ + RDIB_StringMap *string_map; + Rng1U64 *ranges; + RDIB_TypeChunk **chunks; + String8List *data_lists; +} RDIB_BuildTypeDataTask; + +typedef struct +{ + RDIB_StringMap *string_map; + Rng1U64 *ranges; + union { + struct { + RDIB_VariableChunk **gvars_rdib; + String8List *gvars_out; + }; + struct { + RDIB_VariableChunk **tvars_rdib; + String8List *tvars_out; + }; + struct { + 
RDIB_ProcedureChunk **procs_rdib; + String8List *procs_out; + }; + struct { + RDIB_ScopeChunk **scopes_rdib; + U64 *scope_voff_counts; + U64 *loc_data_sizes; + U64 *local_counts; + U64 *loc_block_counts; + U64 *scope_voff_offsets; + U64 *local_offsets; + U64 *loc_block_offsets; + U64 *loc_data_offsets; + U64 *scope_voffs_rdi; + RDI_Scope *scopes_rdi; + RDI_Local *locals_rdi; + RDI_LocationBlock *loc_blocks_rdi; + U8 *loc_data_rdi; + }; + }; +} RDIB_BuildSymbolSectionTask; + +typedef union +{ + struct { + U64 *counts; + U64 *offsets; + Rng1U64 *ranges; + RDIB_VMapRange *vmap; + union { + RDIB_UnitChunk **unit_chunks; + RDIB_VariableChunk **gvar_chunks; + RDIB_ScopeChunk **scope_chunks; + }; + }; + + struct { + U64 vmap_counts[3]; + RDIB_VMapRange *vmaps[3]; + String8List raw_vmaps[3]; + }; +} RDIB_VMapBuilderTask; + +typedef struct +{ + U64 sorter_idx; + RDI_NameMapKind name_map_kind; + RDIB_IndexRunMap *idx_run_map; + RDIB_IndexRunBucket **free_buckets; + Rng1U64 *ranges; + union { + RDIB_TypeChunk **type_chunks; + RDIB_StringMapBucket **name_map_buckets; + }; +} RDIB_BuildIndexRunsTask; + +typedef struct +{ + RDIB_IndexRunBucket **buckets; + Rng1U64 *ranges; + U32 *output_array; +} RDIB_IdxRunCopyTask; + +typedef struct +{ + RDIB_IndexRunMap *idx_run_map; + Rng1U64 *ranges; + U64 *counts; + U64 *offsets; + RDIB_IndexRunBucket **result; +} RDIB_GetExtantBucketsIndexRunMapTask; + +typedef struct +{ + U64 chunk_idx_opl; + Rng1U64 *ranges; + RDIB_IndexRunBucket **src; + RDIB_IndexRunBucket **dst; + U32 *chunk_histo; + U32 *chunk_offsets; +} RDIB_IndexRunMapRadixSort; + +typedef struct +{ + RDIB_StringMap *string_map; + RDIB_IndexRunMap *idx_run_map; + U64 *in_bucket_counts; + RDIB_StringMapBucket ***in_buckets; + + RDI_NameMapBucket **out_buckets; + RDI_NameMapNode **out_nodes; + U64 *out_bucket_counts; + U64 *out_node_counts; +} RDIB_NameMapBuilderTask; + +typedef struct +{ + RDIB_PathTree *path_tree; + RDIB_StringMap *string_map; + RDI_FilePathNode *nodes_dst; +} 
RDIB_BuildFilePathNodesTask; + +typedef struct +{ + RDIB_SourceFile **src_file_arr; + U32 *out_line_counts; + U32 *out_voff_counts; + U32 **out_line_nums; + U32 **out_line_ranges; + U64 **out_voffs; +} RDIB_SrcLineMapsTask; + +typedef struct +{ + RDIB_LineTableChunk **chunks; + Rng1U64 *ranges; + + U64 *out_line_table_counts; + U64 **out_line_table_voffs; + RDI_Line **out_line_table_lines; +} RDIB_BuildLineTablesTask; + +typedef struct +{ + Rng1U64 *ranges; + RDIB_StringMap *string_map; + RDIB_PathTree *path_tree; + RDIB_SourceFileChunk **src_file_chunks; + RDI_SourceFile *src_files_dst; +} RDIB_FillSourceFilesTask; + +//////////////////////////////// +// Data Model Helpers + +internal RDIB_DataModel rdib_infer_data_model(OperatingSystem os, RDI_Arch arch); + +internal RDI_TypeKind rdib_short_type_from_data_model (RDIB_DataModel data_model); +internal RDI_TypeKind rdib_unsigned_short_type_from_data_model (RDIB_DataModel data_model); +internal RDI_TypeKind rdib_int_type_from_data_model (RDIB_DataModel data_model); +internal RDI_TypeKind rdib_unsigned_int_type_from_data_model (RDIB_DataModel data_model); +internal RDI_TypeKind rdib_long_type_from_data_model (RDIB_DataModel data_model); +internal RDI_TypeKind rdib_unsigned_long_type_from_data_model (RDIB_DataModel data_model); +internal RDI_TypeKind rdib_long_long_type_from_data_model (RDIB_DataModel data_model); +internal RDI_TypeKind rdib_unsigned_long_long_type_from_data_model(RDIB_DataModel data_model); +internal RDI_TypeKind rdib_pointer_size_t_type_from_data_model (RDIB_DataModel data_model); + +//////////////////////////////// + +internal void rdib_udt_member_list_push_node (RDIB_UDTMemberList *list, RDIB_UDTMember *node); +internal void rdib_udt_member_list_concat_in_place(RDIB_UDTMemberList *list, RDIB_UDTMemberList *to_concat); + +internal RDIB_LineTableFragment * rdib_line_table_push(Arena *arena, RDIB_LineTable *list); + +//////////////////////////////// +// Chunk Lists + +// push +internal RDIB_Unit * 
rdib_unit_chunk_list_push (Arena *arena, RDIB_UnitChunkList *list, U64 cap); +internal RDIB_Scope * rdib_scope_chunk_list_push (Arena *arena, RDIB_ScopeChunkList *list, U64 cap); +internal RDIB_Procedure * rdib_procedure_chunk_list_push (Arena *arena, RDIB_ProcedureChunkList *list, U64 cap); +internal RDIB_Variable * rdib_variable_chunk_list_push (Arena *arena, RDIB_VariableChunkList *list, U64 cap); +internal RDIB_LineTable * rdib_line_table_chunk_list_push (Arena *arena, RDIB_LineTableChunkList *list, U64 cap); +internal RDIB_Type * rdib_type_chunk_list_push (Arena *arena, RDIB_TypeChunkList *list, U64 cap); +internal RDIB_UDTMember * rdib_udt_member_chunk_list_push (Arena *arena, RDIB_UDTMemberChunkList *list, U64 cap); +internal RDIB_SourceFile * rdib_source_file_chunk_list_push(Arena *arena, RDIB_SourceFileChunkList *list, U64 cap); +internal RDIB_InlineSite * rdib_inline_site_chunk_list_push(Arena *arena, RDIB_InlineSiteChunkList *list, U64 cap); + +internal RDIB_Scope * rdib_scope_chunk_list_push_zero (Arena *arena, RDIB_ScopeChunkList *list, U64 cap); +internal RDIB_Procedure * rdib_procedure_chunk_list_push_zero (Arena *arena, RDIB_ProcedureChunkList *list, U64 cap); +internal RDIB_Variable * rdib_variable_chunk_list_push_zero (Arena *arena, RDIB_VariableChunkList *list, U64 cap); +internal RDIB_LineTable * rdib_line_table_chunk_list_push_zero (Arena *arena, RDIB_LineTableChunkList *list, U64 cap); +internal RDIB_Type * rdib_type_chunk_list_push_zero (Arena *arena, RDIB_TypeChunkList *list, U64 cap); +internal RDIB_UDTMember * rdib_udt_member_chunk_list_push_zero (Arena *arena, RDIB_UDTMemberChunkList *list, U64 cap); +internal RDIB_SourceFile * rdib_source_file_chunk_list_push_zero(Arena *arena, RDIB_SourceFileChunkList *list, U64 cap); +internal RDIB_InlineSite * rdib_inline_site_chunk_list_push_zero(Arena *arena, RDIB_InlineSiteChunkList *list, U64 cap); + +// push many +internal RDIB_UnitChunk * rdib_unit_chunk_list_reserve_ex(Arena *arena, 
RDIB_UnitChunkList *list, U64 chunk_count, U64 item_count); + +internal void rdib_unit_chunk_list_reserve (Arena *arena, RDIB_UnitChunkList *list, U64 cap); +internal void rdib_type_chunk_list_reserve (Arena *arena, RDIB_TypeChunkList *list, U64 cap); +internal void rdib_source_file_chunk_list_reserve(Arena *arena, RDIB_SourceFileChunkList *list, U64 cap); + +// concat in place +internal void rdib_scope_chunk_list_concat_in_place (RDIB_ScopeChunkList *list, RDIB_ScopeChunkList *to_concat); +internal void rdib_udt_member_chunk_list_concat_in_place (RDIB_UDTMemberChunkList *list, RDIB_UDTMemberChunkList *to_concat); +internal void rdib_procedure_chunk_list_concat_in_place (RDIB_ProcedureChunkList *list, RDIB_ProcedureChunkList *to_concat); +internal void rdib_variable_chunk_list_concat_in_place (RDIB_VariableChunkList *list, RDIB_VariableChunkList *to_concat); +internal void rdib_inline_site_chunk_list_concat_in_place(RDIB_InlineSiteChunkList *list, RDIB_InlineSiteChunkList *to_concat); +internal void rdib_inline_site_chunk_list_concat_in_place(RDIB_InlineSiteChunkList *list, RDIB_InlineSiteChunkList *to_concat); +internal void rdib_type_chunk_list_concat_in_place (RDIB_TypeChunkList *list, RDIB_TypeChunkList *to_concat); +internal void rdib_source_file_chunk_list_concat_in_place(RDIB_SourceFileChunkList *list, RDIB_SourceFileChunkList *to_concat); + +// concat in place many +internal void rdib_type_chunk_list_concat_in_place_many (RDIB_TypeChunkList *list, RDIB_TypeChunkList *to_concat, U64 count); +internal void rdib_udt_member_chunk_list_concat_in_place_many(RDIB_UDTMemberChunkList *list, RDIB_UDTMemberChunkList *to_concat, U64 count); + +// array from chunk list +internal RDIB_UnitChunk ** rdib_array_from_unit_chunk_list (Arena *arena, RDIB_UnitChunkList list); +internal RDIB_ScopeChunk ** rdib_array_from_scope_chunk_list (Arena *arena, RDIB_ScopeChunkList list); +internal RDIB_ProcedureChunk ** rdib_array_from_procedure_chunk_list (Arena *arena, 
RDIB_ProcedureChunkList list); +internal RDIB_VariableChunk ** rdib_array_from_variable_chunk_list (Arena *arena, RDIB_VariableChunkList list); +internal RDIB_LineTableChunk ** rdib_array_from_line_table_chunk_list (Arena *arena, RDIB_LineTableChunkList list); +internal RDIB_InlineSiteChunk ** rdib_array_from_inline_site_chunk_list(Arena *arena, RDIB_InlineSiteChunkList list); +internal RDIB_UDTMemberChunk ** rdib_array_from_udt_member_chunk_list (Arena *arena, RDIB_UDTMemberChunkList list); +internal RDIB_TypeChunk ** rdib_array_from_type_chunk_list (Arena *arena, RDIB_TypeChunkList list); +internal RDIB_SourceFileChunk ** rdib_array_from_source_file_chunk_list(Arena *arena, RDIB_SourceFileChunkList list); + +// total count from chunk list +internal U64 rdib_unit_chunk_list_total_count (RDIB_UnitChunkList list); +internal U64 rdib_scope_chunk_list_total_count (RDIB_ScopeChunkList list); +internal U64 rdib_variable_chunk_list_total_count (RDIB_VariableChunkList list); +internal U64 rdib_line_table_chunk_list_total_count (RDIB_LineTableChunkList list); +internal U64 rdib_procedure_chunk_list_total_count (RDIB_ProcedureChunkList list); +internal U64 rdib_inline_site_chunk_list_total_count(RDIB_InlineSiteChunkList list); +internal U64 rdib_udt_member_chunk_list_total_count (RDIB_UDTMemberChunkList list); +internal U64 rdib_type_chunk_list_total_count (RDIB_TypeChunkList list); +internal U64 rdib_source_file_chunk_list_total_count(RDIB_SourceFileChunkList list); + +// pointer -> array index +internal U32 rdib_idx_from_unit (RDIB_Unit *n); +internal U32 rdib_idx_from_scope (RDIB_Scope *n); +internal U32 rdib_idx_from_variable (RDIB_Variable *n); +internal U32 rdib_idx_from_procedure (RDIB_Procedure *n); +internal U32 rdib_idx_from_source_file(RDIB_SourceFile *n); +internal U32 rdib_idx_from_line_table (RDIB_LineTable *n); +internal U32 rdib_idx_from_type (RDIB_Type *n); +internal U32 rdib_idx_from_udt_type (RDIB_Type *n); +internal U32 
rdib_idx_from_inline_site(RDIB_InlineSite *n); + +//////////////////////////////// + +//- Source File + +internal B32 rdib_source_file_match(RDIB_SourceFile *a, RDIB_SourceFile *b, OperatingSystem os); + +//- Eval Ops + +internal RDIB_EvalBytecodeOp * rdib_bytecode_push_op (Arena *arena, RDIB_EvalBytecode *bytecode, RDI_EvalOp op, RDI_U64 p); +internal void rdib_bytecode_push_ucsont(Arena *arena, RDIB_EvalBytecode *bytecode, RDI_U64 uconst); +internal void rdib_bytecode_push_sconst(Arena *arena, RDIB_EvalBytecode *bytecode, RDI_S64 sconst); + +//- Location + +internal RDIB_Location rdib_make_location_addr_byte_stream (Rng1U64List ranges, RDIB_EvalBytecode bytecode); +internal RDIB_Location rdib_make_location_addr_bytecode_stream (Rng1U64List ranges, RDIB_EvalBytecode bytecode); +internal RDIB_Location rdib_make_location_val_bytecode_stream (Rng1U64List ranges, RDIB_EvalBytecode bytecode); +internal RDIB_Location rdib_make_location_addr_reg_plus_u16 (Rng1U64List ranges, RDI_RegCode reg_code, RDI_U16 offset); + +internal RDIB_Location rdib_make_location_addr_addr_reg_plus_u16(Rng1U64List ranges, RDI_RegCode reg_code, RDI_U16 offset); +internal RDIB_Location rdib_make_location_addr_reg_plus_u16 (Rng1U64List ranges, RDI_RegCode reg_code, RDI_U16 offset); +internal RDIB_Location rdib_make_location_val_reg (Rng1U64List ranges, RDI_RegCode reg_code); + +internal RDIB_LocationNode * rdib_push_location_addr_reg_off(Arena *arena, RDIB_LocationList *list, RDI_Arch arch, RDI_RegCode reg_code, U32 reg_byte_size, U32 reg_byte_pos, S64 offset, B32 is_reference, Rng1U64List ranges); + +//- UDT Fwdrefs + +internal U64 rdib_udt_fwdref_map_hash(String8 string); +internal RDIB_UDTFwdrefBucket * rdib_udt_fwdref_map_insert_or_update(RDIB_UDTFwdrefBucket **buckets, U64 cap, U64 hash, RDIB_UDTFwdrefBucket *new_bucket); +internal RDIB_UDTFwdrefBucket * rdib_udt_fwdrefmap_map_lookup(RDIB_UDTFwdrefBucket **buckets, U64 cap, U64 hash, String8 name); + +//- Types + +internal RDIB_TypeRef 
rdib_make_type_ref(Arena *arena, RDIB_Type *type); +internal void rdib_deref_type_refs(TP_Context *tp, RDIB_TypeChunkList *list); + +internal RDIB_TypeStats rdib_sum_type_stats (RDIB_TypeStats *stats, U64 count); +internal U64 rdib_udt_count_from_type_stats (RDIB_TypeStats *stats); +internal U64 rdib_type_node_count_from_type_stats(RDIB_TypeStats *stats); + +internal U64 rdib_size_from_type (RDIB_Type *type); +internal U64 rdib_count_members_deep(RDIB_Type *type); + +//- Path Tree + +internal RDIB_PathTree * rdib_path_tree_init (Arena *arena, U64 list_count); +internal void rdib_path_tree_insert (Arena *arena, RDIB_PathTree *tree, String8 path, RDIB_SourceFile *src_file); +internal U32 rdib_idx_from_path_tree(RDIB_PathTree *tree, String8 path); + + +//- String Map + +internal U64 rdib_string_map_hash (String8 string); +internal RDIB_StringMap * rdib_init_string_map (Arena *arena, U64 cap); +internal U32 rdib_idx_from_string_map (RDIB_StringMap *string_map, String8 string); +internal RDIB_StringMapBucket * rdib_string_map_insert_or_update (RDIB_StringMapBucket **buckets, U64 cap, U64 hash, RDIB_StringMapBucket *new_bucket, RDIB_StringMapUpdateFunc *update_func); +internal void rdib_string_map_assign_indices (RDIB_StringMapBucket **buckets, U64 bucket_count); +internal RDIB_StringMapBucket ** rdib_extant_buckets_from_string_map(TP_Context *tp, Arena *arena, RDIB_StringMap *string_map, U64 *bucket_count_out); +internal void rdib_string_map_sort_buckets (TP_Context *tp, RDIB_StringMapBucket **buckets, U64 bucket_count, U64 max_sorter); + +//- String Map Specialized Inserters + +internal void rdib_string_map_insert_item (Arena *arena, RDIB_CollectStringsTask *task, U64 task_id, String8 string, void *value); +internal void rdib_string_map_insert_string_table_item(Arena *arena, RDIB_CollectStringsTask *task, U64 task_id, String8 string); +internal void rdib_string_map_insert_name_map_item (Arena *arena, RDIB_CollectStringsTask *task, U64 task_id, String8 string, VoidNode 
*node); + +//- Index Run Map + +internal U64 rdib_index_run_hash (U32 count, U32 *idxs); +internal RDIB_IndexRunMap * rdib_init_index_run_map (Arena *arena, U64 cap); +internal RDIB_IndexRunBucket * rdib_index_run_map_insert_or_update (Arena *arena, RDIB_IndexRunBucket **buckets, U64 cap, U64 hash, RDIB_IndexRunBucket *new_bucket, U64 *bucket_idx_out); +internal U32 rdib_idx_run_from_bucket_idx (RDIB_IndexRunMap *map, U64 bucket_idx); +internal void rdib_index_run_map_assign_indices (RDIB_IndexRunBucket **buckets, U64 bucket_count); +internal RDIB_IndexRunBucket ** rdib_extant_buckets_from_index_run_map(TP_Context *tp, Arena *arena, RDIB_IndexRunMap *idx_run_map, U64 *bucket_count_out); +internal void rdib_index_run_map_sort_buckets (TP_Context *tp, RDIB_IndexRunBucket **buckets, U64 bucket_count, U64 chunk_idx_opl); + +//- Index Map Spesialized Query + +internal U32 rdib_idx_from_params(RDIB_IndexRunMap *map, RDIB_Type *params); + +//- Data Sections + +internal void rdib_data_section_list_push_node (RDIB_DataSectionList *list, RDIB_DataSectionNode *node); +internal RDIB_DataSectionNode * rdib_data_section_list_push (Arena *arena, RDIB_DataSectionList *list, RDIB_DataSection v); +internal void rdib_data_section_list_concat_in_place(RDIB_DataSectionList *list, RDIB_DataSectionList *to_concat); + +internal void rdib_data_sections_from_top_level_info (Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_TopLevelInfo *src); +internal void rdib_data_sections_from_binary_sections (Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_BinarySection *binary_sects, U64 binary_sects_count); +internal void rdib_data_sections_from_string_map (TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMapBucket **buckets, U64 bucket_count); +internal void rdib_data_sections_from_index_runs (TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_IndexRunBucket **buckets, U64 bucket_count); +internal 
void rdib_data_sections_from_file_path_nodes (TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_PathTree *tree); +internal void rdib_data_sections_from_source_files (TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_PathTree *path_tree, U64 total_src_file_count, U64 src_file_chunk_count, RDIB_SourceFileChunk **src_file_chunks); +internal void rdib_data_sections_from_units (Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_PathTree *path_tree, U64 total_unit_count, U64 unit_chunk_count, RDIB_UnitChunk **unit_chunks); +internal void rdib_data_sections_from_string_map (TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMapBucket **buckets, U64 bucket_count); +internal void rdib_data_sections_from_types(TP_Context *tp, + Arena *arena, + RDIB_DataSectionList *sect_list, + RDI_Arch arch, + RDIB_StringMap *string_map, + RDIB_IndexRunMap *idx_run_map, + U64 udt_member_chunk_count, + RDIB_TypeChunk **udt_member_type_chunks, + U64 enum_member_chunk_count, + RDIB_TypeChunk **enum_member_type_chunks, + U64 total_type_node_count, + U64 type_chunk_count, + RDIB_TypeChunk **type_chunks, + RDIB_TypeStats type_stats); +internal void rdib_data_sections_from_global_variables (TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, U64 total_count, U64 chunk_count, RDIB_VariableChunk **chunks); +internal void rdib_data_sections_from_thread_variables (TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, U64 total_count, U64 chunk_count, RDIB_VariableChunk **chunks); +internal void rdib_data_sections_from_procedures (TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, U64 total_count, U64 chunk_count, RDIB_ProcedureChunk **chunks); +internal void rdib_data_sections_from_scopes (TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, 
RDIB_StringMap *string_map, U64 total_count, U64 chunk_count, RDIB_ScopeChunk **chunks);
+internal void rdib_data_sections_from_name_maps (TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, RDIB_IndexRunMap *idx_run_map, RDIB_StringMapBucket **src_name_maps[RDI_NameMapKind_COUNT], U64 src_name_map_counts[RDI_NameMapKind_COUNT]);
+internal void rdib_data_sections_from_source_line_maps (TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, U64 total_src_file_count, U64 src_file_chunk_count, RDIB_SourceFileChunk **src_fille_chunks);
+internal void rdib_data_sections_from_unit_gvar_scope_vmaps(TP_Context *tp, TP_Arena *arena, RDIB_DataSectionList *sect_list, U64 unit_chunk_count, RDIB_UnitChunk **unit_chunks, U64 gvar_chunk_count, RDIB_VariableChunk **gvar_chunks, U64 scope_chunk_count, RDIB_ScopeChunk **scope_chunks);
+internal void rdib_data_sections_from_inline_sites (TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list, RDIB_StringMap *string_map, U64 total_inline_site_count, U64 inline_site_chunk_count, RDIB_InlineSiteChunk **inline_site_chunks);
+internal void rdib_data_sections_from_checksums (TP_Context *tp, Arena *arena, RDIB_DataSectionList *sect_list);
+
+internal RDIB_Input rdib_init_input(Arena *arena);
+internal String8List rdib_finish(TP_Context *tp, TP_Arena *arena, RDIB_Input *input);
+
+
diff --git a/src/linker/rdi/rdi_coff.c b/src/linker/rdi/rdi_coff.c
new file mode 100644
index 00000000..07a3892f
--- /dev/null
+++ b/src/linker/rdi/rdi_coff.c
@@ -0,0 +1,59 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+// Translates a COFF machine type into the matching RDI architecture.
+// Only X86 and X64 are mapped; every other listed machine type runs
+// NotImplemented, and unlisted values return RDI_Arch_NULL via `default`.
+internal RDI_Arch
+rdi_arch_from_coff_machine(COFF_MachineType machine)
+{
+  switch (machine) {
+    case COFF_MachineType_X86: return RDI_Arch_X86;
+    case COFF_MachineType_X64: return RDI_Arch_X64;
+
+    case COFF_MachineType_UNKNOWN:
+    case COFF_MachineType_AM33:
+    case COFF_MachineType_ARM:
+    case COFF_MachineType_ARM64:
+    case COFF_MachineType_ARMNT:
+    case COFF_MachineType_EBC:
+    case COFF_MachineType_IA64:
+    case COFF_MachineType_M32R:
+    case COFF_MachineType_MIPS16:
+    case COFF_MachineType_MIPSFPU:
+    case COFF_MachineType_MIPSFPU16:
+    case COFF_MachineType_POWERPC:
+    case COFF_MachineType_POWERPCFP:
+    case COFF_MachineType_R4000:
+    case COFF_MachineType_RISCV32:
+    case COFF_MachineType_RISCV64:
+    case COFF_MachineType_SH3:
+    case COFF_MachineType_SH3DSP:
+    case COFF_MachineType_SH4:
+    case COFF_MachineType_SH5:
+    case COFF_MachineType_THUMB:
+    case COFF_MachineType_WCEMIPSV2:
+      NotImplemented;
+      // no return here: unless NotImplemented halts, control continues into default
+    default:
+      return RDI_Arch_NULL;
+  }
+}
+
+// Converts the COFF MEM_READ / MEM_WRITE / MEM_EXECUTE section flags into the
+// corresponding RDI binary section flags; all other COFF flag bits are ignored.
+internal RDI_BinarySectionFlags
+rdi_binary_section_flags_from_coff_section_flags(COFF_SectionFlags flags)
+{
+  RDI_BinarySectionFlags result = 0;
+  if (flags & COFF_SectionFlag_MEM_READ) {
+    result |= RDI_BinarySectionFlag_Read;
+  }
+  if (flags & COFF_SectionFlag_MEM_WRITE) {
+    result |= RDI_BinarySectionFlag_Write;
+  }
+  if (flags & COFF_SectionFlag_MEM_EXECUTE) {
+    result |= RDI_BinarySectionFlag_Execute;
+  }
+  return result;
+}
diff --git a/src/linker/rdi/rdi_coff.h b/src/linker/rdi/rdi_coff.h
new file mode 100644
index 00000000..d946e237
--- /dev/null
+++ b/src/linker/rdi/rdi_coff.h
@@ -0,0 +1,8 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+internal RDI_Arch               rdi_arch_from_coff_machine(COFF_MachineType machine);
+internal RDI_BinarySectionFlags rdi_binary_section_flags_from_coff_section_flags(COFF_SectionFlags flags);
+
+
diff --git a/src/linker/rdi/rdi_cv.c b/src/linker/rdi/rdi_cv.c
new file mode 100644
index 00000000..330161d0
--- /dev/null
+++ b/src/linker/rdi/rdi_cv.c
@@ -0,0 +1,266 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+// Translates a CodeView architecture id into the matching RDI architecture.
+// CV_Arch_8086 maps to X86 and CV_Arch_X64 to X64; every other listed id runs
+// NotImplemented, and unlisted values return RDI_Arch_NULL via `default`.
+internal RDI_Arch
+rdi_arch_from_cv_arch(CV_Arch arch)
+{
+  switch (arch) {
+    case CV_Arch_8086: return RDI_Arch_X86;
+    case CV_Arch_X64: return RDI_Arch_X64;
+
+    case CV_Arch_8080:
+    case CV_Arch_80286:
+    case CV_Arch_80386:
+    case CV_Arch_80486:
+    case CV_Arch_PENTIUM:
+    case CV_Arch_PENTIUMII:
+    case CV_Arch_PENTIUMIII:
+    case CV_Arch_MIPS:
+    case CV_Arch_MIPS16:
+    case CV_Arch_MIPS32:
+    case CV_Arch_MIPS64:
+    case CV_Arch_MIPSI:
+    case CV_Arch_MIPSII:
+    case CV_Arch_MIPSIII:
+    case CV_Arch_MIPSIV:
+    case CV_Arch_MIPSV:
+    case CV_Arch_M68000:
+    case CV_Arch_M68010:
+    case CV_Arch_M68020:
+    case CV_Arch_M68030:
+    case CV_Arch_M68040:
+    case CV_Arch_ALPHA:
+    case CV_Arch_ALPHA_21164:
+    case CV_Arch_ALPHA_21164A:
+    case CV_Arch_ALPHA_21264:
+    case CV_Arch_ALPHA_21364:
+    case CV_Arch_PPC601:
+    case CV_Arch_PPC603:
+    case CV_Arch_PPC604:
+    case CV_Arch_PPC620:
+    case CV_Arch_PPCFP:
+    case CV_Arch_PPCBE:
+    case CV_Arch_SH3:
+    case CV_Arch_SH3E:
+    case CV_Arch_SH3DSP:
+    case CV_Arch_SH4:
+    case CV_Arch_SHMEDIA:
+    case CV_Arch_ARM3:
+    case CV_Arch_ARM4:
+    case CV_Arch_ARM4T:
+    case CV_Arch_ARM5:
+    case CV_Arch_ARM5T:
+    case CV_Arch_ARM6:
+    case CV_Arch_ARM_XMAC:
+    case CV_Arch_ARM_WMMX:
+    case CV_Arch_ARM7:
+    case CV_Arch_OMNI:
+    case CV_Arch_IA64_1:
+    case CV_Arch_IA64_2:
+    case CV_Arch_CEE:
+    case CV_Arch_AM33:
+    case CV_Arch_M32R:
+    case CV_Arch_TRICORE:
+    case CV_Arch_EBC:
+    case CV_Arch_THUMB:
+    case CV_Arch_ARMNT:
+    case CV_Arch_ARM64:
+    case CV_Arch_D3D11_SHADER:
+      NotImplemented;
+      // no return here: unless NotImplemented halts, control continues into default
+    default:
+      return RDI_Arch_NULL;
+  }
+}
+
+// Translates a CodeView source language id into an RDI language.
+// C, C++ and MASM have RDI equivalents; LINK and CVTRES map to RDI_Language_NULL.
+// The remaining listed languages run NotImplemented; unlisted values return NULL.
+internal RDI_Language
+rdi_language_from_cv_language(CV_Language language)
+{
+  switch (language) {
+    case CV_Language_C: return RDI_Language_C;
+    case CV_Language_CXX: return RDI_Language_CPlusPlus;
+    case CV_Language_MASM: return RDI_Language_Masm;
+    case CV_Language_LINK: return RDI_Language_NULL;
+    case CV_Language_CVTRES: return RDI_Language_NULL;
+
+    case CV_Language_FORTRAN:
+    case CV_Language_PASCAL:
+    case CV_Language_BASIC:
+    case CV_Language_COBOL:
+    case CV_Language_CVTPGD:
+    case CV_Language_CSHARP:
+    case CV_Language_VB:
+    case CV_Language_ILASM:
+    case CV_Language_JAVA:
+    case CV_Language_JSCRIPT:
+    case CV_Language_MSIL:
+    case CV_Language_HLSL:
+      NotImplemented;
+      // no return here: unless NotImplemented halts, control continues into default
+    default:
+      return RDI_Language_NULL;
+  }
+}
+
+// Copies the const/volatile bits of CodeView modifier flags into RDI type modifier flags.
+internal RDI_TypeModifierFlags
+rdi_type_modifier_flags_from_cv_modifier_flags(CV_ModifierFlags flags)
+{
+  RDI_TypeModifierFlags result = 0;
+  if (flags & CV_ModifierFlag_Const) {
+    result |= RDI_TypeModifierFlag_Const;
+  }
+  if (flags & CV_ModifierFlag_Volatile) {
+    result |= RDI_TypeModifierFlag_Volatile;
+  }
+  return result;
+}
+
+// Copies the const/volatile bits of CodeView pointer attributes into RDI type modifier flags.
+internal RDI_TypeModifierFlags
+rdi_type_modifier_flags_from_cv_pointer_attribs(CV_PointerAttribs attribs)
+{
+  RDI_TypeModifierFlags result = 0;
+  if (attribs & CV_PointerAttrib_Const) {
+    result |= RDI_TypeModifierFlag_Const;
+  }
+  if (attribs & CV_PointerAttrib_Volatile) {
+    result |= RDI_TypeModifierFlag_Volatile;
+  }
+  return result;
+}
+
+// Picks the RDI type kind for a CodeView pointer: Ptr unless the attribute bits
+// or the pointer mode mark it as an l-value or r-value reference. The mode is
+// tested after the attributes, so it decides the result when the two disagree.
+internal RDI_TypeKind
+rdi_type_kind_from_pointer(CV_PointerAttribs attribs, CV_PointerMode mode)
+{
+  RDI_TypeKind result = RDI_TypeKind_Ptr;
+
+  if (attribs & CV_PointerAttrib_LRef) {
+    result = RDI_TypeKind_LRef;
+  } else if (attribs & CV_PointerAttrib_RRef) {
+    result = RDI_TypeKind_RRef;
+  }
+
+  if (mode == CV_PointerMode_LRef) {
+    result = RDI_TypeKind_LRef;
+  } else if (mode == CV_PointerMode_RRef) {
+    result = RDI_TypeKind_RRef;
+  }
+
+  return result;
+}
+
+// Maps a CodeView basic type to an RDI type kind. Several listed types (for
+// example NOTYPE, CURRENCY and BIT) as well as any unlisted value produce
+// RDI_TypeKind_NULL; the BOOL8..BOOL64 types map to same-width signed integers.
+internal RDI_TypeKind
+rdi_type_kind_from_cv_basic_type(CV_BasicType basic_type)
+{
+  switch (basic_type) {
+    case CV_BasicType_NOTYPE    : return RDI_TypeKind_NULL;
+    case CV_BasicType_ABS       : return RDI_TypeKind_NULL;
+    case CV_BasicType_SEGMENT   : return RDI_TypeKind_NULL;
+    case CV_BasicType_VOID      : return RDI_TypeKind_Void;
+    case CV_BasicType_CURRENCY  : return RDI_TypeKind_NULL;
+    case CV_BasicType_NBASICSTR : return RDI_TypeKind_NULL;
+    case CV_BasicType_FBASICSTR : return RDI_TypeKind_NULL;
+    case CV_BasicType_HRESULT   : return RDI_TypeKind_Handle;
+    case CV_BasicType_CHAR      : return RDI_TypeKind_Char8;
+    case CV_BasicType_SHORT     : return RDI_TypeKind_S16;
+    case CV_BasicType_LONG      : return RDI_TypeKind_S32;
+    case CV_BasicType_QUAD      : return RDI_TypeKind_S64;
+    case CV_BasicType_OCT       : return RDI_TypeKind_S128;
+    case CV_BasicType_UCHAR     : return RDI_TypeKind_UChar8;
+    case CV_BasicType_USHORT    : return RDI_TypeKind_U16;
+    case CV_BasicType_ULONG     : return RDI_TypeKind_U32;
+    case CV_BasicType_UQUAD     : return RDI_TypeKind_U64;
+    case CV_BasicType_UOCT      : return RDI_TypeKind_U128;
+    case CV_BasicType_BOOL8     : return RDI_TypeKind_S8;
+    case CV_BasicType_BOOL16    : return RDI_TypeKind_S16;
+    case CV_BasicType_BOOL32    : return RDI_TypeKind_S32;
+    case CV_BasicType_BOOL64    : return RDI_TypeKind_S64;
+    case CV_BasicType_FLOAT32   : return RDI_TypeKind_F32;
+    case CV_BasicType_FLOAT64   : return RDI_TypeKind_F64;
+    case CV_BasicType_FLOAT80   : return RDI_TypeKind_F80;
+    case CV_BasicType_FLOAT128  : return RDI_TypeKind_F128;
+    case CV_BasicType_FLOAT48   : return RDI_TypeKind_F48;
+    case CV_BasicType_FLOAT32PP : return RDI_TypeKind_F32PP;
+    case CV_BasicType_FLOAT16   : return RDI_TypeKind_F16;
+    case CV_BasicType_COMPLEX32 : return RDI_TypeKind_ComplexF32;
+    case CV_BasicType_COMPLEX64 : return RDI_TypeKind_ComplexF64;
+    case CV_BasicType_COMPLEX80 : return RDI_TypeKind_ComplexF80;
+    case CV_BasicType_COMPLEX128: return RDI_TypeKind_ComplexF128;
+    case CV_BasicType_BIT       : return RDI_TypeKind_NULL;
+    case CV_BasicType_PASCHAR   : return RDI_TypeKind_NULL;
+    case CV_BasicType_BOOL32FF  : return RDI_TypeKind_NULL;
+    case CV_BasicType_INT8      : return RDI_TypeKind_S8;
+    case CV_BasicType_UINT8     : return RDI_TypeKind_U8;
+    case CV_BasicType_RCHAR     : return RDI_TypeKind_Char8;
+    case CV_BasicType_WCHAR     : return RDI_TypeKind_UChar16;
+    case CV_BasicType_CHAR16    : return RDI_TypeKind_Char16;
+    case CV_BasicType_CHAR32    : return RDI_TypeKind_Char32;
+    case CV_BasicType_INT16     : return RDI_TypeKind_S16;
+    case CV_BasicType_UINT16    : return RDI_TypeKind_U16;
+    case CV_BasicType_INT32     : return RDI_TypeKind_S32;
+    case CV_BasicType_UINT32    : return RDI_TypeKind_U32;
+    case CV_BasicType_INT64     : return RDI_TypeKind_S64;
+    case CV_BasicType_UINT64    : return RDI_TypeKind_U64;
+    case CV_BasicType_INT128    : return RDI_TypeKind_S128;
+    case CV_BasicType_UINT128   : return RDI_TypeKind_U128;
+    case CV_BasicType_CHAR8     : return RDI_TypeKind_Char8;
+    case CV_BasicType_PTR       : return RDI_TypeKind_Ptr;
+  }
+  return RDI_TypeKind_NULL;
+}
+
+// Translates a CodeView register id into an RDI register code for `arch`.
+// The case labels are expanded from the CV_Reg_X86_XList / CV_Reg_X64_XList
+// X-macros; a register with no matching case leaves the result at 0, and any
+// architecture other than CV_Arch_8086 / CV_Arch_X64 runs NotImplemented.
+internal RDI_RegCode
+rdi_reg_code_from_cv(CV_Arch arch, CV_Reg reg)
+{
+  RDI_RegCode result = 0;
+  switch (arch) {
+    case CV_Arch_8086: {
+      switch (reg) {
+#define X(CVN,C,RDN,BP,BZ) case C: result = RDI_RegCodeX86_##RDN; break;
+        CV_Reg_X86_XList(X)
+#undef X
+      }
+    } break;
+    case CV_Arch_X64: {
+      switch (reg) {
+#define X(CVN,C,RDN,BP,BZ) case C: result = RDI_RegCodeX64_##RDN; break;
+        CV_Reg_X64_XList(X)
+#undef X
+      }
+    } break;
+    default: NotImplemented;
+  }
+  return result;
+}
+
+// Maps a CodeView C13 checksum kind to the RDI checksum kind. A kind outside
+// the four listed cases runs InvalidPath before the RDI_Checksum_Null return.
+internal RDI_ChecksumKind
+rdi_checksum_from_cv_c13(CV_C13ChecksumKind kind)
+{
+  switch (kind) {
+    case CV_C13ChecksumKind_Null: return RDI_Checksum_Null;
+    case CV_C13ChecksumKind_MD5: return RDI_Checksum_MD5;
+    case CV_C13ChecksumKind_SHA1: return RDI_Checksum_SHA1;
+    case CV_C13ChecksumKind_SHA256: return RDI_Checksum_SHA256;
+  }
+  InvalidPath;
+  return RDI_Checksum_Null;
+}
diff --git a/src/linker/rdi/rdi_cv.h b/src/linker/rdi/rdi_cv.h
new file mode 100644
index 00000000..0201c688
--- /dev/null
+++ b/src/linker/rdi/rdi_cv.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2024 Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+internal RDI_Arch              rdi_arch_from_cv_arch(CV_Arch arch);
+internal RDI_Language          rdi_language_from_cv_language(CV_Language language);
+internal RDI_TypeModifierFlags rdi_type_modifier_flags_from_cv_modifier_flags(CV_ModifierFlags flags);
+internal RDI_TypeModifierFlags rdi_type_modifier_flags_from_cv_pointer_attribs(CV_PointerAttribs attribs);
+internal RDI_TypeKind          rdi_type_kind_from_pointer(CV_PointerAttribs attribs, CV_PointerMode mode);
+internal RDI_TypeKind          rdi_type_kind_from_cv_basic_type(CV_BasicType basic_type);
+internal RDI_RegCode           rdi_reg_code_from_cv(CV_Arch arch, CV_Reg reg);
+
+internal RDI_ChecksumKind rdi_checksum_from_cv_c13(CV_C13ChecksumKind kind);
+
+
diff --git a/src/linker/rdi/rdi_overrides.h b/src/linker/rdi/rdi_overrides.h
new file mode 100644
index 00000000..b593000e
--- /dev/null +++ b/src/linker/rdi/rdi_overrides.h @@ -0,0 +1,20 @@ +#pragma once + +typedef U8 RDI_U8; +typedef U16 RDI_U16; +typedef U32 RDI_U32; +typedef U64 RDI_U64; +typedef S8 RDI_S8; +typedef S16 RDI_S16; +typedef S32 RDI_S32; +typedef S64 RDI_S64; + +#define RDI_PROC internal +#define RDIM_MEMSET_OVERRIDE +#define rdim_memset MemorySet + +#define RDIM_MEMCPY_OVERRIDE +#define rdim_memcpy MemoryCopy + +#define rdim_vsnprintf raddbg_vsnprintf + diff --git a/src/linker/third_party_ext/blake3/asm/LICENSE b/src/linker/third_party_ext/blake3/asm/LICENSE new file mode 100644 index 00000000..f5892efc --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/LICENSE @@ -0,0 +1,330 @@ +This work is released into the public domain with CC0 1.0. Alternatively, it is +licensed under the Apache License 2.0. + +------------------------------------------------------------------------------- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). 
+ +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. 
database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + +------------------------------------------------------------------------------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/src/linker/third_party_ext/blake3/asm/blake3.c b/src/linker/third_party_ext/blake3/asm/blake3.c new file mode 100644 index 00000000..692f4b02 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3.c @@ -0,0 +1,616 @@ +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#include "blake3.h" +#include "blake3_impl.h" + +const char *blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + 
memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. 
+INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | 
chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. 
Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. 
+ if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide(const uint8_t *input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, uint8_t *out) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. 
+ if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, + chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide( + right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. 
+ if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void compress_subtree_to_parent_node( + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + // The second half of this loop condition is always true, and we just + // asserted it above. But GCC can't tell that it's always true, and if NDEBUG + // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious + // warnings here. 
GCC 8.5 is particularly sensitive, so if you're changing + // this code, test it against that version. + while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. 
Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". 
That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. 
+ if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. 
For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. 
In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. 
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +void blake3_hasher_reset(blake3_hasher *self) { + chunk_state_reset(&self->chunk, self->key, 0); + self->cv_stack_len = 0; +} diff --git a/src/linker/third_party_ext/blake3/asm/blake3.h b/src/linker/third_party_ext/blake3/asm/blake3.h new file mode 100644 index 00000000..f694dcf2 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3.h @@ -0,0 +1,82 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include <stddef.h> +#include <stdint.h> + +#if !defined(BLAKE3_API) +# if defined(_WIN32) || defined(__CYGWIN__) +# if defined(BLAKE3_DLL) +# if defined(BLAKE3_DLL_EXPORTS) +# define BLAKE3_API __declspec(dllexport) +# else +# define BLAKE3_API __declspec(dllimport) +# endif +# define BLAKE3_PRIVATE +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +# elif __GNUC__ >= 4 +# define BLAKE3_API __attribute__((visibility("default"))) +# define BLAKE3_PRIVATE __attribute__((visibility("hidden"))) +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "1.5.0" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. 
+typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +BLAKE3_API const char *blake3_version(void); +BLAKE3_API void blake3_hasher_init(blake3_hasher *self); +BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); +BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/src/linker/third_party_ext/blake3/asm/blake3_avx2_x86-64_unix.S b/src/linker/third_party_ext/blake3/asm/blake3_avx2_x86-64_unix.S new file mode 100644 index 00000000..812bb856 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_avx2_x86-64_unix.S @@ -0,0 +1,1815 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && 
defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] 
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups 
xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, 
ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr 
[rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, 
ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + 
vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb 
ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, 
ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd 
ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + 
vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, 
ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb 
ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + 
vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, 
ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 
+ vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, 
ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb 
ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, 
ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups 
ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] 
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, 
ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E 
+ vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr 
[rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, 
ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + 
vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd 
xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/src/linker/third_party_ext/blake3/asm/blake3_avx2_x86-64_windows_msvc.asm b/src/linker/third_party_ext/blake3/asm/blake3_avx2_x86-64_windows_msvc.asm new file mode 100644 index 00000000..352298ed --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_avx2_x86-64_windows_msvc.asm @@ -0,0 +1,1828 @@ +public _blake3_hash_many_avx2 +public blake3_hash_many_avx2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx2 PROC +_blake3_hash_many_avx2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + 
push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+2D0H], xmm6 + vmovdqa xmmword ptr [rsp+2E0H], xmm7 + vmovdqa xmmword ptr [rsp+2F0H], xmm8 + vmovdqa xmmword ptr [rsp+300H], xmm9 + vmovdqa xmmword ptr [rsp+310H], xmm10 + vmovdqa xmmword ptr [rsp+320H], xmm11 + vmovdqa xmmword ptr [rsp+330H], xmm12 + vmovdqa xmmword ptr [rsp+340H], xmm13 + vmovdqa xmmword ptr [rsp+350H], xmm14 + vmovdqa xmmword ptr [rsp+360H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+260H], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0] + vpand ymm2, ymm0, ymmword ptr [ADD1] + vmovdqa ymmword ptr [rsp+2A0H], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+220H], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm3 + shl rdx, 6 + mov qword ptr [rsp+2C0H], rdx + cmp rsi, 8 + jc final7blocks +outerloop8: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, 
qword ptr [rsp+2C0H] + cmove eax, ebx + mov dword ptr [rsp+200H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+20H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+40H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+60H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+80H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0E0H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] 
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+100H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+120H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+140H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+160H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+180H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+1A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+1C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+1E0H], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+200H] + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+220H] + vpxor 
ymm13, ymm1, ymmword ptr [rsp+240H] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + 
vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + 
vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, 
ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, 
ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, 
ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] + vpaddd ymm1, ymm1, 
ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + 
vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + 
vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + 
vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor 
ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + 
vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, 
ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld 
ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + 
vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, 
xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + 
vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+2A0H] + vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] + vmovdqa ymmword ptr [rsp+220H], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+240H] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + sub rsi, 8 + cmp rsi, 8 + jnc outerloop8 + test rsi, rsi + jnz final7blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+2D0H] + vmovdqa xmm7, xmmword ptr [rsp+2E0H] + vmovdqa xmm8, xmmword ptr [rsp+2F0H] + vmovdqa xmm9, xmmword ptr [rsp+300H] + vmovdqa xmm10, xmmword ptr [rsp+310H] + vmovdqa xmm11, xmmword ptr [rsp+320H] + vmovdqa xmm12, xmmword ptr [rsp+330H] + vmovdqa xmm13, xmmword ptr [rsp+340H] + vmovdqa xmm14, xmmword ptr [rsp+350H] + vmovdqa xmm15, xmmword ptr [rsp+360H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+2C0H] + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + test rsi, 4H + je final3blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+220H] + vbroadcasti128 ymm13, xmmword ptr [rsp+240H] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 50H + vpermq ymm15, ymm15, 50H + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] + vpblendd ymm14, ymm14, ymm12, 44H + vpblendd ymm15, ymm15, ymm12, 44H + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+20H], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword 
ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vmovups ymm2, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + vmovups ymm10, ymmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 93H + vpshufd ymm15, ymm15, 93H + vpbroadcastd ymm2, dword ptr [rsp+200H] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+20H] + vpblendd ymm3, ymm3, ymm2, 88H + vpblendd ymm11, ymm11, ymm2, 88H + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vmovdqa ymm10, ymm2 + mov al, 7 +roundloop4: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+40H], ymm4 + nop + vmovdqa ymmword ptr [rsp+60H], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, 
ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+80H], ymm5 + vmovdqa ymmword ptr [rsp+0A0H], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 93H + vpshufd ymm8, ymm8, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 39H + vpshufd ymm10, ymm10, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + 
vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 39H + vpshufd ymm8, ymm8, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 93H + vpshufd ymm10, ymm10, 93H + dec al + je endroundloop4 + vmovdqa ymm4, ymmword ptr [rsp+40H] + vmovdqa ymm5, ymmword ptr [rsp+80H] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0FH + vpshufd ymm4, ymm12, 39H + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0AAH + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 88H + vpshufd ymm12, ymm12, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymmword ptr [rsp+40H], ymm13 + vmovdqa ymmword ptr [rsp+80H], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+60H] + vmovdqa ymm13, ymmword ptr [rsp+0A0H] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0FH + vpshufd ymm12, ymm5, 39H + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0AAH + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 88H + vpshufd ymm5, ymm5, 78H + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 1EH + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+40H] + vmovdqa ymm6, ymmword ptr [rsp+80H] + jmp roundloop4 +endroundloop4: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqu xmmword ptr [rbx+40H], xmm8 + vmovdqu xmmword ptr [rbx+50H], xmm9 + vextracti128 xmmword ptr [rbx+60H], ymm8, 01H + vextracti128 xmmword ptr [rbx+70H], ymm9, 01H + vmovaps xmm8, xmmword ptr [rsp+260H] + vmovaps xmm0, xmmword ptr [rsp+220H] + vmovaps xmm1, xmmword ptr [rsp+230H] + vmovaps xmm2, xmmword ptr [rsp+240H] + vmovaps xmm3, xmmword ptr [rsp+250H] + vblendvps 
xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+220H], xmm0 + vmovaps xmmword ptr [rsp+240H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test rsi, 2H + je final1blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp+220H] + vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+224H] + vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + vbroadcasti128 ymm14, xmmword ptr [ROT16] + vbroadcasti128 ymm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+200H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb 
ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovaps ymm8, ymmword ptr [rsp+260H] + vmovaps ymm0, ymmword ptr [rsp+220H] + vmovups ymm1, ymmword ptr [rsp+228H] + vmovaps ymm2, ymmword ptr [rsp+240H] + vmovups ymm3, ymmword ptr [rsp+248H] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+220H], ymm0 + vmovaps ymmword ptr [rsp+240H], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1blocks: + test rsi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm3, dword ptr [rsp+220H] + vpinsrd xmm3, 
xmm3, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm14, xmmword ptr [ROT16] + vmovdqa xmm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps 
xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx2 ENDP +blake3_hash_many_avx2 ENDP +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + +ADD1: + dd 8 dup (8) + +BLAKE3_IV_0: + dd 8 dup (6A09E667H) + +BLAKE3_IV_1: + dd 8 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 8 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 8 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 8 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +_RDATA ENDS +END diff --git a/src/linker/third_party_ext/blake3/asm/blake3_avx512_x86-64_unix.S b/src/linker/third_party_ext/blake3/asm/blake3_avx512_x86-64_unix.S new file mode 100644 index 00000000..a06aede0 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_avx512_x86-64_unix.S @@ -0,0 +1,2585 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif /* CET header available */ +#endif /* __ELF__ && __CET__ && __has_include */ + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +#ifdef __APPLE__ +.text +#else +.section .text 
+#endif +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr 
[rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, 
zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 
[r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, 
zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + 
vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + 
vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 
+ vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + 
vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd 
zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + 
vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + 
vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord 
zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord 
zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord 
zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 
2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, 
xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr 
[BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 
+ vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, 
ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, 
ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + 
vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 
+ vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + 
vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord 
ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord 
ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd 
ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 
+ vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + 
vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr 
[r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + 
vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps 
ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr 
[rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + 
vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: /* Compresses one 64-byte block and overwrites the 32-byte state at [rdi] with the result. Inputs as read below: [rdi] = 32-byte state, [rsi] = 64 message bytes, rcx = 64-bit value, dl and r8b = one byte each. Presumably cv, block, counter, block_len and flags from the BLAKE3 C prototype; confirm against the C header. */ + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] /* rows 0-1 (xmm0, xmm1) = the 32 bytes at [rdi] */ + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 /* row 3 (xmm3) dwords = rcx low, rcx high, dl, r8b */ + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* row 2 (xmm2) = the four dwords at BLAKE3_IV */ + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 /* xmm4/xmm5 = even/odd dwords of message bytes 0-31; xmm6/xmm7 = even/odd dwords of bytes 32-63, lane-rotated by 0x93 */ + mov al, 7 /* al = number of passes through the round body at label 9 */ +9: /* round body: add a message register and row 1 into row 0, xor into row 3, rotate right, add into row 2, xor into row 1, rotate right (amounts 16, 12, 8, 7) */ + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 /* rotate the lanes of rows 0, 3 and 2 so the second half of the pass pairs them with row 1 in a different lane alignment */ + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, 
xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 /* inverse lane rotations (0x39, 0x4E, 0x93) put rows 0, 3 and 2 back in their original lane order */ + dec al + jz 9f /* after the last pass, skip the message re-shuffle and finish */ + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 /* xmm4-xmm7 now hold the message dwords rearranged for the next pass */ + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 /* xor rows 2-3 into rows 0-1 */ + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 /* store the 32-byte result over the input state */ + ret + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: /* Same setup and seven-pass round loop as blake3_compress_in_place_avx512, but the result is 64 bytes written to [r9] and [rdi] is only read. */ + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + 
vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 /* output rows 0-1 = rows 0-1 xor rows 2-3 */ + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] /* output rows 2-3 = rows 2-3 xor the 32 state bytes still at [rdi] */ + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 /* 64 bytes stored to [r9]; [rdi] is not written */ + ret + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 /* 64-byte-aligned constant tables; BLAKE3_BLOCK_LEN and BLAKE3_IV are read RIP-relative by the code in this file */ +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/src/linker/third_party_ext/blake3/asm/blake3_avx512_x86-64_windows_msvc.asm b/src/linker/third_party_ext/blake3/asm/blake3_avx512_x86-64_windows_msvc.asm new file mode 100644 index 00000000..b19efbaa --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_avx512_x86-64_windows_msvc.asm @@ -0,0 +1,2634 @@ +public _blake3_hash_many_avx512 +public blake3_hash_many_avx512 +public blake3_compress_in_place_avx512 +public _blake3_compress_in_place_avx512 +public blake3_compress_xof_avx512 +public _blake3_compress_xof_avx512 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 
+blake3_hash_many_avx512 PROC +_blake3_hash_many_avx512 PROC + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+90H], xmm6 + vmovdqa xmmword ptr [rsp+0A0H], xmm7 + vmovdqa xmmword ptr [rsp+0B0H], xmm8 + vmovdqa xmmword ptr [rsp+0C0H], xmm9 + vmovdqa xmmword ptr [rsp+0D0H], xmm10 + vmovdqa xmmword ptr [rsp+0E0H], xmm11 + vmovdqa xmmword ptr [rsp+0F0H], xmm12 + vmovdqa xmmword ptr [rsp+100H], xmm13 + vmovdqa xmmword ptr [rsp+110H], xmm14 + vmovdqa xmmword ptr [rsp+120H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] + vpcmpud k2, ymm2, ymm0, 1 + vpcmpud k3, ymm3, ymm0, 1 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
+ vpbroadcastd ymm6, dword ptr [ADD1] + vpaddd ymm4 {k2}, ymm4, ymm6 + vpaddd ymm5 {k3}, ymm5, ymm6 + ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} + ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+20H], ymm3 + vmovdqa ymmword ptr [rsp+40H], ymm4 + vmovdqa ymmword ptr [rsp+60H], ymm5 + shl rdx, 6 + mov qword ptr [rsp+80H], rdx + cmp rsi, 16 + jc final15blocks +outerloop16: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+1H*4H] + vpbroadcastd zmm2, dword ptr [rcx+2H*4H] + vpbroadcastd zmm3, dword ptr [rcx+3H*4H] + vpbroadcastd zmm4, dword ptr [rcx+4H*4H] + vpbroadcastd zmm5, dword ptr [rcx+5H*4H] + vpbroadcastd zmm6, dword ptr [rcx+6H*4H] + vpbroadcastd zmm7, dword ptr [rcx+7H*4H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop16: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword 
ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0] + vmovdqa32 zmm31, zmmword ptr [INDEX1] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm8, zmm24, zmm25 
+ vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 
+ vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd zmm15, dword ptr [rsp+22H*4H] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, 
zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 
16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, 
zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord 
zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd 
zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + 
vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + 
vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord 
zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + 
vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + 
vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop16 + mov rbx, qword ptr [rbp+90H] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, 
zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 88H + vshufi32x4 zmm17, zmm1, zmm5, 88H + vshufi32x4 zmm18, zmm2, zmm6, 88H + vshufi32x4 zmm19, zmm3, zmm7, 88H + vshufi32x4 zmm20, zmm0, zmm4, 0DDH + vshufi32x4 zmm21, zmm1, zmm5, 0DDH + vshufi32x4 zmm22, zmm2, zmm6, 0DDH + vshufi32x4 zmm23, zmm3, zmm7, 0DDH + vshufi32x4 zmm0, zmm16, zmm17, 88H + vshufi32x4 zmm1, zmm18, zmm19, 88H + vshufi32x4 zmm2, zmm20, zmm21, 88H + vshufi32x4 zmm3, zmm22, zmm23, 88H + vshufi32x4 zmm4, zmm16, zmm17, 0DDH + vshufi32x4 zmm5, zmm18, zmm19, 0DDH + vshufi32x4 zmm6, zmm20, zmm21, 0DDH + vshufi32x4 zmm7, zmm22, zmm23, 0DDH + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 + vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 + vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 + vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 + vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 + vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 + vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] + vmovdqa32 zmm2, zmm0 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
+ vpbroadcastd zmm4, dword ptr [ADD16] + vpbroadcastd zmm5, dword ptr [ADD1] + vpaddd zmm2{k1}, zmm0, zmm4 + ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} + vpcmpud k2, zmm2, zmm0, 1 + vpaddd zmm1 {k2}, zmm1, zmm5 + ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+90H], rbx + sub rsi, 16 + cmp rsi, 16 + jnc outerloop16 + test rsi, rsi + jne final15blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+90H] + vmovdqa xmm7, xmmword ptr [rsp+0A0H] + vmovdqa xmm8, xmmword ptr [rsp+0B0H] + vmovdqa xmm9, xmmword ptr [rsp+0C0H] + vmovdqa xmm10, xmmword ptr [rsp+0D0H] + vmovdqa xmm11, xmmword ptr [rsp+0E0H] + vmovdqa xmm12, xmmword ptr [rsp+0F0H] + vmovdqa xmm13, xmmword ptr [rsp+100H] + vmovdqa xmm14, xmmword ptr [rsp+110H] + vmovdqa xmm15, xmmword ptr [rsp+120H] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final15blocks: + test esi, 8H + je final7blocks + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, 
xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword 
ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+40H] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd ymm15, dword ptr [rsp+88H] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, 
ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, 
ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord 
ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd 
ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord 
ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + 
vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, 
ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, 
ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd 
ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, 
ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, 
ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+40H] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+40H], ymm2 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + add rdi, 64 + sub rsi, 8 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+80H] + movzx r13, byte ptr [rbp+78H] + movzx r12, byte ptr [rbp+88H] + test esi, 4H + je final3blocks + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+40H] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0DCH + vpermq ymm15, ymm15, 0DCH + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] + vinserti64x4 zmm13, zmm14, ymm15, 01H + mov eax, 17476 + 
kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+22H*4H] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-30H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-20H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-10H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 93H + vpshufd zmm7, zmm7, 93H + mov al, 7 +roundloop4: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord 
zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 93H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 39H + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 39H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 93H + dec al + jz endroundloop4 + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0FH + vpshufd zmm4, zmm8, 39H + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 78H + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 1EH + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp roundloop4 +endroundloop4: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H + vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H + vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H + vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test esi, 2H + je final1block + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 + vpinsrd xmm13, xmm13, dword ptr 
[BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+4H] + vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+88H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 
4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 
; ---------------------------------------------------------------------------
; blake3_compress_in_place_avx512
;
; Compresses one 64-byte block into the chaining value and writes the new
; eight-word chaining value back over the old one.
;
; Register use below is consistent with the Microsoft x64 calling convention
; (NOTE(review): assumed from the rcx/rdx/r8/r9 + stack-slot pattern):
;   rcx        -> cv      (8 x 32-bit words, read and then overwritten)
;   rdx        -> block   (64 bytes, read with unaligned loads)
;   r8b         = block_len
;   r9          = counter (64-bit)
;   [rsp+70H]   = flags   (byte read after the 72-byte frame is allocated)
;
; xmm6-xmm9 are spilled to the local frame on entry and reloaded before ret.
; ---------------------------------------------------------------------------
ALIGN 16
blake3_compress_in_place_avx512 PROC
_blake3_compress_in_place_avx512 PROC
        sub     rsp, 72
        ; Save the four XMM registers this routine overwrites.
        vmovdqa xmmword ptr [rsp], xmm6
        vmovdqa xmmword ptr [rsp+10H], xmm7
        vmovdqa xmmword ptr [rsp+20H], xmm8
        vmovdqa xmmword ptr [rsp+30H], xmm9
        ; State rows 0 and 1 = the incoming chaining value.
        vmovdqu xmm0, xmmword ptr [rcx]
        vmovdqu xmm1, xmmword ptr [rcx+10H]
        ; Build row 3 = { counter_lo, counter_hi, block_len, flags }.
        movzx   eax, byte ptr [rsp+70H]
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax
        vmovq   xmm3, r9
        vmovq   xmm4, r8
        vpunpcklqdq xmm3, xmm3, xmm4
        ; Row 2 = the four-word BLAKE3_IV constant.
        vmovaps xmm2, xmmword ptr [BLAKE3_IV]
        ; Load the 16 message words and split them into even-indexed (imm 136)
        ; and odd-indexed (imm 221) words: xmm4/xmm5 from the first 32 bytes,
        ; xmm6/xmm7 from the last 32 bytes (the latter two are lane-rotated).
        vmovups xmm8, xmmword ptr [rdx]
        vmovups xmm9, xmmword ptr [rdx+10H]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        vmovups xmm8, xmmword ptr [rdx+20H]
        vmovups xmm9, xmmword ptr [rdx+30H]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        vpshufd xmm6, xmm6, 93H
        vpshufd xmm7, xmm7, 93H
        ; al counts the rounds: the loop body below runs 7 times.
        mov     al, 7
@@:
        ; First half of the round: add message words (xmm4, then xmm5) and mix
        ; the rows with rotations by 16, 12, 8 and 7 bits.
        vpaddd  xmm0, xmm0, xmm4
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm5
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Rotate the lanes of rows 0, 3 and 2 before the second half.
        vpshufd xmm0, xmm0, 93H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 39H
        ; Second half of the round: same mixing with xmm6, then xmm7.
        vpaddd  xmm0, xmm0, xmm6
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Undo the lane rotation applied above.
        vpshufd xmm0, xmm0, 39H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 93H
        dec     al
        jz      @F
        ; Not the last round: reshuffle the message words held in xmm4-xmm7
        ; for the next round (xmm8/xmm9 are scratch).
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0FH
        vpshufd xmm4, xmm8, 39H
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0AAH
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 88H
        vpshufd xmm8, xmm8, 78H
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 1EH
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp     @B
@@:
        ; New chaining value = (row0 ^ row2, row1 ^ row3), stored over cv.
        vpxor   xmm0, xmm0, xmm2
        vpxor   xmm1, xmm1, xmm3
        vmovdqu xmmword ptr [rcx], xmm0
        vmovdqu xmmword ptr [rcx+10H], xmm1
        ; Restore the saved XMM registers and release the frame.
        vmovdqa xmm6, xmmword ptr [rsp]
        vmovdqa xmm7, xmmword ptr [rsp+10H]
        vmovdqa xmm8, xmmword ptr [rsp+20H]
        vmovdqa xmm9, xmmword ptr [rsp+30H]
        add     rsp, 72
        ret
_blake3_compress_in_place_avx512 ENDP
blake3_compress_in_place_avx512 ENDP
; ---------------------------------------------------------------------------
; blake3_compress_xof_avx512
;
; Same compression as the in-place variant, but the input chaining value is
; left untouched and the full 64-byte state is written to a separate output
; buffer.
;
; Register use (NOTE(review): Microsoft x64 convention assumed):
;   rcx        -> cv      (read only)
;   rdx        -> block   (64 bytes)
;   r8b         = block_len
;   r9          = counter
;   [rsp+70H]   = flags   (byte, read after the frame is allocated)
;   [rsp+78H]  -> out     (64-byte destination, kept in r10)
; ---------------------------------------------------------------------------
ALIGN 16
blake3_compress_xof_avx512 PROC
_blake3_compress_xof_avx512 PROC
        sub     rsp, 72
        ; Save the four XMM registers this routine overwrites.
        vmovdqa xmmword ptr [rsp], xmm6
        vmovdqa xmmword ptr [rsp+10H], xmm7
        vmovdqa xmmword ptr [rsp+20H], xmm8
        vmovdqa xmmword ptr [rsp+30H], xmm9
        ; State rows 0 and 1 = the incoming chaining value.
        vmovdqu xmm0, xmmword ptr [rcx]
        vmovdqu xmm1, xmmword ptr [rcx+10H]
        ; Row 3 = { counter_lo, counter_hi, block_len, flags }; r10 = out.
        movzx   eax, byte ptr [rsp+70H]
        movzx   r8d, r8b
        mov     r10, qword ptr [rsp+78H]
        shl     rax, 32
        add     r8, rax
        vmovq   xmm3, r9
        vmovq   xmm4, r8
        vpunpcklqdq xmm3, xmm3, xmm4
        ; Row 2 = the four-word BLAKE3_IV constant.
        vmovaps xmm2, xmmword ptr [BLAKE3_IV]
        ; Split the 16 message words into even/odd-indexed vectors
        ; (xmm4/xmm5 from bytes 0-31, xmm6/xmm7 from bytes 32-63, rotated).
        vmovups xmm8, xmmword ptr [rdx]
        vmovups xmm9, xmmword ptr [rdx+10H]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        vmovups xmm8, xmmword ptr [rdx+20H]
        vmovups xmm9, xmmword ptr [rdx+30H]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        vpshufd xmm6, xmm6, 93H
        vpshufd xmm7, xmm7, 93H
        ; al counts the rounds: the loop body below runs 7 times.
        mov     al, 7
@@:
        ; First half of the round (message vectors xmm4, xmm5).
        vpaddd  xmm0, xmm0, xmm4
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm5
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Rotate the lanes of rows 0, 3 and 2 before the second half.
        vpshufd xmm0, xmm0, 93H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 39H
        ; Second half of the round (message vectors xmm6, xmm7).
        vpaddd  xmm0, xmm0, xmm6
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Undo the lane rotation applied above.
        vpshufd xmm0, xmm0, 39H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 93H
        dec     al
        jz      @F
        ; Not the last round: reshuffle the message words in xmm4-xmm7
        ; for the next round (xmm8/xmm9 are scratch).
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0FH
        vpshufd xmm4, xmm8, 39H
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0AAH
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 88H
        vpshufd xmm8, xmm8, 78H
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 1EH
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp     @B
@@:
        ; out[0..31]  = (row0 ^ row2, row1 ^ row3)
        ; out[32..63] = (row2 ^ cv[0..3], row3 ^ cv[4..7])
        vpxor   xmm0, xmm0, xmm2
        vpxor   xmm1, xmm1, xmm3
        vpxor   xmm2, xmm2, xmmword ptr [rcx]
        vpxor   xmm3, xmm3, xmmword ptr [rcx+10H]
        vmovdqu xmmword ptr [r10], xmm0
        vmovdqu xmmword ptr [r10+10H], xmm1
        vmovdqu xmmword ptr [r10+20H], xmm2
        vmovdqu xmmword ptr [r10+30H], xmm3
        ; Restore the saved XMM registers and release the frame.
        vmovdqa xmm6, xmmword ptr [rsp]
        vmovdqa xmm7, xmmword ptr [rsp+10H]
        vmovdqa xmm8, xmmword ptr [rsp+20H]
        vmovdqa xmm9, xmmword ptr [rsp+30H]
        add     rsp, 72
        ret
_blake3_compress_xof_avx512 ENDP
blake3_compress_xof_avx512 ENDP

_TEXT ENDS

; Read-only constants referenced by the routines in this file.
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN 64
; Dword index tables (two 16-entry tables).
INDEX0:
        dd 0, 1, 2, 3, 16, 17, 18, 19
        dd 8, 9, 10, 11, 24, 25, 26, 27
INDEX1:
        dd 4, 5, 6, 7, 20, 21, 22, 23
        dd 12, 13, 14, 15, 28, 29, 30, 31
; Per-lane offsets 0..15 and the scalar increments 1 and 16.
ADD0:
        dd 0, 1, 2, 3, 4, 5, 6, 7
        dd 8, 9, 10, 11, 12, 13, 14, 15
ADD1:
        dd 1
ADD16:
        dd 16
; Block length in bytes, loaded as a dword by the hashing loops.
BLAKE3_BLOCK_LEN:
        dd 64
ALIGN 64
; The four IV words loaded as one 128-bit vector via [BLAKE3_IV].
BLAKE3_IV:
BLAKE3_IV_0:
        dd 06A09E667H
BLAKE3_IV_1:
        dd 0BB67AE85H
BLAKE3_IV_2:
        dd 03C6EF372H
BLAKE3_IV_3:
        dd 0A54FF53AH

_RDATA ENDS
END
// Evaluates its argument and discards the result; used to silence
// "unused variable" warnings when every user of a local is compiled out.
#define MAYBE_UNUSED(x) (void)((x))

#if defined(IS_X86)
// Executes XGETBV with ECX = 0 and returns the EDX:EAX pair as one 64-bit
// value. With ECX = 0 the instruction reads XCR0 (see the Intel SDM).
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}

// Runs CPUID for leaf `id` and stores EAX, EBX, ECX, EDX in out[0..3].
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  // 32-bit path: EBX is parked in a scratch register around CPUID and swapped
  // back afterwards, so EBX itself is never named as an output operand.
  // NOTE(review): presumably because EBX can be reserved (e.g. as the PIC
  // register) on i386 -- confirm.
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}

// Same as cpuid(), but also passes sub-leaf `sid` in ECX.
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  // Same EBX save/restore dance as in cpuid() above.
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}

#endif
// Bit flags describing the SIMD instruction sets available at run time.
// UNDEFINED is a sentinel meaning "detection has not run yet".
enum cpu_feature {
  SSE2 = 1 << 0,
  SSSE3 = 1 << 1,
  SSE41 = 1 << 2,
  AVX = 1 << 3,
  AVX2 = 1 << 4,
  AVX512F = 1 << 5,
  AVX512VL = 1 << 6,
  /* ... */
  UNDEFINED = 1 << 30
};

// Cached detection result; starts as UNDEFINED and is filled in by the first
// successful x86 detection.
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
    volatile int g_cpu_features = UNDEFINED;

// Returns the set of cpu_feature flags for the current CPU.
//
// On x86 the result is computed once from CPUID/XGETBV and cached in
// g_cpu_features; later calls return the cached value. On every other target
// the function returns 0 (no features) and caches nothing.
#if !defined(BLAKE3_TESTING)
static
#endif
    enum cpu_feature
    get_cpu_features(void) {

  /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
  long features = g_cpu_features;
  if (features != UNDEFINED) {
    return (enum cpu_feature)features;
  } else {
#if defined(IS_X86)
    uint32_t regs[4] = {0};
    // Named views onto the four CPUID result registers.
    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
    (void)edx;
    features = 0;
    // Leaf 0: EAX holds the highest supported standard leaf.
    cpuid(regs, 0);
    const int max_id = *eax;
    // Leaf 1: baseline feature bits.
    cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
    // SSE2 is taken for granted on 64-bit x86 builds.
    features |= SSE2;
#else
    if (*edx & (1UL << 26))
      features |= SSE2;
#endif
    if (*ecx & (1UL << 9))
      features |= SSSE3;
    if (*ecx & (1UL << 19))
      features |= SSE41;

    if (*ecx & (1UL << 27)) { // OSXSAVE
      const uint64_t mask = xgetbv();
      if ((mask & 6) == 6) { // SSE and AVX states
        if (*ecx & (1UL << 28))
          features |= AVX;
        if (max_id >= 7) {
          // Leaf 7, sub-leaf 0: extended feature bits.
          cpuidex(regs, 7, 0);
          if (*ebx & (1UL << 5))
            features |= AVX2;
          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
            if (*ebx & (1UL << 31))
              features |= AVX512VL;
            if (*ebx & (1UL << 16))
              features |= AVX512F;
          }
        }
      }
    }
    g_cpu_features = features;
    return (enum cpu_feature)features;
#else
    /* How to detect NEON? */
    return 0;
#endif
  }
}
*/ + return 0; +#endif + } +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); + return; + } +#endif +#endif + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); + return; + } +#endif +#endif + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + 
// Hash `num_inputs` inputs of `blocks` blocks each, writing the results to
// `out`. Picks the widest backend that is compiled in and supported by the
// CPU (AVX-512, then AVX2, SSE4.1, SSE2), then NEON when enabled, and
// finally the portable implementation.
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  // The AVX-512 path needs both the F and VL extensions.
  if ((cpu & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
  return;
#endif

  // No SIMD backend selected.
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
}
// Number of inputs the selected blake3_hash_many backend handles in parallel
// on this machine: 16 for AVX-512 (F+VL), 8 for AVX2, 4 for SSE4.1/SSE2 or
// NEON, and 1 when only the portable code is available.
size_t blake3_simd_degree(void) {
  // Start from the portable answer and let each better backend override it;
  // the checks run from lowest to highest priority so the last match wins.
  size_t degree = 1;
#if BLAKE3_USE_NEON == 1
  degree = 4;
#endif
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    degree = 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    degree = 4;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    degree = 8;
  }
#endif
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
    degree = 16;
  }
#endif
#endif
  return degree;
}
// INLINE: force-inlined, file-local function qualifier for the supported
// compilers (MSVC spelling vs. GCC/Clang attribute).
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif

// Architecture detection from the compilers' predefined macros.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif

// MSVC declares the bit-scan and CPUID intrinsics used on x86 in <intrin.h>.
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#endif

#if !defined(BLAKE3_USE_NEON)
  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
  #if defined(IS_AARCH64)
    #if defined(__ARM_BIG_ENDIAN)
      #define BLAKE3_USE_NEON 0
    #else
      #define BLAKE3_USE_NEON 1
    #endif
  #else
    #define BLAKE3_USE_NEON 0
  #endif
#endif

// Compile-time upper bound on the SIMD degree for this target.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)

// Eight 32-bit initialization constants.
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
                               0x1F83D9ABUL, 0x5BE0CD19UL};

// MSG_SCHEDULE[r][i] is the index of the message word consumed at position i
// of round r (7 rounds, 16 words each).
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
/* Find the index (0..63) of the highest set bit of x.
 *
 * Callers are expected to pass a nonzero x. A zero argument returns 0 on
 * every code path: the compiler builtin and the MSVC bit-scan intrinsics
 * leave their result undefined for zero, so it is handled up front to match
 * what the portable fallback already produces. */
static unsigned int highest_one(uint64_t x) {
  if (x == 0) {
    return 0;
  }
#if defined(__GNUC__) || defined(__clang__)
  /* clz counts leading zeros; XOR with 63 turns that into a bit index. */
  return 63 ^ (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* No 64-bit scan on 32-bit MSVC: scan the upper half first. */
  if(x >> 32) {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)(x >> 32));
    return 32 + index;
  } else {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)x);
    return index;
  }
#else
  /* Portable binary search over halves of decreasing width. */
  unsigned int c = 0;
  if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  if(x & 0x000000000000ff00ULL) { x >>=  8; c +=  8; }
  if(x & 0x00000000000000f0ULL) { x >>=  4; c +=  4; }
  if(x & 0x000000000000000cULL) { x >>=  2; c +=  2; }
  if(x & 0x0000000000000002ULL) {           c +=  1; }
  return c;
#endif
}
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) { + return 1ULL << highest_one(x | 1); +} + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } + +INLINE uint32_t counter_high(uint64_t counter) { + return (uint32_t)(counter >> 32); +} + +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, 
uint8_t flags_end, uint8_t *out); + +size_t blake3_simd_degree(void); + + +// Declarations for implementation-specific functions. +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#if defined(IS_X86) +#if !defined(BLAKE3_NO_SSE2) +void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_SSE41) +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX2) +void blake3_hash_many_avx2(const uint8_t *const *inputs, 
size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX512) +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#endif + +#if BLAKE3_USE_NEON == 1 +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + + +#endif /* BLAKE3_IMPL_H */ diff --git a/src/linker/third_party_ext/blake3/asm/blake3_neon.c b/src/linker/third_party_ext/blake3/asm/blake3_neon.c new file mode 100644 index 00000000..8a818fc7 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_neon.c @@ -0,0 +1,368 @@ +#include "blake3_impl.h" + +#include + +#ifdef __ARM_BIG_ENDIAN +#error "This implementation only supports little-endian ARM." +// It might be that all we need for big-endian support here is to get the loads +// and stores right, but step zero would be finding a way to test it in CI. +#endif + +INLINE uint32x4_t loadu_128(const uint8_t src[16]) { + // vld1q_u32 has alignment requirements. Don't use it. + uint32x4_t x; + memcpy(&x, src, 16); + return x; +} + +INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { + // vst1q_u32 has alignment requirements. Don't use it. 
+ memcpy(dest, &src, 16); +} + +INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { + return vaddq_u32(a, b); +} + +INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { + return veorq_u32(a, b); +} + +INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } + +INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + uint32_t array[4] = {a, b, c, d}; + return vld1q_u32(array); +} + +INLINE uint32x4_t rot16_128(uint32x4_t x) { + // The straightfoward implementation would be two shifts and an or, but that's + // slower on microarchitectures we've tested. See + // https://github.com/BLAKE3-team/BLAKE3/pull/319. + // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); +} + +INLINE uint32x4_t rot12_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); + return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); +} + +INLINE uint32x4_t rot8_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); +#if defined(__clang__) + return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); +#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 + static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12}; + return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8)); +#else + return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8); +#endif +} + +INLINE uint32x4_t rot7_128(uint32x4_t x) { + // See comment in rot16_128. 
// One full round over the 16-vector state `v`, using message vectors `m`
// selected through row `r` of MSG_SCHEDULE. Every statement operates on a
// whole uint32x4_t, so the four lanes are processed in lockstep
// (presumably one lane per input being hashed -- confirm against the caller).
//
// The first half mixes the "columns" (v[i], v[i+4], v[i+8], v[i+12]); the
// second half mixes the "diagonals" (v[0],v[5],v[10],v[15] and so on). Each
// half is add / xor / rotate with rotation amounts 16, 12, 8, 7.
// Statement order is significant; do not reorder.
INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
  // --- Column step, first message words (schedule positions 0,2,4,6) ---
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[15] = rot16_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot12_128(v[4]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  // --- Column step, second message words (schedule positions 1,3,5,7) ---
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[15] = rot8_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot7_128(v[4]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);

  // --- Diagonal step, first message words (schedule positions 8,10,12,14) ---
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot16_128(v[15]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[4] = rot12_128(v[4]);
  // --- Diagonal step, second message words (schedule positions 9,11,13,15) ---
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot8_128(v[15]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  v[4] = rot7_128(v[4]);
}
v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { + // Individually transpose the four 2x2 sub-matrices in each corner. + uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); + uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); + + // Swap the top-right and bottom-left 2x2s (which just got transposed). + vecs[0] = + vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); + vecs[1] = + vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); + vecs[2] = + vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); + vecs[3] = + vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, uint32x4_t out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); + out[14] = 
loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + uint32x4_t *out_low, uint32x4_t *out_high) { + uint64_t mask = (increment_counter ? ~0 : 0); + *out_low = set4( + counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); + *out_high = set4( + counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); +} + +void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + uint32x4_t h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + uint32x4_t counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + uint32x4_t block_flags_vec = set1_128(block_flags); + uint32x4_t msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + uint32x4_t v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, 
msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_neon + * ---------------------------------------------------------------------------- + */ + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +INLINE void hash_one_neon(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, + uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + // TODO: Implement compress_neon. 
However note that according to + // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, + // compress_neon might not be any faster than compress_portable. + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 4) { + blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/src/linker/third_party_ext/blake3/asm/blake3_portable.c b/src/linker/third_party_ext/blake3/asm/blake3_portable.c new file mode 100644 index 00000000..062dd1b4 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_portable.c @@ -0,0 +1,160 @@ +#include "blake3_impl.h" +#include <string.h> + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { + return (w >> c) | (w << (32 - c)); +} + +INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) { + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const 
uint32_t *msg, size_t round) { + // Select the message schedule based on the round. + const uint8_t *schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, 
&block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + 
while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/src/linker/third_party_ext/blake3/asm/blake3_sse2_x86-64_unix.S b/src/linker/third_party_ext/blake3/asm/blake3_sse2_x86-64_unix.S new file mode 100644 index 00000000..99f033fe --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_sse2_x86-64_unix.S @@ -0,0 +1,2291 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa 
xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, 
xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr 
[rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + 
movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 
25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor 
xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + 
pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + 
psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, 
xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + 
paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + 
por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + 
pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, 
xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw 
xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 
+ pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + 
paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, 
xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld 
xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + 
punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 
+ paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr 
[PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: 
+ mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, 
xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: /* Compress one 64-byte block into a 32-byte state, in place. As read by the code below: [rdi] = 32-byte state (loaded, then overwritten with the result), [rsi] = 64-byte block, rcx/rdx/r8 = three integers packed into xmm3. NOTE(review): presumably the System V arguments (cv, block, block_len, counter, flags) in that register order - confirm against the C prototype. */ + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] /* xmm0:xmm1 = the 32 bytes at [rdi] */ + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = the 16 bytes stored at BLAKE3_IV */ + shl r8, 32 + add rdx, r8 /* rdx = rdx plus (r8 shifted left by 32). NOTE(review): unlike blake3_compress_xof_sse2 there is no movzx here, so set bits above the low byte of rdx or r8 can flow into xmm3 - assumes the caller passes them zero-extended; confirm. */ + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 /* xmm3 = rcx in the low qword, the packed rdx in the high qword */ + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 /* xmm4 = even-indexed dwords of block bytes 0..31 */ + shufps xmm8, xmm5, 221 /* xmm8 = odd-indexed dwords of block bytes 0..31 */ + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 /* xmm6 = even-indexed dwords of block bytes 32..63, lanes rotated */ + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* xmm7 = odd-indexed dwords of block bytes 32..63, lanes rotated */ + mov al, 7 /* al = pass counter: the 9: loop below executes 7 times */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 /* with the pshuflw before it: swap the 16-bit halves of every dword, i.e. rotate each dword by 16 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* xmm1 = every dword rotated right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 /* xmm3 = every dword rotated right by 8; the two shifted copies share no bits, so pxor merges them like por */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* xmm1 = every dword rotated right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* these three pshufd rotate the dword lanes of xmm0, xmm3 and xmm2 by different amounts before the second half of the pass */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, 
xmm14 /* completes the pxor xmm3, xmm14 begun on the previous line: xmm3 = every dword rotated right by 8 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse of the earlier lane rotations: xmm0, xmm3 and xmm2 are back in their original lane order */ + dec al + jz 9f /* leave after the 7th pass; otherwise rearrange the message registers for the next pass */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 /* pand, pand, por = merge dwords 0 and 2 of xmm9 with dwords 1 and 3 of xmm8. NOTE(review): the mask names suggest this stands in for SSE4.1 pblendw - confirm. */ + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4..xmm7 have all been rewritten with the rearranged block words */ + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 /* fold: xmm0 ^= xmm2, xmm1 ^= xmm3 */ + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 /* the 32-byte result overwrites the state at [rdi] */ + ret + +.p2align 6 +blake3_compress_xof_sse2: +_blake3_compress_xof_sse2: /* Same setup and 7-pass loop as blake3_compress_in_place_sse2, but the finish (after the last 9: label of this routine) stores 64 bytes to [r9] and does not store through rdi. NOTE(review): presumably r9 is the sixth System V argument, the output buffer - confirm against the C prototype. */ + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl /* keep only the low byte of r8 and of rdx before packing them */ + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 /* xmm3 = rcx in the low qword, the packed rdx in the high qword */ + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 /* al = pass counter: the 9: loop below executes 7 times */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 /* rotate each dword of xmm3 by 16 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* xmm1 = every dword rotated right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 /* xmm3 = every dword rotated right by 8 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + 
movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* xmm1 = every dword rotated right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* rotate the dword lanes of xmm0, xmm3 and xmm2 by different amounts before the second half of the pass */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 /* rotate each dword of xmm3 by 16 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* xmm1 = every dword rotated right by 12 */ + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 /* xmm3 = every dword rotated right by 8 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* xmm1 = every dword rotated right by 7 */ + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse lane rotations: xmm0, xmm3 and xmm2 are back in their original lane order */ + dec al + jz 9f /* leave after the 7th pass; otherwise rearrange the message registers for the next pass */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 /* merge dwords 0 and 2 of xmm9 with dwords 1 and 3 of xmm8 */ + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 /* merge dwords 0..2 of xmm8 with dword 3 of xmm10 */ + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4..xmm7 have all been rewritten with the rearranged block words */ + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] /* xmm4:xmm5 = the 32 bytes at [rdi], loaded again */ + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 /* xmm0 ^= xmm2 and xmm1 ^= xmm3 first, then xmm2 and xmm3 are xored with the bytes reloaded from [rdi] */ + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 /* 64 bytes of output stored at [r9] */ + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: /* four 32-bit constants; the compress routines load all 16 bytes into xmm2 */ + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: /* dword lane indices 0..3. NOTE(review): presumably the per-lane counter offsets of the 4-way hash_many path - confirm. */ + .long 0, 1, 2, 3 +ADD1: /* the value 4 in every dword lane. NOTE(review): presumably the counter step per group of 4 inputs - confirm. */ + .long 4, 4, 4, 4 +BLAKE3_IV_0: /* first BLAKE3_IV word repeated in all four dword lanes */ + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: /* second BLAKE3_IV word repeated in all four dword lanes */ + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: /* third BLAKE3_IV word repeated in all four dword lanes */ + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 
0x3C6EF372 +BLAKE3_IV_3: /* fourth BLAKE3_IV word repeated in all four dword lanes */ + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: /* the value 64 in every dword lane */ + .long 64, 64, 64, 64 +CMP_MSB_MASK: /* sign bit of every dword; hash_many xors both operands with it before pcmpgtd so that the signed compare orders them as unsigned values */ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: /* all-ones in dwords 0 and 2 */ + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: /* all-ones in dwords 1 and 3 */ + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: /* all-ones in dwords 0, 1 and 2 */ + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: /* all-ones in dword 3 only */ + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/src/linker/third_party_ext/blake3/asm/blake3_sse2_x86-64_windows_msvc.asm b/src/linker/third_party_ext/blake3/asm/blake3_sse2_x86-64_windows_msvc.asm new file mode 100644 index 00000000..507502f1 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_sse2_x86-64_windows_msvc.asm @@ -0,0 +1,2350 @@ +public _blake3_hash_many_sse2 +public blake3_hash_many_sse2 +public blake3_compress_in_place_sse2 +public _blake3_compress_in_place_sse2 +public blake3_compress_xof_sse2 +public _blake3_compress_xof_sse2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse2 PROC +_blake3_hash_many_sse2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, 
r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, 
xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd 
xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 
+ paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + 
por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword 
ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, 
xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 
24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 
24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd 
xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd 
xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 
12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld 
xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd 
xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, 
xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw 
xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + 
paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, 
xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr 
[rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + movd xmm13, dword ptr [rsp+124H] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + shl rax, 20H + or rax, 40H + movd xmm3, rax + movdqa xmmword ptr [rsp+20H], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + punpcklqdq xmm3, xmmword ptr [rsp+20H] + punpcklqdq xmm11, xmmword ptr [rsp+20H] + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, 
xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, 
xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm12, xmm13 + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+30H], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+30H] + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + mov eax, dword ptr [rsp+130H] + neg eax + mov r10d, dword ptr [rsp+110H+8*rax] + mov r11d, dword ptr [rsp+120H+8*rax] + mov dword ptr [rsp+110H], r10d + mov dword ptr [rsp+120H], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, 
xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + 
pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm10 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse2 ENDP +blake3_hash_many_sse2 ENDP + +blake3_compress_in_place_sse2 PROC +_blake3_compress_in_place_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, 
xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse2 ENDP +blake3_compress_in_place_sse2 ENDP + +ALIGN 16 +blake3_compress_xof_sse2 PROC +_blake3_compress_xof_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa 
xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, 
xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse2 ENDP +blake3_compress_xof_sse2 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +PBLENDW_0x33_MASK: + dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H +PBLENDW_0xCC_MASK: + dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH +PBLENDW_0x3F_MASK: + dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H +PBLENDW_0xC0_MASK: + dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH + +_RDATA ENDS +END diff --git a/src/linker/third_party_ext/blake3/asm/blake3_sse41_x86-64_unix.S b/src/linker/third_party_ext/blake3/asm/blake3_sse41_x86-64_unix.S new 
file mode 100644 index 00000000..a3ff6426 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_sse41_x86-64_unix.S @@ -0,0 +1,2028 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR /* fallback: expands to nothing when no earlier definition exists */ +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr 
[rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr 
[r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, 
xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + 
paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, 
xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, 
xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, 
xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa 
xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por 
xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld 
xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword 
ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 
+ pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb 
xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd 
xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, 
xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + 
punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + 
movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, 
xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] 
+ blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd 
xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + /* blake3_compress_in_place_sse41: loads 32 bytes of state from [rdi] and a 64-byte block from [rsi], runs the 7-pass mixing loop below, and stores the 32-byte result back to [rdi]. NOTE(review): presumably rdi = chaining value, rsi = block, rdx = block length, rcx = counter, r8 = flags, following the BLAKE3 C prototype; confirm against the C declaration. */ +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] /* xmm0, xmm1 = the 32 input state bytes at [rdi] */ + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 /* rdx = rdx plus (r8 shifted left 32). NOTE(review): unlike blake3_compress_xof_sse41, rdx and r8 are not zero-extended from their low bytes first; confirm every caller passes clean upper bits */ + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 /* xmm3 = rcx in the low qword, combined rdx in the high qword */ + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* xmm4..xmm7 now hold the 16 block dwords: shufps 136 picks even dwords, 221 picks odd dwords, pshufd 0x93 rotates the lanes of the second pair */ + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 /* pass counter: the loop at 9: executes 7 times */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 /* byte shuffle through the ROT16 table */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* each dword of xmm1 rotated right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 /* byte shuffle through the ROT8 table */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* each dword of xmm1 rotated right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + 
paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* undo the lane rotation applied to xmm0, xmm3, xmm2 in the middle of the pass */ + dec al + jz 9f /* after the 7th pass, skip the reshuffle below and finish */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4..xmm7 now hold the reshuffled block dwords for the next pass */ + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 /* fold xmm2, xmm3 into xmm0, xmm1 */ + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 /* overwrite the 32 bytes at [rdi] with the result */ + ret + /* blake3_compress_xof_sse41: same setup and 7-pass loop as blake3_compress_in_place_sse41, but [rdi] is only read and 64 bytes of output are stored to [r9]. NOTE(review): presumably r9 = output buffer per the BLAKE3 C prototype; confirm against the C declaration. */ +.p2align 6 +blake3_compress_xof_sse41: +_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax /* rdx = zero-extended dl in the low dword, zero-extended r8b in the high dword */ + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 /* pass counter: the loop at 9: executes 7 times */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, 
xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f /* after the 7th pass, skip the reshuffle below and finish */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] /* reload the original 32 bytes from [rdi] */ + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 /* first 32 output bytes: xmm0, xmm1 folded with xmm2, xmm3; last 32: xmm2, xmm3 folded with the original [rdi] bytes */ + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + /* Read-only constant tables referenced through RIP-relative operands by the code above. */ +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 /* pshufb mask: rotates each 32-bit lane right by 16 bits */ +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 /* pshufb mask: rotates each 32-bit lane right by 8 bits */ +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 /* BLAKE3_IV_0..BLAKE3_IV_3: each dword of BLAKE3_IV repeated across all four lanes */ +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/src/linker/third_party_ext/blake3/asm/blake3_sse41_x86-64_windows_msvc.asm 
b/src/linker/third_party_ext/blake3/asm/blake3_sse41_x86-64_windows_msvc.asm new file mode 100644 index 00000000..8966c7b8 --- /dev/null +++ b/src/linker/third_party_ext/blake3/asm/blake3_sse41_x86-64_windows_msvc.asm @@ -0,0 +1,2089 @@ +public _blake3_hash_many_sse41 +public blake3_hash_many_sse41 +public blake3_compress_in_place_sse41 +public _blake3_compress_in_place_sse41 +public blake3_compress_xof_sse41 +public _blake3_compress_xof_sse41 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse41 PROC +_blake3_hash_many_sse41 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, 
xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + 
punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, 
xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por 
xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword 
ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, 
xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + 
pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, 
xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 
+ movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + 
pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 
7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + 
paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, 
xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 
+ pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + 
pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + 
punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + pinsrd xmm14, dword ptr [rsp+124H], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + 
movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H 
+ pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0CCH + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0C0H + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0CCH + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0C0H + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor 
xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + movdqa xmm0, xmmword ptr [rsp+130H] + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm2, xmmword ptr [rsp+120H] + movdqu xmm3, xmmword ptr [rsp+118H] + movdqu xmm4, xmmword ptr [rsp+128H] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+110H], xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, 
xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse41 ENDP +blake3_hash_many_sse41 ENDP + +blake3_compress_in_place_sse41 PROC +_blake3_compress_in_place_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + 
shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse41 ENDP +blake3_compress_in_place_sse41 ENDP + +ALIGN 16 +blake3_compress_xof_sse41 PROC 
+_blake3_compress_xof_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, 
xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse41 ENDP +blake3_compress_xof_sse41 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +_RDATA ENDS +END + diff --git a/src/linker/third_party_ext/blake3/c/LICENSE b/src/linker/third_party_ext/blake3/c/LICENSE new file mode 100644 index 00000000..f5892efc --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/LICENSE @@ -0,0 +1,330 @@ +This work is released into the public domain with CC0 1.0. Alternatively, it is +licensed under the Apache License 2.0. 
+ +------------------------------------------------------------------------------- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. 
+ +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + +------------------------------------------------------------------------------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/src/linker/third_party_ext/blake3/c/blake3.c b/src/linker/third_party_ext/blake3/c/blake3.c new file mode 100644 index 00000000..692f4b02 --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/blake3.c @@ -0,0 +1,616 @@ +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#include "blake3.h" +#include "blake3_impl.h" + +const char *blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, 
input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. 
+INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | 
chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. 
Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. 
+ if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide(const uint8_t *input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, uint8_t *out) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. 
+ if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, + chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide( + right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. 
+ if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void compress_subtree_to_parent_node( + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + // The second half of this loop condition is always true, and we just + // asserted it above. But GCC can't tell that it's always true, and if NDEBUG + // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious + // warnings here. 
GCC 8.5 is particularly sensitive, so if you're changing + // this code, test it against that version. + while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. 
Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". 
That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. 
+ if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. 
For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. 
In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. 
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +void blake3_hasher_reset(blake3_hasher *self) { + chunk_state_reset(&self->chunk, self->key, 0); + self->cv_stack_len = 0; +} diff --git a/src/linker/third_party_ext/blake3/c/blake3.h b/src/linker/third_party_ext/blake3/c/blake3.h new file mode 100644 index 00000000..f694dcf2 --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/blake3.h @@ -0,0 +1,82 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include <stddef.h> +#include <stdint.h> + +#if !defined(BLAKE3_API) +# if defined(_WIN32) || defined(__CYGWIN__) +# if defined(BLAKE3_DLL) +# if defined(BLAKE3_DLL_EXPORTS) +# define BLAKE3_API __declspec(dllexport) +# else +# define BLAKE3_API __declspec(dllimport) +# endif +# define BLAKE3_PRIVATE +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +# elif __GNUC__ >= 4 +# define BLAKE3_API __attribute__((visibility("default"))) +# define BLAKE3_PRIVATE __attribute__((visibility("hidden"))) +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "1.5.0" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. 
+typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +BLAKE3_API const char *blake3_version(void); +BLAKE3_API void blake3_hasher_init(blake3_hasher *self); +BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); +BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/src/linker/third_party_ext/blake3/c/blake3_avx2.c b/src/linker/third_party_ext/blake3/c/blake3_avx2.c new file mode 100644 index 00000000..46b5cecf --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/blake3_avx2.c @@ -0,0 +1,312 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 8 + +INLINE __m256i loadu(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE void storeu(__m256i src, 
uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. +INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m256i rot16(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} + +INLINE __m256i rot12(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); +} + +INLINE __m256i rot8(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, + 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} + +INLINE __m256i rot7(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); +} + +INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = 
addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + 
v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m256i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const char *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[8]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + const __m256i mask = 
_mm256_set1_epi32(-(int32_t)increment_counter); + const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i add1 = _mm256_and_si256(mask, add0); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1); + __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), + _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = 
xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(h_vecs); + storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } +#if !defined(BLAKE3_NO_SSE41) + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); +#else + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +#endif +} diff --git a/src/linker/third_party_ext/blake3/c/blake3_avx512.c b/src/linker/third_party_ext/blake3/c/blake3_avx512.c new file mode 100644 index 00000000..f969bf37 --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/blake3_avx512.c @@ -0,0 +1,1220 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu_128(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE __m256i loadu_256(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE __m512i loadu_512(const uint8_t 
src[64]) { + return _mm512_loadu_si512((const __m512i *)src); +} + +INLINE void storeu_128(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE void storeu_256(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } + +INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } + +INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } + +INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } + +INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } + +INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } + +INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } + +INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } + +INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } + +INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } + +INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } + +INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } + +INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } + +INLINE __m512i rot7_512(__m512i x) { 
return _mm512_ror_epi32(x, 7); } + +/* + * ---------------------------------------------------------------------------- + * compress_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = 
loadu_128(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 
3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 
0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} + +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); 
+} + +/* + * ---------------------------------------------------------------------------- + * hash4_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = 
xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + 
+INLINE void transpose_vecs_128(__m128i vecs[4]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * 
sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const char *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m256i mask_vec = _mm256_set1_epi64x(mask); + __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); + deltas = _mm256_and_si256(mask_vec, deltas); + __m256i counters = + _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); + *out_lo = _mm256_cvtepi64_epi32(counters); + *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); +} + +static +void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1_128(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + 
round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash8_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[15] = rot16_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = 
add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot12_256(v[4]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[15] = rot8_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot7_256(v[4]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot16_256(v[15]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = 
xor_256(v[4], v[9]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[4] = rot12_256(v[4]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot8_256(v[15]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + v[4] = rot7_256(v[4]); +} + +INLINE void transpose_vecs_256(__m256i vecs[8]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. 
+ __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. + vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = 
loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const char *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_256(&out[0]); + transpose_vecs_256(&out[8]); +} + +INLINE void load_counters8(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas = _mm512_and_si512(mask_vec, deltas); + __m512i counters = + _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); + *out_lo = _mm512_cvtepi64_epi32(counters); + *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); +} + +static +void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1_256(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), 
set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + h_vecs[0] = xor_256(v[0], v[8]); + h_vecs[1] = xor_256(v[1], v[9]); + h_vecs[2] = xor_256(v[2], v[10]); + h_vecs[3] = xor_256(v[3], v[11]); + h_vecs[4] = xor_256(v[4], v[12]); + h_vecs[5] = xor_256(v[5], v[13]); + h_vecs[6] = xor_256(v[6], v[14]); + h_vecs[7] = xor_256(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_256(h_vecs); + storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash16_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[15] = rot16_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + 
v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot12_512(v[4]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[15] = rot8_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot7_512(v[4]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot16_512(v[15]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = 
xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[4] = rot12_512(v[4]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot8_512(v[15]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + v[4] = rot7_512(v[4]); +} + +// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order +#define LO_IMM8 0x88 + +INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, LO_IMM8); +} + +// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order +#define HI_IMM8 0xdd + +INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, HI_IMM8); +} + +INLINE void transpose_vecs_512(__m512i vecs[16]) { + // Interleave 32-bit lanes. The _0 unpack is lanes + // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes + // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
+ __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); + __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); + __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); + __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); + __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); + __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); + __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); + __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); + __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); + __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); + __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); + __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); + __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); + __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); + __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); + __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); + + // Interleave 64-bit lanes. The _0 unpack is lanes + // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes + // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes + // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes + // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. 
+ __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); + __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); + __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); + __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); + __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); + __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); + __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); + __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); + __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); + __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); + __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); + __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); + __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); + __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); + __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); + __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); + + // Interleave 128-bit lanes. The _0 unpack is + // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is + // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. + __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); + __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); + __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); + __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); + __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); + __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); + __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); + __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); + __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); + __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); + __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); + __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); + __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); + __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); + __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); + __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); + + // Interleave 128-bit lanes again for the final outputs. 
+ vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); + vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); + vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); + vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); + vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); + vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); + vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); + vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); + vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); + vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); + vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); + vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); + vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); + vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); + vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); + vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); +} + +INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, + size_t block_offset, __m512i out[16]) { + out[0] = loadu_512(&inputs[0][block_offset]); + out[1] = loadu_512(&inputs[1][block_offset]); + out[2] = loadu_512(&inputs[2][block_offset]); + out[3] = loadu_512(&inputs[3][block_offset]); + out[4] = loadu_512(&inputs[4][block_offset]); + out[5] = loadu_512(&inputs[5][block_offset]); + out[6] = loadu_512(&inputs[6][block_offset]); + out[7] = loadu_512(&inputs[7][block_offset]); + out[8] = loadu_512(&inputs[8][block_offset]); + out[9] = loadu_512(&inputs[9][block_offset]); + out[10] = loadu_512(&inputs[10][block_offset]); + out[11] = loadu_512(&inputs[11][block_offset]); + out[12] = loadu_512(&inputs[12][block_offset]); + out[13] = loadu_512(&inputs[13][block_offset]); + out[14] = loadu_512(&inputs[14][block_offset]); + out[15] = loadu_512(&inputs[15][block_offset]); + for (size_t i = 0; i < 16; ++i) { + _mm_prefetch((const char *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_512(out); +} + +INLINE void load_counters16(uint64_t counter, bool increment_counter, + __m512i *out_lo, __m512i *out_hi) { + const __m512i mask = 
_mm512_set1_epi32(-(int32_t)increment_counter); + const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i masked_deltas = _mm512_and_si512(deltas, mask); + const __m512i low_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)counter), + masked_deltas); + // The carry bit is 1 if the high bit of the word was 1 before addition and is + // 0 after. + // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to + // compute the carry bits here, and originally we did, but that intrinsic is + // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271. + const __m512i carries = _mm512_srli_epi32( + _mm512_andnot_si512( + low_words, // 0 after (gets inverted by andnot) + _mm512_set1_epi32((int32_t)counter)), // and 1 before + 31); + const __m512i high_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)(counter >> 32)), + carries); + *out_lo = low_words; + *out_hi = high_words; +} + +static +void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { + __m512i h_vecs[8] = { + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), + }; + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); + __m512i block_flags_vec = set1_512(block_flags); + __m512i msg_vecs[16]; + transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + 
set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + h_vecs[0] = xor_512(v[0], v[8]); + h_vecs[1] = xor_512(v[1], v[9]); + h_vecs[2] = xor_512(v[2], v[10]); + h_vecs[3] = xor_512(v[3], v[11]); + h_vecs[4] = xor_512(v[4], v[12]); + h_vecs[5] = xor_512(v[5], v[13]); + h_vecs[6] = xor_512(v[6], v[14]); + h_vecs[7] = xor_512(v[7], v[15]); + + block_flags = flags; + } + + // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 + // state vectors. Pad the matrix with zeros. After transposition, store the + // lower half of each vector. + __m512i padded[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + }; + transpose_vecs_512(padded); + _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); + _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); + _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); + _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); + _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); + _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); + _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); + _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); + _mm256_mask_storeu_epi32(&out[8 * 
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); + _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); + _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); + _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); + _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); + _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); + _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); + _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 16) { + blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + 
counter += 16; + } + inputs += 16; + num_inputs -= 16; + out = &out[16 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 8) { + blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 8; + } + inputs += 8; + num_inputs -= 8; + out = &out[8 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 4) { + blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/src/linker/third_party_ext/blake3/c/blake3_dispatch.c b/src/linker/third_party_ext/blake3/c/blake3_dispatch.c new file mode 100644 index 00000000..78f7ddb6 --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/blake3_dispatch.c @@ -0,0 +1,305 @@ +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include "blake3_impl.h" + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <Windows.h> +#include <intrin.h> +#elif defined(__GNUC__) +#include <immintrin.h> +#else +#undef IS_X86 /* Unimplemented! 
*/ +#endif +#endif + +#if !defined(BLAKE3_ATOMICS) +#if defined(__has_include) +#if __has_include(<stdatomic.h>) && !defined(_MSC_VER) +#define BLAKE3_ATOMICS 1 +#else +#define BLAKE3_ATOMICS 0 +#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */ +#else +#define BLAKE3_ATOMICS 0 +#endif /* defined(__has_include) */ +#endif /* BLAKE3_ATOMICS */ + +#if BLAKE3_ATOMICS +#define ATOMIC_INT _Atomic int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#elif defined(_MSC_VER) +#define ATOMIC_INT LONG +#define ATOMIC_LOAD(x) InterlockedOr(&x, 0) +#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) +#else +#define ATOMIC_INT int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#endif + +#define MAYBE_UNUSED(x) (void)((x)) + +#if defined(IS_X86) +static uint64_t xgetbv(void) { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} + +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + +#endif + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F 
= 1 << 5, + AVX512VL = 1 << 6, + /* ... */ + UNDEFINED = 1 << 30 +}; + +#if !defined(BLAKE3_TESTING) +static /* Allow the variable to be controlled manually for testing */ +#endif + ATOMIC_INT g_cpu_features = UNDEFINED; + +#if !defined(BLAKE3_TESTING) +static +#endif + enum cpu_feature + get_cpu_features(void) { + + /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ + long features = ATOMIC_LOAD(g_cpu_features); + if (features != UNDEFINED) { + return (enum cpu_feature)features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; + (void)edx; + features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 9)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + ATOMIC_STORE(g_cpu_features, features); + return (enum cpu_feature)features; +#else + /* How to detect NEON? 
*/ + return 0; +#endif + } +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); + return; + } +#endif +#endif + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); + return; + } +#endif +#endif + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + 
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#endif + +#if BLAKE3_USE_NEON == 1 + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; +#endif + + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +} + +// The dynamically detected SIMD degree of the current platform. 
+size_t blake3_simd_degree(void) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + return 16; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + return 8; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + return 4; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + return 4; + } +#endif +#endif +#if BLAKE3_USE_NEON == 1 + return 4; +#endif + return 1; +} diff --git a/src/linker/third_party_ext/blake3/c/blake3_impl.h b/src/linker/third_party_ext/blake3/c/blake3_impl.h new file mode 100644 index 00000000..beab5cf5 --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/blake3_impl.h @@ -0,0 +1,285 @@ +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "blake3.h" + +// internal flags +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +// This C implementation tries to support recent versions of GCC, Clang, and +// MSVC. 
+#if defined(_MSC_VER) +#define INLINE static __forceinline +#else +#define INLINE static inline __attribute__((always_inline)) +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86 +#define IS_X86_64 +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define IS_X86 +#define IS_X86_32 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) +#define IS_AARCH64 +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <intrin.h> +#endif +#endif + +#if !defined(BLAKE3_USE_NEON) + // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness + #if defined(IS_AARCH64) + #if defined(__ARM_BIG_ENDIAN) + #define BLAKE3_USE_NEON 0 + #else + #define BLAKE3_USE_NEON 1 + #endif + #else + #define BLAKE3_USE_NEON 0 + #endif +#endif + +#if defined(IS_X86) +#define MAX_SIMD_DEGREE 16 +#elif BLAKE3_USE_NEON == 1 +#define MAX_SIMD_DEGREE 4 +#else +#define MAX_SIMD_DEGREE 1 +#endif + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) + +static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, + 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, + 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +/* x is assumed to be nonzero. 
*/ +static unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return 63 ^ (unsigned int)__builtin_clzll(x); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return index; +#elif defined(_MSC_VER) && defined(IS_X86_32) + if(x >> 32) { + unsigned long index; + _BitScanReverse(&index, (unsigned long)(x >> 32)); + return 32 + index; + } else { + unsigned long index; + _BitScanReverse(&index, (unsigned long)x); + return index; + } +#else + unsigned int c = 0; + if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if(x & 0x0000000000000002ULL) { c += 1; } + return c; +#endif +} + +// Count the number of 1 bits. +INLINE unsigned int popcnt(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (unsigned int)__builtin_popcountll(x); +#else + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +#endif +} + +// Largest power of two less than or equal to x. As a special case, returns 1 +// when x is 0. 
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
+  return 1ULL << highest_one(x | 1);
+}
+
+INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
+
+INLINE uint32_t counter_high(uint64_t counter) {
+  return (uint32_t)(counter >> 32);
+}
+
+INLINE uint32_t load32(const void *src) {
+  const uint8_t *p = (const uint8_t *)src;
+  return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
+         ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
+}
+
+INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+                           uint32_t key_words[8]) {
+  key_words[0] = load32(&key[0 * 4]);
+  key_words[1] = load32(&key[1 * 4]);
+  key_words[2] = load32(&key[2 * 4]);
+  key_words[3] = load32(&key[3 * 4]);
+  key_words[4] = load32(&key[4 * 4]);
+  key_words[5] = load32(&key[5 * 4]);
+  key_words[6] = load32(&key[6 * 4]);
+  key_words[7] = load32(&key[7 * 4]);
+}
+
+INLINE void store32(void *dst, uint32_t w) {
+  uint8_t *p = (uint8_t *)dst;
+  p[0] = (uint8_t)(w >> 0);
+  p[1] = (uint8_t)(w >> 8);
+  p[2] = (uint8_t)(w >> 16);
+  p[3] = (uint8_t)(w >> 24);
+}
+
+INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
+  store32(&bytes_out[0 * 4], cv_words[0]);
+  store32(&bytes_out[1 * 4], cv_words[1]);
+  store32(&bytes_out[2 * 4], cv_words[2]);
+  store32(&bytes_out[3 * 4], cv_words[3]);
+  store32(&bytes_out[4 * 4], cv_words[4]);
+  store32(&bytes_out[5 * 4], cv_words[5]);
+  store32(&bytes_out[6 * 4], cv_words[6]);
+  store32(&bytes_out[7 * 4], cv_words[7]);
+}
+
+void blake3_compress_in_place(uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags);
+
+void blake3_compress_xof(const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags,
+                         uint8_t out[64]);
+
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                      size_t blocks, const uint32_t key[8], uint64_t counter,
+                      bool increment_counter, uint8_t flags,
+                      uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+size_t blake3_simd_degree(void);
+
+
+// Declarations for implementation-specific functions.
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]);
+
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out);
+
+#if defined(IS_X86)
+#if !defined(BLAKE3_NO_SSE2)
+void blake3_compress_in_place_sse2(uint32_t cv[8],
+                                   const uint8_t block[BLAKE3_BLOCK_LEN],
+                                   uint8_t block_len, uint64_t counter,
+                                   uint8_t flags);
+void blake3_compress_xof_sse2(const uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags, uint8_t out[64]);
+void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags);
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags, uint8_t out[64]);
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif
+#if !defined(BLAKE3_NO_AVX512)
+void blake3_compress_in_place_avx512(uint32_t cv[8],
+                                     const uint8_t block[BLAKE3_BLOCK_LEN],
+                                     uint8_t block_len, uint64_t counter,
+                                     uint8_t flags);
+
+void blake3_compress_xof_avx512(const uint32_t cv[8],
+                                const uint8_t block[BLAKE3_BLOCK_LEN],
+                                uint8_t block_len, uint64_t counter,
+                                uint8_t flags, uint8_t out[64]);
+
+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
+                             size_t blocks, const uint32_t key[8],
+                             uint64_t counter, bool increment_counter,
+                             uint8_t flags, uint8_t flags_start,
+                             uint8_t flags_end, uint8_t *out);
+#endif
+#endif
+
+#if BLAKE3_USE_NEON == 1
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif
+
+
+#endif /* BLAKE3_IMPL_H */
diff --git a/src/linker/third_party_ext/blake3/c/blake3_neon.c b/src/linker/third_party_ext/blake3/c/blake3_neon.c
new file mode 100644
index 00000000..8a818fc7
--- /dev/null
+++ b/src/linker/third_party_ext/blake3/c/blake3_neon.c
@@ -0,0 +1,378 @@
+#include "blake3_impl.h"
+
+#include <arm_neon.h>
+
+#ifdef __ARM_BIG_ENDIAN
+#error "This implementation only supports little-endian ARM."
+// It might be that all we need for big-endian support here is to get the loads
+// and stores right, but step zero would be finding a way to test it in CI.
+#endif
+
+INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
+  // vld1q_u32 has alignment requirements. Don't use it.
+  uint32x4_t x;
+  memcpy(&x, src, 16);
+  return x;
+}
+
+INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
+  // vst1q_u32 has alignment requirements. Don't use it.
+  memcpy(dest, &src, 16);
+}
+
+INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
+  return vaddq_u32(a, b);
+}
+
+INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
+  return veorq_u32(a, b);
+}
+
+INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
+
+INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  uint32_t array[4] = {a, b, c, d};
+  return vld1q_u32(array);
+}
+
+INLINE uint32x4_t rot16_128(uint32x4_t x) {
+  // The straightforward implementation would be two shifts and an or, but that's
+  // slower on microarchitectures we've tested. See
+  // https://github.com/BLAKE3-team/BLAKE3/pull/319.
+  // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+  return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
+}
+
+INLINE uint32x4_t rot12_128(uint32x4_t x) {
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
+}
+
+INLINE uint32x4_t rot8_128(uint32x4_t x) {
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+#if defined(__clang__)
+  return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
+#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
+  static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
+  return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
+#else
+  return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
+#endif
+}
+
+INLINE uint32x4_t rot7_128(uint32x4_t x) {
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
+}
+
+// TODO: compress_neon
+
+// TODO: hash2_neon
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_neon
+ * ----------------------------------------------------------------------------
+ */
+
+// One full round (column step, then diagonal step) applied to four states at
+// once: v[i] holds state word i of every lane, r selects the schedule row.
+INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[15] = rot16_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot12_128(v[4]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[15] = rot8_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot7_128(v[4]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot16_128(v[15]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[4] = rot12_128(v[4]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot8_128(v[15]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  v[4] = rot7_128(v[4]);
+}
+
+// Transposes a 4x4 matrix of 32-bit words held in four vectors, in place.
+INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
+  // Individually transpose the four 2x2 sub-matrices in each corner.
+  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
+  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
+
+  // Swap the top-right and bottom-left 2x2s (which just got transposed).
+  vecs[0] =
+      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
+  vecs[1] =
+      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
+  vecs[2] =
+      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
+  vecs[3] =
+      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
+}
+
+INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
+                                size_t block_offset, uint32x4_t out[16]) {
+  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
+  transpose_vecs_128(&out[0]);
+  transpose_vecs_128(&out[4]);
+  transpose_vecs_128(&out[8]);
+  transpose_vecs_128(&out[12]);
+}
+
+// Builds the per-lane counter vectors: counter+0..3 when increment_counter is
+// set, otherwise the same counter value in all four lanes.
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           uint32x4_t *out_low, uint32x4_t *out_high) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  *out_low = set4(
+      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
+  *out_high = set4(
+      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
+}
+
+// Hashes four inputs of `blocks` full blocks each, one input per vector lane,
+// and writes the four 32-byte results back to back into out.
+void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  uint32x4_t h_vecs[8] = {
+      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
+  };
+  uint32x4_t counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
+    uint32x4_t block_flags_vec = set1_128(block_flags);
+    uint32x4_t msg_vecs[16];
+    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    uint32x4_t v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn4(v, msg_vecs, 0);
+    round_fn4(v, msg_vecs, 1);
+    round_fn4(v, msg_vecs, 2);
+    round_fn4(v, msg_vecs, 3);
+    round_fn4(v, msg_vecs, 4);
+    round_fn4(v, msg_vecs, 5);
+    round_fn4(v, msg_vecs, 6);
+    h_vecs[0] = xor_128(v[0], v[8]);
+    h_vecs[1] = xor_128(v[1], v[9]);
+    h_vecs[2] = xor_128(v[2], v[10]);
+    h_vecs[3] = xor_128(v[3], v[11]);
+    h_vecs[4] = xor_128(v[4], v[12]);
+    h_vecs[5] = xor_128(v[5], v[13]);
+    h_vecs[6] = xor_128(v[6], v[14]);
+    h_vecs[7] = xor_128(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_128(&h_vecs[0]);
+  transpose_vecs_128(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_neon
+ * ----------------------------------------------------------------------------
+ */
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+// Single-input fallback: compresses block by block with the portable code.
+INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+                          uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    // TODO: Implement compress_neon. However note that according to
+    // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
+    // compress_neon might not be any faster than compress_portable.
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+// Processes inputs four at a time with blake3_hash4_neon, then hashes any
+// remaining (fewer than four) inputs one by one with hash_one_neon.
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= 4) {
+    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
+    inputs += 4;
+    num_inputs -= 4;
+    out = &out[4 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
+                  flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/src/linker/third_party_ext/blake3/c/blake3_portable.c b/src/linker/third_party_ext/blake3/c/blake3_portable.c
new file mode 100644
index 00000000..062dd1b4
--- /dev/null
+++ b/src/linker/third_party_ext/blake3/c/blake3_portable.c
@@ -0,0 +1,171 @@
+#include "blake3_impl.h"
+#include <string.h>
+
+// Rotates w right by c bits; c must be in 1..31 (a shift by 32 is undefined).
+INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
+  return (w >> c) | (w << (32 - c));
+}
+
+// Mixing function G: folds message words x and y into the four state words
+// selected by the indices a, b, c and d.
+INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
+              uint32_t x, uint32_t y) {
+  state[a] = state[a] + state[b] + x;
+  state[d] = rotr32(state[d] ^ state[a], 16);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 12);
+  state[a] = state[a] + state[b] + y;
+  state[d] = rotr32(state[d] ^ state[a], 8);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 7);
+}
+
+INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
+  // Select the message schedule based on the round.
+  const uint8_t *schedule = MSG_SCHEDULE[round];
+
+  // Mix the columns.
+  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
+  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
+  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
+  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
+
+  // Mix the rows.
+  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
+  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
+  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
+  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
+}
+
+// Loads the block as sixteen little-endian words, initializes the 16-word
+// state from cv, IV, counter, block_len and flags, and runs seven rounds.
+INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  uint32_t block_words[16];
+  block_words[0] = load32(block + 4 * 0);
+  block_words[1] = load32(block + 4 * 1);
+  block_words[2] = load32(block + 4 * 2);
+  block_words[3] = load32(block + 4 * 3);
+  block_words[4] = load32(block + 4 * 4);
+  block_words[5] = load32(block + 4 * 5);
+  block_words[6] = load32(block + 4 * 6);
+  block_words[7] = load32(block + 4 * 7);
+  block_words[8] = load32(block + 4 * 8);
+  block_words[9] = load32(block + 4 * 9);
+  block_words[10] = load32(block + 4 * 10);
+  block_words[11] = load32(block + 4 * 11);
+  block_words[12] = load32(block + 4 * 12);
+  block_words[13] = load32(block + 4 * 13);
+  block_words[14] = load32(block + 4 * 14);
+  block_words[15] = load32(block + 4 * 15);
+
+  state[0] = cv[0];
+  state[1] = cv[1];
+  state[2] = cv[2];
+  state[3] = cv[3];
+  state[4] = cv[4];
+  state[5] = cv[5];
+  state[6] = cv[6];
+  state[7] = cv[7];
+  state[8] = IV[0];
+  state[9] = IV[1];
+  state[10] = IV[2];
+  state[11] = IV[3];
+  state[12] = counter_low(counter);
+  state[13] = counter_high(counter);
+  state[14] = (uint32_t)block_len;
+  state[15] = (uint32_t)flags;
+
+  round_fn(state, &block_words[0], 0);
+  round_fn(state, &block_words[0], 1);
+  round_fn(state, &block_words[0], 2);
+  round_fn(state, &block_words[0], 3);
+  round_fn(state, &block_words[0], 4);
+  round_fn(state, &block_words[0], 5);
+  round_fn(state, &block_words[0], 6);
+}
+
+// Compresses one block and overwrites cv with the new chaining value.
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+  cv[0] = state[0] ^ state[8];
+  cv[1] = state[1] ^ state[9];
+  cv[2] = state[2] ^ state[10];
+  cv[3] = state[3] ^ state[11];
+  cv[4] = state[4] ^ state[12];
+  cv[5] = state[5] ^ state[13];
+  cv[6] = state[6] ^ state[14];
+  cv[7] = state[7] ^ state[15];
+}
+
+// Compresses one block and writes the full 64-byte output to out as
+// little-endian words; cv itself is left unmodified.
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+
+  store32(&out[0 * 4], state[0] ^ state[8]);
+  store32(&out[1 * 4], state[1] ^ state[9]);
+  store32(&out[2 * 4], state[2] ^ state[10]);
+  store32(&out[3 * 4], state[3] ^ state[11]);
+  store32(&out[4 * 4], state[4] ^ state[12]);
+  store32(&out[5 * 4], state[5] ^ state[13]);
+  store32(&out[6 * 4], state[6] ^ state[14]);
+  store32(&out[7 * 4], state[7] ^ state[15]);
+  store32(&out[8 * 4], state[8] ^ cv[0]);
+  store32(&out[9 * 4], state[9] ^ cv[1]);
+  store32(&out[10 * 4], state[10] ^ cv[2]);
+  store32(&out[11 * 4], state[11] ^ cv[3]);
+  store32(&out[12 * 4], state[12] ^ cv[4]);
+  store32(&out[13 * 4], state[13] ^ cv[5]);
+  store32(&out[14 * 4], state[14] ^ cv[6]);
+  store32(&out[15 * 4], state[15] ^ cv[7]);
+}
+
+// Hashes `blocks` consecutive full blocks of one input into out, applying
+// flags_start to the first block and flags_end to the last.
+INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              uint8_t flags, uint8_t flags_start,
+                              uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  store_cv_words(out, cv);
+}
+
+// Hashes each input in turn, writing BLAKE3_OUT_LEN bytes per input to out.
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out) {
+  while (num_inputs > 0) {
+    hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
+                      flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/src/linker/third_party_ext/blake3/c/blake3_sse2.c b/src/linker/third_party_ext/blake3/c/blake3_sse2.c
new file mode 100644
index 00000000..86bb17ab
--- /dev/null
+++ b/src/linker/third_party_ext/blake3/c/blake3_sse2.c
@@ -0,0 +1,570 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4
+
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+INLINE __m128i loadu(const uint8_t src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE void storeu(__m128i src, uint8_t dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
+// Rotates each 32-bit lane right by 16 by swapping its two 16-bit halves.
+INLINE __m128i rot16(__m128i x) {
+  return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
+}
+
+// rot12/rot8/rot7: the two shifted parts share no bits, so XOR acts as OR.
+INLINE __m128i rot12(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m128i rot8(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
+}
+
+INLINE __m128i rot7(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
+}
+
+// First half of the G step on whole rows: add m, then rotate by 16 and 12.
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot16(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot12(*row1);
+}
+
+// Second half of the G step on whole rows: add m, then rotate by 8 and 7.
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot8(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot7(*row1);
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this.
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { + const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + __m128i mask = _mm_set1_epi16(imm8); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 
0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void 
blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] 
= addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); 
+ v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = 
loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const char *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), 
set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while 
(num_inputs >= DEGREE) { + blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/src/linker/third_party_ext/blake3/c/blake3_sse41.c b/src/linker/third_party_ext/blake3/c/blake3_sse41.c new file mode 100644 index 00000000..b28ae13b --- /dev/null +++ b/src/linker/third_party_ext/blake3/c/blake3_sse41.c @@ -0,0 +1,560 @@ +#include "blake3_impl.h" + +#include + +#define DEGREE 4 + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE void storeu(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. 
+INLINE __m128i xorv(__m128i a, __m128i b) {
+  // Bitwise XOR over the whole 128-bit register.
+  const __m128i mixed = _mm_xor_si128(a, b);
+  return mixed;
+}
+
+// Broadcast a single 32-bit word into all four lanes.
+INLINE __m128i set1(uint32_t x) {
+  const int32_t word = (int32_t)x;
+  return _mm_set1_epi32(word);
+}
+
+// Build a vector from four 32-bit words; `a` lands in the lowest lane.
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  const int32_t lane0 = (int32_t)a;
+  const int32_t lane1 = (int32_t)b;
+  const int32_t lane2 = (int32_t)c;
+  const int32_t lane3 = (int32_t)d;
+  return _mm_setr_epi32(lane0, lane1, lane2, lane3);
+}
+
+// Rotate each 32-bit lane by 16 bits. Done as one byte shuffle that swaps the
+// two 16-bit halves of every lane.
+INLINE __m128i rot16(__m128i x) {
+  const __m128i swap_halves =
+      _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+  return _mm_shuffle_epi8(x, swap_halves);
+}
+
+// Rotate each 32-bit lane right by 12 bits: the two shifted pieces occupy
+// disjoint bits and are merged with XOR.
+INLINE __m128i rot12(__m128i x) {
+  const __m128i low_part = _mm_srli_epi32(x, 12);
+  const __m128i high_part = _mm_slli_epi32(x, 32 - 12);
+  return xorv(low_part, high_part);
+}
+
+// Rotate each 32-bit lane right by 8 bits. Done as one byte shuffle that moves
+// every byte of a lane down one position, wrapping the lowest byte to the top.
+INLINE __m128i rot8(__m128i x) {
+  const __m128i shift_bytes =
+      _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
+  return _mm_shuffle_epi8(x, shift_bytes);
+}
+
+// Rotate each 32-bit lane right by 7 bits: the two shifted pieces occupy
+// disjoint bits and are merged with XOR.
+INLINE __m128i rot7(__m128i x) {
+  const __m128i low_part = _mm_srli_epi32(x, 7);
+  const __m128i high_part = _mm_slli_epi32(x, 32 - 7);
+  return xorv(low_part, high_part);
+}
+
+// Row-wise mixing step using the 16-bit and 12-bit rotations:
+//   row0 += m + row1;  row3 = rot16(row3 ^ row0);
+//   row2 += row3;      row1 = rot12(row1 ^ row2);
+// All four rows are updated in place.
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  const __m128i with_msg = addv(*row0, m);
+  *row0 = addv(with_msg, *row1);
+  *row3 = rot16(xorv(*row3, *row0));
+  *row2 = addv(*row2, *row3);
+  *row1 = rot12(xorv(*row1, *row2));
+}
+
+// Row-wise mixing step using the 8-bit and 7-bit rotations:
+//   row0 += m + row1;  row3 = rot8(row3 ^ row0);
+//   row2 += row3;      row1 = rot7(row1 ^ row2);
+// All four rows are updated in place.
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  const __m128i with_msg = addv(*row0, m);
+  *row0 = addv(with_msg, *row1);
+  *row3 = rot8(xorv(*row3, *row0));
+  *row2 = addv(*row2, *row3);
+  *row1 = rot7(xorv(*row1, *row2));
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. 
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + 
undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + 
v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], 
v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = 
loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const char *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], 
h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, 
uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/src/linker/third_party_ext/md5/md5.c b/src/linker/third_party_ext/md5/md5.c new file mode 100644 index 00000000..57f429d4 --- /dev/null +++ b/src/linker/third_party_ext/md5/md5.c @@ -0,0 +1,293 @@ +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. + * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * (This is a heavily cut-down "BSD license".) 
+ * + * This differs from Colin Plumb's older public domain implementation in that + * no exactly 32-bit integer data type is required (any 32-bit or wider + * unsigned integer data type will do), there's no compile-time endianness + * configuration, and the function prototypes match OpenSSL's. No code from + * Colin Plumb's implementation has been reused; this comment merely compares + * the properties of the two independent implementations. + * + * The primary goals of this implementation are portability and ease of use. + * It is meant to be fast, but not as fast as possible. Some known + * optimizations are not included to reduce source code size and avoid + * compile-time configuration. + */ + +#ifndef HAVE_OPENSSL + +#include <string.h> + +#include "md5.h" + +/* + * The basic MD5 functions. + * + * F and G are optimized compared to their RFC 1321 definitions for + * architectures that lack an AND-NOT instruction, just like in Colin Plumb's + * implementation. + */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) +#define H(x, y, z) (((x) ^ (y)) ^ (z)) +#define H2(x, y, z) ((x) ^ ((y) ^ (z))) +#define I(x, y, z) ((y) ^ ((x) | ~(z))) + +/* + * The MD5 transformation for all four rounds. + */ +#define STEP(f, a, b, c, d, x, t, s) \ + (a) += f((b), (c), (d)) + (x) + (t); \ + (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ + (a) += (b); + +/* + * SET reads 4 input bytes in little-endian byte order and stores them in a + * properly aligned word in host byte order. + * + * The check for little-endian architectures that tolerate unaligned memory + * accesses is just an optimization. Nothing will break if it fails to detect + * a suitable architecture. + * + * Unfortunately, this optimization may be a C strict aliasing rules violation + * if the caller's data buffer has effective type that cannot be aliased by + * MD5_u32plus. 
In practice, this problem may occur if these MD5 routines are + * inlined into a calling function, or with future and dangerously advanced + * link-time optimizations. For the time being, keeping these MD5 routines in + * their own translation unit avoids the problem. + */ +#if defined(__i386__) || defined(__x86_64__) || defined(__vax__) +#define SET(n) \ + (*(MD5_u32plus *)&ptr[(n) * 4]) +#define GET(n) \ + SET(n) +#else +#define SET(n) \ + (ctx->block[(n)] = \ + (MD5_u32plus)ptr[(n) * 4] | \ + ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ + ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ + ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) +#define GET(n) \ + (ctx->block[(n)]) +#endif + +/* + * This processes one or more 64-byte data blocks, but does NOT update the bit + * counters. There are no alignment requirements. + */ +static const void *body(MD5_CTX *ctx, const void *data, unsigned long size) +{ + const unsigned char *ptr; + MD5_u32plus a, b, c, d; + MD5_u32plus saved_a, saved_b, saved_c, saved_d; + + ptr = (const unsigned char *)data; + + a = ctx->a; + b = ctx->b; + c = ctx->c; + d = ctx->d; + + do { + saved_a = a; + saved_b = b; + saved_c = c; + saved_d = d; + +/* Round 1 */ + STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) + STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) + STEP(F, c, d, a, b, SET(2), 0x242070db, 17) + STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) + STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) + STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) + STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) + STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) + STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) + STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) + STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) + STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) + STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) + STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) + STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) + STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) + +/* Round 2 */ + STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) + STEP(G, d, a, 
b, c, GET(6), 0xc040b340, 9) + STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) + STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) + STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) + STEP(G, d, a, b, c, GET(10), 0x02441453, 9) + STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) + STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) + STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) + STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) + STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) + STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) + STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) + STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) + STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) + STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) + +/* Round 3 */ + STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) + STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11) + STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) + STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23) + STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) + STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11) + STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) + STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23) + STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) + STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11) + STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) + STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23) + STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) + STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11) + STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) + STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23) + +/* Round 4 */ + STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) + STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) + STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) + STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) + STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) + STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) + STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) + STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) + STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) + STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) + STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) + STEP(I, b, c, d, a, GET(13), 
0x4e0811a1, 21) + STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) + STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) + STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) + STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) + + a += saved_a; + b += saved_b; + c += saved_c; + d += saved_d; + + ptr += 64; + } while (size -= 64); + + ctx->a = a; + ctx->b = b; + ctx->c = c; + ctx->d = d; + + return ptr; +} + +void MD5_Init(MD5_CTX *ctx) +{ + ctx->a = 0x67452301; + ctx->b = 0xefcdab89; + ctx->c = 0x98badcfe; + ctx->d = 0x10325476; + + ctx->lo = 0; + ctx->hi = 0; +} + +void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size) +{ + MD5_u32plus saved_lo; + unsigned long used, available; + + saved_lo = ctx->lo; + if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) + ctx->hi++; + ctx->hi += size >> 29; + + used = saved_lo & 0x3f; + + if (used) { + available = 64 - used; + + if (size < available) { + memcpy(&ctx->buffer[used], data, size); + return; + } + + memcpy(&ctx->buffer[used], data, available); + data = (const unsigned char *)data + available; + size -= available; + body(ctx, ctx->buffer, 64); + } + + if (size >= 64) { + data = body(ctx, data, size & ~(unsigned long)0x3f); + size &= 0x3f; + } + + memcpy(ctx->buffer, data, size); +} + +#define MD5_OUT(dst, src) \ + (dst)[0] = (unsigned char)(src); \ + (dst)[1] = (unsigned char)((src) >> 8); \ + (dst)[2] = (unsigned char)((src) >> 16); \ + (dst)[3] = (unsigned char)((src) >> 24); + +void MD5_Final(unsigned char *result, MD5_CTX *ctx) +{ + unsigned long used, available; + + used = ctx->lo & 0x3f; + + ctx->buffer[used++] = 0x80; + + available = 64 - used; + + if (available < 8) { + memset(&ctx->buffer[used], 0, available); + body(ctx, ctx->buffer, 64); + used = 0; + available = 64; + } + + memset(&ctx->buffer[used], 0, available - 8); + + ctx->lo <<= 3; + MD5_OUT(&ctx->buffer[56], ctx->lo) + MD5_OUT(&ctx->buffer[60], ctx->hi) + + body(ctx, ctx->buffer, 64); + + MD5_OUT(&result[0], ctx->a) + MD5_OUT(&result[4], ctx->b) + 
MD5_OUT(&result[8], ctx->c) + MD5_OUT(&result[12], ctx->d) + + memset(ctx, 0, sizeof(*ctx)); +} + +#undef MD5_OUT + +#endif diff --git a/src/linker/third_party_ext/md5/md5.h b/src/linker/third_party_ext/md5/md5.h new file mode 100644 index 00000000..2da44bf3 --- /dev/null +++ b/src/linker/third_party_ext/md5/md5.h @@ -0,0 +1,45 @@ +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. + * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * See md5.c for more information. + */ + +#ifdef HAVE_OPENSSL +#include <openssl/md5.h> +#elif !defined(_MD5_H) +#define _MD5_H + +/* Any 32-bit or wider unsigned integer data type will do */ +typedef unsigned int MD5_u32plus; + +typedef struct { + MD5_u32plus lo, hi; + MD5_u32plus a, b, c, d; + unsigned char buffer[64]; + MD5_u32plus block[16]; +} MD5_CTX; + +extern void MD5_Init(MD5_CTX *ctx); +extern void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size); +extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); + +#endif diff --git a/src/linker/third_party_ext/radsort/radsort.h b/src/linker/third_party_ext/radsort/radsort.h new file mode 100644 index 00000000..eb77580c --- /dev/null +++ b/src/linker/third_party_ext/radsort/radsort.h @@ -0,0 +1,607 @@ +// New radsort. 
+ +// To Use: +// Create a less_than function and then call radsort. +// +// So, for an array of unsigned ints: +// +// RSFORCEINLINE int int_is_before( void * elementa, void * elementb ) +// { +// return *(unsigned int*)elementa < *(unsigned int*)elementb; +// } +// +// radsort( buffer, count, int_is_before ); // element size is taken from sizeof( buffer[0] ) +// +// If your comparison function is very complicated, then you might try +// dropping the RSFORCEINLINE. + +#include <stddef.h> // for size_t + +#ifdef _MSC_VER +#define RSFORCEINLINE __forceinline __declspec(safebuffers) +#define CompilerReset(ptr) __assume(ptr) +#else +#define RSFORCEINLINE __attribute__((always_inline)) +#define CompilerReset(ptr) +#endif + +#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE__) || defined(__SSE2__) || defined(USE_SSE) +#include <xmmintrin.h> +#define RS_PREFETCH( addr ) _mm_prefetch( (addr), 0 ) +#endif + +// nonsense to make adding pointers more convenient +#define rsadd_ptr( ptr, ind ) (((char*)(ptr))+(ptrdiff_t)(ind)) +#define rssub_ptr( ptr, ind ) (((char*)(ptr))-(ptrdiff_t)(ind)) +#define rsadd_ptr_elements( ptr, ind ) rsadd_ptr( ptr, (ptrdiff_t)(ind)*(ptrdiff_t)element_size ) +#define rsdiff_ptr_elements( ptra, ptrb ) ( (size_t)(((char*)(ptra))-((char*)(ptrb))) / (size_t)element_size ) + +// this is the maximum size of struct that we treat as a "simple" struct +typedef struct RS_MAX_SIMPLE_BUF { char b[32]; } RS_MAX_SIMPLE_BUF; // todo, 64-bit + + +// ============================================================================================================== +// swap and move utility functions +typedef struct bytes64 { char b[64]; } bytes64; // copying with this turns into m512 moves (when arch is set) +typedef struct bytes32 { char b[32]; } bytes32; // copying with this turns into m256 moves (when arch is set) +typedef struct bytes16 { char b[16]; } bytes16; // copying with this turns into m128 moves 
+typedef struct bytes8 { char b[8]; } bytes8; + +static RSFORCEINLINE void radsortswapper( void * a, void * b, size_t size ) +{ + #define RSSWAPMEM(type) ( size >= sizeof(type) ) { type v = *(type const*)a; *(type*)a = *(type const*)b; *(type*)b = v; a=rsadd_ptr(a,sizeof(type)); b=rsadd_ptr(b,sizeof(type)); size -= sizeof(type); } + + while RSSWAPMEM(bytes64); + if RSSWAPMEM(bytes32); + if RSSWAPMEM(bytes16); + if RSSWAPMEM(bytes8); + if RSSWAPMEM(int); + if RSSWAPMEM(short); + if RSSWAPMEM(char); + + #undef RSSWAPMEM +} + +// since size is always constant, this big function compiles down to 4 to 12 instructions (for normal structs 4-6) +static RSFORCEINLINE void radsortmover( void * a, void * b, size_t size ) +{ + #define RSMOVEMEM(type) ( size >= sizeof(type) ) { *(type*)a = *(type const*)b; a=rsadd_ptr(a,sizeof(type)); b=rsadd_ptr(b,sizeof(type)); size -= sizeof(type); } + + while RSMOVEMEM(bytes64); + if RSMOVEMEM(bytes32); + if RSMOVEMEM(bytes16); + if RSMOVEMEM(bytes8); + if RSMOVEMEM(int); + if RSMOVEMEM(short); + if RSMOVEMEM(char); + + #undef RSMOVEMEM +} + +// these macros generate tiny move/swap routines that don't go through the generic function above (mostly for debug build performance) +#define RS_SIMPLE_SIZES _X(1) _X(2) _X(4) _X(8) _X(12) _X(16) +#define rsmoverfunc( num ) static RSFORCEINLINE void radsortmover##num ( void * dest, void * src, size_t element_size ) { typedef struct rs { char x[num]; } rs; *(rs*)dest = *(rs*)src; } +#define rsswapperfunc( num ) static RSFORCEINLINE void radsortswapper##num( void * a, void * b, size_t element_size ) { typedef struct rs { char x[num]; } rs; rs temp; temp = *(rs*)a; *(rs*)a = *(rs*)b; *(rs*)b = temp; } + +#define _X rsmoverfunc +RS_SIMPLE_SIZES +#undef _X +#define _X rsswapperfunc +RS_SIMPLE_SIZES +#undef _X + +#undef RS_SIMPLE_SIZES +#undef rsmoverfunc +#undef rsswapperfunc + + +// ============================================================================================================== + +typedef 
int is_before_func( void * elementa, void * elementb ); +typedef void swap_func( void * elementa, void * elementb, size_t element_size ); +typedef void move_func( void * dest, void * src, size_t size ); +typedef void rs_small_sort_func( void * left, size_t n, size_t element_size, is_before_func * is_before, move_func * mover, void * tmp ); + +#define radsortswapsize( size ) ( ( size == 1 ) ? radsortswapper1 : ( ( size == 2 ) ? radsortswapper2 : ( ( size == 4 ) ? radsortswapper4 : ( ( size == 8 ) ? radsortswapper8 : ( ( size == 12 ) ? radsortswapper12 : ( ( size == 16 ) ? radsortswapper16 : radsortswapper ) ) ) ) ) ) +#define radsortmovesize( size ) ( ( size == 1 ) ? radsortmover1 : ( ( size == 2 ) ? radsortmover2 : ( ( size == 4 ) ? radsortmover4 : ( ( size == 8 ) ? radsortmover8 : ( ( size == 12 ) ? radsortmover12 : ( ( size == 16 ) ? radsortmover16 : radsortmover ) ) ) ) ) ) + +// todo - maybe no bubble at all? +#define RS_SMALL_FLIP_TO_INSERTION_GT_SIZE sizeof( size_t ) +typedef struct RS_MAX_BUBBLE_BUF { char b[RS_SMALL_FLIP_TO_INSERTION_GT_SIZE]; } RS_MAX_BUBBLE_BUF; + +#define radsort( start, len, is_before_func ) \ + do { \ + char __rs_tmp[ sizeof( (start)[0] ) ]; \ + radsortinternal( start, len, sizeof( (start)[0] ), \ + is_before_func, \ + radsortswapsize( sizeof( (start)[0] ) ), \ + radsortmovesize( sizeof( (start)[0] ) ), \ + ( sizeof( (start)[0] ) > RS_SMALL_FLIP_TO_INSERTION_GT_SIZE ) ? radinsertionsort : radbubble2sort, \ + ( sizeof( (start)[0] ) > RS_SMALL_FLIP_TO_INSERTION_GT_SIZE ) ? 
RSS_FLIP_TO_SMALL_SORT_INSERTION : RSS_FLIP_TO_SMALL_SORT_BUBBLE2, \ + &__rs_tmp \ + ); \ + } while (0) +#define radheapsort( start, len, is_before_func ) do { radheapsortinteral( start, len, sizeof( ((start)[0]) ), is_before_func, radsortswapsize( sizeof( ((start)[0]) ) ) ); } while (0) + + +//=================================================================================================== +// small heap sort - this sort is around 200 bytes compiled - can use directly when size is important + +RSFORCEINLINE void radheapsortinteral( void * start, size_t len, size_t element_size, is_before_func * is_before, swap_func * swapper ) +{ + void * left; + void * right; + size_t length; + + left = start; + right = rsadd_ptr_elements( start, len - 1 ); + length = len; + + if ( length > 1 ) + { + // unusual small in-place heap sort + void * i; void * ind; void * v; void * n; + size_t s, k; + + s = length >> 1; + i = rsadd_ptr_elements( left, s ); + + for(;;) + { + --s; + i = rsadd_ptr_elements( i, -1 ); + ind = i; + k = ( s << 1 ) + 1; + + for(;;) + { + v = rsadd_ptr_elements( left, k ); + n = rsadd_ptr_elements( v, 1 ); + + if ( ( ( n <= right ) ) && ( is_before( v, n ) ) ) + { + ++k; + v = n; + } + + if ( is_before( ind, v ) ) + { + swapper( ind, v, element_size ); + ind = v; + k = ( k << 1 ) + 1; + + if ( k < length ) + continue; + } + + // if s is non-zero, we are still building the heap! 
+ if ( s ) + break; + + swapper( left, right, element_size ); + right = rsadd_ptr_elements( right, -1 ); + ind = left; + k = 1; + --length; + + if ( length <= 1 ) + return; + } + } + } +} + +//=================================================================================================== +// median routines + +#define rsswapsmaller( X, Y ) { RS_MAX_SIMPLE_BUF tmp; int cond; cond = is_before( &Y, &X); mover( &tmp, &X, element_size ); if ( cond ) mover( &X, &Y, element_size ); if ( cond ) mover( &Y, &tmp, element_size ); } + +static RSFORCEINLINE void radsortgetmedian5( void * output, void * left, void * right, size_t length, size_t element_size, is_before_func * is_before, swap_func * swapper, move_func * mover ) +{ + RS_MAX_SIMPLE_BUF mb0,mb1,mb2,mb3,mb4; + + mover( &mb0, left, element_size ); + mover( &mb1, rsadd_ptr_elements( left, length >> 2 ), element_size ); + mover( &mb2, rsadd_ptr_elements( left, length >> 1 ), element_size ); + mover( &mb3, rsadd_ptr_elements( left, length - (length >> 2) ), element_size ); + mover( &mb4, right, element_size ); + + // Basically, for simple compares, and for simple in-register types, this funcion + // must turn info 7 compares and then 5-7 movs, and 12 cmovs. Any + // compiler *should* do this - if this doesn't happen, then the compiler is + // hosing you. You can put int 3s at the start and end of this function to check. + + rsswapsmaller( mb0, mb1 ); + rsswapsmaller( mb2, mb3 ); + rsswapsmaller( mb0, mb2 ); + rsswapsmaller( mb1, mb3 ); + rsswapsmaller( mb1, mb4 ); + rsswapsmaller( mb1, mb2 ); + + mover( output, &mb2, element_size ); + if ( is_before( &mb4, &mb2 ) ) mover( output, &mb4, element_size ); +} + + +static RSFORCEINLINE void radsortgetmedian9( void * output, void * left, void * right, size_t length, size_t element_size, is_before_func * is_before, swap_func * swapper, move_func * mover ) +{ + RS_MAX_SIMPLE_BUF mb0,mb1,mb2,mb3,mb4,mb5,mb6,mb7,mb8; // todo, temp mem! 
+ + #ifdef RS_PREFETCH + RS_PREFETCH( left ); + RS_PREFETCH( right ); + RS_PREFETCH( rsadd_ptr_elements( left, length >> 3 ) ); + RS_PREFETCH( rsadd_ptr_elements( left, length >> 2 ) ); + RS_PREFETCH( rsadd_ptr_elements( left, (length >> 1) - (length >> 3) ) ); + RS_PREFETCH( rsadd_ptr_elements( left, length >> 1 ) ); + RS_PREFETCH( rsadd_ptr_elements( left, (length >> 1) + (0 >> 3) ) ); + RS_PREFETCH( rsadd_ptr_elements( left, length - (length >> 2) ) ); + RS_PREFETCH( rsadd_ptr_elements( left, length - (length >> 3) ) ); + #endif + + mover( &mb0, left, element_size ); + mover( &mb1, rsadd_ptr_elements( left, length >> 3 ), element_size ); + mover( &mb2, rsadd_ptr_elements( left, length >> 2 ), element_size ); + mover( &mb3, rsadd_ptr_elements( left, (length >> 1) - (length >> 3) ), element_size ); + mover( &mb4, rsadd_ptr_elements( left, length >> 1 ), element_size ); + mover( &mb5, rsadd_ptr_elements( left, (length >> 1) + (length >> 3) ), element_size ); + mover( &mb6, rsadd_ptr_elements( left, length - (length >> 2) ), element_size ); + mover( &mb7, rsadd_ptr_elements( left, length - (length >> 3) ), element_size ); + mover( &mb8, right, element_size ); + + // Basically, for simple compares, and for simple in-register types, this funcion + // should turn info 19 compares and then 15-19 movs, and 36 cmovs. However, + // most compilers can only so-so job at this, and you'll end up with 3-4 jumps. + // We just need cmov intrinsics. 
+ + rsswapsmaller( mb0, mb7 ); + rsswapsmaller( mb1, mb2 ); + rsswapsmaller( mb3, mb5 ); + rsswapsmaller( mb4, mb8 ); + rsswapsmaller( mb0, mb2 ); + rsswapsmaller( mb1, mb5 ); + rsswapsmaller( mb3, mb8 ); + rsswapsmaller( mb4, mb7 ); + rsswapsmaller( mb0, mb3 ); + rsswapsmaller( mb1, mb4 ); + rsswapsmaller( mb2, mb8 ); + rsswapsmaller( mb5, mb7 ); + rsswapsmaller( mb3, mb4 ); + rsswapsmaller( mb5, mb6 ); + rsswapsmaller( mb2, mb5 ); + rsswapsmaller( mb4, mb6 ); + rsswapsmaller( mb2, mb3 ); + rsswapsmaller( mb4, mb5 ); + + mover( output, &mb3, element_size ); + if ( is_before( &mb4, &mb3 ) ) mover( output, &mb4, element_size ); +} + +#define RSS_USE_MEDIAN_9 1024 + +static RSFORCEINLINE void radsortgetmedian( void * output, void * left, void * right, size_t length, size_t element_size, is_before_func * is_before, swap_func * swapper, move_func * mover ) +{ + // get the median into copy + if ( length >= RSS_USE_MEDIAN_9 ) + radsortgetmedian9( output, left, right, length, element_size, is_before, swapper, mover ); + else + radsortgetmedian5( output, left, right, length, element_size, is_before, swapper, mover ); +} + + + +//=================================================================================================== +// bubble 2 routines - for partitions <= 16 count + +// from Gerben Stavenga - bubble sort moving two values through at once +// for ints, this compiles down to 38 instructions +#define RSS_FLIP_TO_SMALL_SORT_BUBBLE2 16 +static RSFORCEINLINE void radbubble2sort( void * left, size_t n, size_t element_size, is_before_func * is_before, move_func * mover, void * tmp ) +{ + void * i; // todo - test with bigger blocks + void * s = rsadd_ptr_elements( left, 2 ); + RS_MAX_BUBBLE_BUF x, y, z; + + #define rsbubbleswap( X, Y ) { int cond; cond = is_before( &Y, &X); mover( tmp, &X, element_size ); if ( cond ) mover( &X, &Y, element_size ); if ( cond ) mover( &Y, tmp, element_size ); } + + for ( i = rsadd_ptr_elements( left, (int)n - 1 ) ; i > left ; i = 
rsadd_ptr_elements( i, -2 ) ) + { + void * j, * jm2; + + // load x & y + mover( &x, left, element_size ); + mover( &y, rsadd_ptr_elements( left, 1 ), element_size ); + + // swap x & y, so that x is smaller than y + rsbubbleswap( x, y ); + + // for ints, this loop needs to be 4 cmps, 6 cmovs, and 5 movs + // anything else will kill performance + + jm2 = left; + for ( j = s ; j <= i ; j = rsadd_ptr_elements( j, 1 ) ) + { + // make z smaller than x and y, and the dump it to the left + mover( &z, j, element_size ); + rsbubbleswap( z, x ); + rsbubbleswap( z, y ); + rsbubbleswap( x, y ); + mover( jm2, &z, element_size ); + jm2 = rsadd_ptr_elements( jm2, 1 ); + } + + mover( rsadd_ptr_elements( i, -1 ), &x, element_size ); + mover( i, &y, element_size ); + } +} + +#define RSS_FLIP_TO_SMALL_SORT_INSERTION 28 +static RSFORCEINLINE void radinsertionsort(void * start, size_t len, size_t element_size, is_before_func * is_before, move_func * mover, void * tmp ) +{ + void * cur; + void * prev; + + cur = rsadd_ptr_elements( start, 1 ); + --len; + prev = start; + do + { + void * comp = cur; + if ( is_before( comp, prev ) ) + { + mover( tmp, comp, element_size ); + do + { + mover( comp, prev, element_size ); + comp = rsadd_ptr_elements( comp, -1 ); + if ( comp == start ) + break; + prev = rsadd_ptr_elements( prev, -1 ); + } while ( is_before( tmp, prev ) ); + mover( comp, tmp, element_size ); + } + prev = cur; + cur = rsadd_ptr_elements( cur, 1 ); + } while( --len ); +} + +/* +todo +static void * rs_start; +static is_before_func * rs_ib; +static size_t rs_es; + +static RSFORCEINLINE int rss_byte_is_before_func( void * elementa, void * elementb ) +{ + unsigned char a = *(unsigned char*)elementa; + unsigned char b = *(unsigned char*)elementb; + size_t element_size = rs_es; + return rs_ib( rsadd_ptr_elements( rs_start, a ), rsadd_ptr_elements( rs_start, b ) ); +} + + +// do bubble sort of offsets, and THEN do all the swaps - faster on biy structures +static RSFORCEINLINE void 
radsortbubble2offsets( void * left, size_t n, size_t element_size, is_before_func * is_before, swap_func * swapper, move_func * mover ) +{ + static unsigned char init[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; + unsigned char offsets[16]; + unsigned char swap[16]; + + radsortmover16( offsets, init, 16 ); + radsortmover16( swap, init, 16 ); + rs_start = left; +rs_ib = is_before; +rs_es = element_size; + + // sort the byte offsets + radsortbubble2( offsets, n, 1, rss_byte_is_before_func, radsortmover1 ); + + // now reorder the data + { + unsigned char i; + void * ip = left; + + for( i = 0 ; i < (unsigned char)n ; i++ ) + { + unsigned char j = swap[ offsets[ i ] ]; + if ( i != j ) + { + swapper( ip, rsadd_ptr_elements( left, j ), element_size ); + swap[ j ] = swap[ i ]; + } + ip = rsadd_ptr_elements( ip, 1 ); + } + } +} +*/ +//=================================================================================================== + +#undef rsswapsmaller + +#define RSS_MAX_RECURSE 128 + +RSFORCEINLINE void radsortinternal( void * start, size_t len, size_t element_size, is_before_func * is_before, swap_func * swapper, move_func * mover, rs_small_sort_func * small_sort, size_t small_sort_thres, void * tmp ) +{ + void * left; + size_t length; + + if ( len <= 1 ) + return; + + #if _DEBUG + if ( element_size > sizeof( RS_MAX_SIMPLE_BUF ) ) + __debugbreak(); + #endif + + // stack for no recursion + typedef struct stks + { + void * left; + size_t len; + } stks; + + stks stk[ RSS_MAX_RECURSE ]; + stks * stk_ptr = stk + RSS_MAX_RECURSE; + + // we use the stk_ptr to tell when to flip to heap. 
+ // when we hit the end of the stack, we heap it, so + // back the start of the stack to log1.5 of len + length = len; + do { + --stk_ptr; + if ( stk_ptr == stk ) { stk_ptr = stk+1; break; } + length = ( length >> 1 ) + ( length >> 2 ); + } while ( length ); + stk_ptr[ -1 ].len = 0; + + left = start; + length = len; + + do + { + for(;;) + { + // if tiny, hand with insertion + if ( length <= small_sort_thres ) + { + CompilerReset(left); // we reset the compiler before each major sort + small_sort( left, length, element_size, is_before, mover, tmp ); + break; + } + else + { + // if we have hit end of our recursion stack, flip to using a heap (this prevents N^2 behavior) + if ( stk_ptr >= stk + RSS_MAX_RECURSE ) + { + CompilerReset(left); // we reset the compiler before each major sort + //printf("heap: %d\n",(int)length); + radheapsortinteral( left, length, element_size, is_before, swapper ); + break; + } + else + { + // partition + void * rightequalpiv; + size_t leftlen; + void * scan, * piv, * rend, * right; + + CompilerReset(left); // we reset the compiler before each major sort + + right = rsadd_ptr_elements( left, length - 1 ); + + // check for and correct inverted blocks + scan = left; + rend = right; + while ( is_before( rend, scan ) ) + { + swapper( rend, scan, element_size ); + scan = rsadd_ptr_elements( scan, 1 ); + rend = rsadd_ptr_elements( rend, -1 ); + if ( scan >= rend ) break; + } + + // scan to see if the block is in order (or all the same) + scan = left; + do + { + void * next = rsadd_ptr( scan, element_size ); + if ( is_before( next, scan ) ) + goto doqsort; + scan = next; + } while ( scan < right ); + // if we get out of the loop cleanly, this block is already sorted, so just fall out and do next block + break; + + doqsort: + + // get the median into copy + radsortgetmedian( tmp, left, right, length, element_size, is_before, swapper, mover ); + + // if scan != left, then we have a few in order, so we can skip them all if the final is under the 
copy + if ( !is_before( scan, tmp ) ) + scan = left; + // this loop should be 3 instructions + // skip values below the pivot at the start of the segment + while( is_before( scan, tmp ) ) // the pivot will stop this loop + scan = rsadd_ptr( scan, element_size ); + + // skip values above and equal to the pivot at the end of the segment + rend = right; + if ( left == start ) + { + // we have to use this loop to check that we don't read off the front of + // the array this loop should be 5 instructions + while( rend > scan ) + { + if ( !is_before( tmp, rend ) ) + break; + rend = rsadd_ptr_elements( rend, -1 ); + } + } + else + { + // if we're not at the very start of the entire buffer, then we + // can use this loop, which is only 3 instructions + while( is_before( tmp, rend ) ) // the pivot will stop this loop + rend = rsadd_ptr_elements( rend, -1 ); + } + + // finally, do actual partitioning nanosort style - 65-70% of the + // total time will be in this loop, for ints, this should be + // 4 movs, 2 cmps, 1 cmov, 2 add, 1 jmp - 10 instructions + // compilers getting this wrong is a 50-100% slowdown! You can + // check the output by putting int 3s around this loop. + CompilerReset(scan); + piv = scan; + while( scan <= rend ) + { + size_t adv = is_before( scan, tmp ); + swapper( piv, scan, element_size ); + if ( adv ) piv = rsadd_ptr( piv, element_size ); // needs to be a cmov + scan = rsadd_ptr( scan, element_size ); + } + + // now move the right side to skip over all of the equal values... 
+ // this loop should be 5 instructions + rightequalpiv = piv; + while ( rightequalpiv < right ) + { + if ( is_before( tmp, rightequalpiv ) ) + break; + rightequalpiv = rsadd_ptr_elements( rightequalpiv, 1 ); + } + + // ok, now get the size of each half and prepare to descend + leftlen = rsdiff_ptr_elements( piv, left ); + length -= rsdiff_ptr_elements( rightequalpiv, left ); + + // put the smaller segment on the stack + if ( length < leftlen ) + { + // put small right on stack + stk_ptr->left = rightequalpiv; + stk_ptr->len = length; + stk_ptr += ( length > 1 ); + length = leftlen; + } + else + { + // put small left on stack + stk_ptr->left = left; + stk_ptr->len = leftlen; + stk_ptr += ( leftlen > 1 ); + left = rightequalpiv; + if ( length <= 1 ) break; + } + } + } + } + --stk_ptr; + left = stk_ptr->left; + length = stk_ptr->len; + } while ( length ); +} + +#undef rsadd_ptr +#undef rsadd_ptr_elements +#undef rsdiff_ptr_elements diff --git a/src/linker/third_party_ext/xxHash/LICENSE b/src/linker/third_party_ext/xxHash/LICENSE new file mode 100644 index 00000000..e4c5da72 --- /dev/null +++ b/src/linker/third_party_ext/xxHash/LICENSE @@ -0,0 +1,26 @@ +xxHash Library +Copyright (c) 2012-2021 Yann Collet +All rights reserved. + +BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/linker/third_party_ext/xxHash/README.md b/src/linker/third_party_ext/xxHash/README.md new file mode 100644 index 00000000..eea13e27 --- /dev/null +++ b/src/linker/third_party_ext/xxHash/README.md @@ -0,0 +1,253 @@ + +xxHash - Extremely fast hash algorithm +====================================== + +xxHash is an Extremely fast Hash algorithm, processing at RAM speed limits. +Code is highly portable, and produces hashes identical across all platforms (little / big endian). +The library includes the following algorithms : +- XXH32 : generates 32-bit hashes, using 32-bit arithmetic +- XXH64 : generates 64-bit hashes, using 64-bit arithmetic +- XXH3 (since `v0.8.0`): generates 64 or 128-bit hashes, using vectorized arithmetic. + The 128-bit variant is called XXH128. + +All variants successfully complete the [SMHasher](https://code.google.com/p/smhasher/wiki/SMHasher) test suite +which evaluates the quality of hash functions (collision, dispersion and randomness). +Additional tests, which evaluate more thoroughly speed and collision properties of 64-bit hashes, [are also provided](https://github.com/Cyan4973/xxHash/tree/dev/tests). 
+ +|Branch |Status | +|------------|---------| +|release | [![Build Status](https://github.com/Cyan4973/xxHash/actions/workflows/ci.yml/badge.svg?branch=release)](https://github.com/Cyan4973/xxHash/actions?query=branch%3Arelease+) | +|dev | [![Build Status](https://github.com/Cyan4973/xxHash/actions/workflows/ci.yml/badge.svg?branch=dev)](https://github.com/Cyan4973/xxHash/actions?query=branch%3Adev+) | + + +Benchmarks +------------------------- + +The benchmarked reference system uses an Intel i7-9700K cpu, and runs Ubuntu x64 20.04. +The [open source benchmark program] is compiled with `clang` v10.0 using `-O3` flag. + +| Hash Name | Width | Bandwidth (GB/s) | Small Data Velocity | Quality | Comment | +| --------- | ----- | ---------------- | ----- | --- | --- | +| __XXH3__ (SSE2) | 64 | 31.5 GB/s | 133.1 | 10 +| __XXH128__ (SSE2) | 128 | 29.6 GB/s | 118.1 | 10 +| _RAM sequential read_ | N/A | 28.0 GB/s | N/A | N/A | _for reference_ +| City64 | 64 | 22.0 GB/s | 76.6 | 10 +| T1ha2 | 64 | 22.0 GB/s | 99.0 | 9 | Slightly worse [collisions] +| City128 | 128 | 21.7 GB/s | 57.7 | 10 +| __XXH64__ | 64 | 19.4 GB/s | 71.0 | 10 +| SpookyHash | 64 | 19.3 GB/s | 53.2 | 10 +| Mum | 64 | 18.0 GB/s | 67.0 | 9 | Slightly worse [collisions] +| __XXH32__ | 32 | 9.7 GB/s | 71.9 | 10 +| City32 | 32 | 9.1 GB/s | 66.0 | 10 +| Murmur3 | 32 | 3.9 GB/s | 56.1 | 10 +| SipHash | 64 | 3.0 GB/s | 43.2 | 10 +| FNV64 | 64 | 1.2 GB/s | 62.7 | 5 | Poor avalanche properties +| Blake2 | 256 | 1.1 GB/s | 5.1 | 10 | Cryptographic +| SHA1 | 160 | 0.8 GB/s | 5.6 | 10 | Cryptographic but broken +| MD5 | 128 | 0.6 GB/s | 7.8 | 10 | Cryptographic but broken + +[open source benchmark program]: https://github.com/Cyan4973/xxHash/tree/release/tests/bench +[collisions]: https://github.com/Cyan4973/xxHash/wiki/Collision-ratio-comparison#collision-study + +note 1: Small data velocity is a _rough_ evaluation of algorithm's efficiency on small data. For more detailed analysis, please refer to next paragraph. 
+ +note 2: some algorithms feature _faster than RAM_ speed. In which case, they can only reach their full speed potential when input is already in CPU cache (L3 or better). Otherwise, they max out on RAM speed limit. + +### Small data + +Performance on large data is only one part of the picture. +Hashing is also very useful in constructions like hash tables and bloom filters. +In these use cases, it's frequent to hash a lot of small data (starting at a few bytes). +Algorithm's performance can be very different for such scenarios, since parts of the algorithm, +such as initialization or finalization, become fixed cost. +The impact of branch mis-prediction also becomes much more present. + +XXH3 has been designed for excellent performance on both long and small inputs, +which can be observed in the following graph: + +![XXH3, latency, random size](https://user-images.githubusercontent.com/750081/61976089-aedeab00-af9f-11e9-9239-e5375d6c080f.png) + +For a more detailed analysis, please visit the wiki : +https://github.com/Cyan4973/xxHash/wiki/Performance-comparison#benchmarks-concentrating-on-small-data- + +Quality +------------------------- + +Speed is not the only property that matters. +Produced hash values must respect excellent dispersion and randomness properties, +so that any sub-section of it can be used to maximally spread out a table or index, +as well as reduce the amount of collisions to the minimal theoretical level, following the [birthday paradox]. + +`xxHash` has been tested with Austin Appleby's excellent SMHasher test suite, +and passes all tests, ensuring reasonable quality levels. +It also passes extended tests from [newer forks of SMHasher], featuring additional scenarios and conditions. + +Finally, xxHash provides its own [massive collision tester](https://github.com/Cyan4973/xxHash/tree/dev/tests/collisions), +able to generate and compare billions of hashes to test the limits of 64-bit hash algorithms. 
+On this front too, xxHash features good results, in line with the [birthday paradox]. +A more detailed analysis is documented [in the wiki](https://github.com/Cyan4973/xxHash/wiki/Collision-ratio-comparison). + +[birthday paradox]: https://en.wikipedia.org/wiki/Birthday_problem +[newer forks of SMHasher]: https://github.com/rurban/smhasher + + +### Build modifiers + +The following macros can be set at compilation time to modify libxxhash's behavior. They are generally disabled by default. + +- `XXH_INLINE_ALL`: Make all functions `inline`, with implementations being directly included within `xxhash.h`. + Inlining functions is beneficial for speed on small keys. + It's _extremely effective_ when key length is expressed as _a compile time constant_, + with performance improvements observed in the +200% range . + See [this article](https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html) for details. +- `XXH_PRIVATE_API`: same outcome as `XXH_INLINE_ALL`. Still available for legacy support. + The name underlines that `XXH_*` symbol names will not be exported. +- `XXH_NAMESPACE`: Prefixes all symbols with the value of `XXH_NAMESPACE`. + This macro can only use compilable character set. + Useful to evade symbol naming collisions, + in case of multiple inclusions of xxHash's source code. + Client applications still use the regular function names, + as symbols are automatically translated through `xxhash.h`. +- `XXH_FORCE_ALIGN_CHECK`: Use a faster direct read path when input is aligned. + This option can result in dramatic performance improvement when input to hash is aligned on 32 or 64-bit boundaries, + when running on architectures unable to load memory from unaligned addresses, or suffering a performance penalty from it. + It is (slightly) detrimental on platform with good unaligned memory access performance (same instruction for both aligned and unaligned accesses). 
+ This option is automatically disabled on `x86`, `x64` and `aarch64`, and enabled on all other platforms. +- `XXH_FORCE_MEMORY_ACCESS`: The default method `0` uses a portable `memcpy()` notation. + Method `1` uses a gcc-specific `packed` attribute, which can provide better performance for some targets. + Method `2` forces unaligned reads, which is not standard compliant, but might sometimes be the only way to extract better read performance. + Method `3` uses a byteshift operation, which is best for old compilers which don't inline `memcpy()` or big-endian systems without a byteswap instruction. +- `XXH_VECTOR` : manually select a vector instruction set (default: auto-selected at compilation time). Available instruction sets are `XXH_SCALAR`, `XXH_SSE2`, `XXH_AVX2`, `XXH_AVX512`, `XXH_NEON` and `XXH_VSX`. Compiler may require additional flags to ensure proper support (for example, `gcc` on linux will require `-mavx2` for `AVX2`, and `-mavx512f` for `AVX512`). +- `XXH_NO_PREFETCH` : disable prefetching. Some platforms or situations may perform better without prefetching. XXH3 only. +- `XXH_PREFETCH_DIST` : select prefetching distance. For close-to-metal adaptation to specific hardware platforms. XXH3 only. +- `XXH_NO_STREAM`: Disables the streaming API, limiting it to single shot variants only. +- `XXH_SIZE_OPT`: `0`: default, optimize for speed + `1`: default for `-Os` and `-Oz`: disables some speed hacks for size optimization + `2`: makes code as small as possible, performance may cry +- `XXH_NO_INLINE_HINTS`: By default, xxHash uses `__attribute__((always_inline))` and `__forceinline` to improve performance at the cost of code size. + Defining this macro to 1 will mark all internal functions as `static`, allowing the compiler to decide whether to inline a function or not. + This is very useful when optimizing for smallest binary size, + and is automatically defined when compiling with `-O0`, `-Os`, `-Oz`, or `-fno-inline` on GCC and Clang. 
+ This may also increase performance depending on compiler and architecture. +- `XXH32_ENDJMP`: Switch multi-branch finalization stage of XXH32 by a single jump. + This is generally undesirable for performance, especially when hashing inputs of random sizes. + But depending on exact architecture and compiler, a jump might provide slightly better performance on small inputs. Disabled by default. +- `XXH_NO_STDLIB`: Disable invocation of `<stdlib.h>` functions, notably `malloc()` and `free()`. + `libxxhash`'s `XXH*_createState()` will always fail and return `NULL`. + But one-shot hashing (like `XXH32()`) or streaming using statically allocated states + still work as expected. + This build flag is useful for embedded environments without dynamic allocation. +- `XXH_STATIC_LINKING_ONLY`: gives access to internal state declaration, required for static allocation. + Incompatible with dynamic linking, due to risks of ABI changes. +- `XXH_NO_XXH3` : removes symbols related to `XXH3` (both 64 & 128 bits) from generated binary. + Useful to reduce binary size, notably for applications which do not employ `XXH3`. +- `XXH_NO_LONG_LONG`: removes compilation of algorithms relying on 64-bit types (`XXH3` and `XXH64`). Only `XXH32` will be compiled. + Useful for targets (architectures and compilers) without 64-bit support. +- `XXH_IMPORT`: MSVC specific: should only be defined for dynamic linking, as it prevents linkage errors. +- `XXH_CPU_LITTLE_ENDIAN`: By default, endianness is determined by a runtime test resolved at compile time. + If, for some reason, the compiler cannot simplify the runtime test, it can cost performance. + It's possible to skip auto-detection and simply state that the architecture is little-endian by setting this macro to 1. + Setting it to 0 states big-endian. +- `XXH_DEBUGLEVEL` : When set to any value >= 1, enables `assert()` statements. + This (slightly) slows down execution, but may help finding bugs during debugging sessions.
+ +When compiling the Command Line Interface `xxhsum` using `make`, the following environment variables can also be set : +- `DISPATCH=1` : use `xxh_x86dispatch.c`, to automatically select between `scalar`, `sse2`, `avx2` or `avx512` instruction set at runtime, depending on local host. This option is only valid for `x86`/`x64` systems. +- `XXH_1ST_SPEED_TARGET` : select an initial speed target, expressed in MB/s, for the first speed test in benchmark mode. Benchmark will adjust the target at subsequent iterations, but the first test is made "blindly" by targeting this speed. Currently conservatively set to 10 MB/s, to support very slow (emulated) platforms. +- `NODE_JS=1` : When compiling `xxhsum` for Node.js with Emscripten, this links the `NODERAWFS` library for unrestricted filesystem access and patches `isatty` to make the command line utility correctly detect the terminal. This does make the binary specific to Node.js. + +### Building xxHash - Using vcpkg + +You can download and install xxHash using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager: + + git clone https://github.com/Microsoft/vcpkg.git + cd vcpkg + ./bootstrap-vcpkg.sh + ./vcpkg integrate install + ./vcpkg install xxhash + +The xxHash port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository. + +### Example + +The simplest example calls xxhash 64-bit variant as a one-shot function +generating a hash value from a single buffer, and invoked from a C/C++ program: + +```C +#include "xxhash.h" + + (...) 
+ XXH64_hash_t hash = XXH64(buffer, size, seed); +} +``` + +Streaming variant is more involved, but makes it possible to provide data incrementally: + +```C +#include "stdlib.h" /* abort() */ +#include "xxhash.h" + + +XXH64_hash_t calcul_hash_streaming(FileHandler fh) +{ + /* create a hash state */ + XXH64_state_t* const state = XXH64_createState(); + if (state==NULL) abort(); + + size_t const bufferSize = SOME_SIZE; + void* const buffer = malloc(bufferSize); + if (buffer==NULL) abort(); + + /* Initialize state with selected seed */ + XXH64_hash_t const seed = 0; /* or any other value */ + if (XXH64_reset(state, seed) == XXH_ERROR) abort(); + + /* Feed the state with input data, any size, any number of times */ + (...) + while ( /* some data left */ ) { + size_t const length = get_more_data(buffer, bufferSize, fh); + if (XXH64_update(state, buffer, length) == XXH_ERROR) abort(); + (...) + } + (...) + + /* Produce the final hash value */ + XXH64_hash_t const hash = XXH64_digest(state); + + /* State could be re-used; but in this example, it is simply freed */ + free(buffer); + XXH64_freeState(state); + + return hash; +} +``` + + +### License + +The library files `xxhash.c` and `xxhash.h` are BSD licensed. +The utility `xxhsum` is GPL licensed. + + +### Other programming languages + +Beyond the C reference version, +xxHash is also available from many different programming languages, +thanks to great contributors. +They are [listed here](http://www.xxhash.com/#other-languages). + + +### Packaging status + +Many distributions bundle a package manager +which allows easy xxhash installation as both a `libxxhash` library +and `xxhsum` command line interface. 
+ +[![Packaging status](https://repology.org/badge/vertical-allrepos/xxhash.svg)](https://repology.org/project/xxhash/versions) + + +### Special Thanks + +- Takayuki Matsuoka, aka @t-mat, for creating `xxhsum -c` and great support during early xxh releases +- Mathias Westerdahl, aka @JCash, for introducing the first version of `XXH64` +- Devin Hussey, aka @easyaspi314, for incredible low-level optimizations on `XXH3` and `XXH128` diff --git a/src/linker/third_party_ext/xxHash/SECURITY.md b/src/linker/third_party_ext/xxHash/SECURITY.md new file mode 100644 index 00000000..2a8b4c8e --- /dev/null +++ b/src/linker/third_party_ext/xxHash/SECURITY.md @@ -0,0 +1,13 @@ +# Security Policy + +## Supported Versions + +Security updates are applied only to the latest release. + +## Reporting a Vulnerability + +If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. + +Please disclose it at [security advisory](https://github.com/Cyan4973/xxHash/security/advisories/new). + +This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure. diff --git a/src/linker/third_party_ext/xxHash/doc/README.md b/src/linker/third_party_ext/xxHash/doc/README.md new file mode 100644 index 00000000..a73ad729 --- /dev/null +++ b/src/linker/third_party_ext/xxHash/doc/README.md @@ -0,0 +1,9 @@ +xxHash Specification +======================= + +This directory contains material defining the xxHash algorithm. +It's described in [this specification document](xxhash_spec.md). 
+ +The algorithm is also illustrated by a [simple educational library](https://github.com/easyaspi314/xxhash-clean), +written by @easyaspi314 and designed for readability +(as opposed to the reference library which is designed for speed). diff --git a/src/linker/third_party_ext/xxHash/doc/xxhash.cry b/src/linker/third_party_ext/xxHash/doc/xxhash.cry new file mode 100644 index 00000000..984e1c8b --- /dev/null +++ b/src/linker/third_party_ext/xxHash/doc/xxhash.cry @@ -0,0 +1,206 @@ +module xxhash where + +/** + * The 32-bit variant of xxHash. The first argument is the sequence + * of L bytes to hash. The second argument is a seed value. + */ +XXH32 : {L} (fin L) => [L][8] -> [32] -> [32] +XXH32 input seed = XXH32_avalanche acc1 + where (stripes16 # stripes4 # stripes1) = input + accR = foldl XXH32_rounds (XXH32_init seed) (split stripes16 : [L/16][16][8]) + accL = `(L % 2^^32) + if (`L:Integer) < 16 + then seed + PRIME32_5 + else XXH32_converge accR + acc4 = foldl XXH32_digest4 accL (split stripes4 : [(L%16)/4][4][8]) + acc1 = foldl XXH32_digest1 acc4 (stripes1 : [L%4][8]) + +/** + * The 64-bit variant of xxHash. The first argument is the sequence + * of L bytes to hash. The second argument is a seed value. + */ +XXH64 : {L} (fin L) => [L][8] -> [64] -> [64] +XXH64 input seed = XXH64_avalanche acc1 + where (stripes32 # stripes8 # stripes4 # stripes1) = input + accR = foldl XXH64_rounds (XXH64_init seed) (split stripes32 : [L/32][32][8]) + accL = `(L % 2^^64) + if (`L:Integer) < 32 + then seed + PRIME64_5 + else XXH64_converge accR + acc8 = foldl XXH64_digest8 accL (split stripes8 : [(L%32)/8][8][8]) + acc4 = foldl XXH64_digest4 acc8 (split stripes4 : [(L%8)/4][4][8]) + acc1 = foldl XXH64_digest1 acc4 (stripes1 : [L%4][8]) + +private + + //Utility functions + + /** + * Combines a sequence of bytes into a word using the little-endian + * convention.
+ */ + toLE bytes = join (reverse bytes) + + //32-bit xxHash helper functions + + //32-bit prime number constants + PRIME32_1 = 0x9E3779B1 : [32] + PRIME32_2 = 0x85EBCA77 : [32] + PRIME32_3 = 0xC2B2AE3D : [32] + PRIME32_4 = 0x27D4EB2F : [32] + PRIME32_5 = 0x165667B1 : [32] + + /** + * The property shows that the hexadecimal representation of the + * PRIME32 constants is the same as the binary representation. + */ + property PRIME32s_as_bits_correct = + (PRIME32_1 == 0b10011110001101110111100110110001) /\ + (PRIME32_2 == 0b10000101111010111100101001110111) /\ + (PRIME32_3 == 0b11000010101100101010111000111101) /\ + (PRIME32_4 == 0b00100111110101001110101100101111) /\ + (PRIME32_5 == 0b00010110010101100110011110110001) + + /** + * This function initializes the four internal accumulators of XXH32. + */ + XXH32_init : [32] -> [4][32] + XXH32_init seed = [acc1, acc2, acc3, acc4] + where acc1 = seed + PRIME32_1 + PRIME32_2 + acc2 = seed + PRIME32_2 + acc3 = seed + 0 + acc4 = seed - PRIME32_1 + + /** + * This processes a single lane of the main round function of XXH32. + */ + XXH32_round : [32] -> [32] -> [32] + XXH32_round accN laneN = ((accN + laneN * PRIME32_2) <<< 13) * PRIME32_1 + + /** + * This is the main round function of XXH32 and processes a stripe, + * i.e. 4 lanes with 4 bytes each. + */ + XXH32_rounds : [4][32] -> [16][8] -> [4][32] + XXH32_rounds accs stripe = + [ XXH32_round accN (toLE laneN) | accN <- accs | laneN <- split stripe ] + + /** + * This function combines the four lane accumulators into a single + * 32-bit value. 
+ */ + XXH32_converge : [4][32] -> [32] + XXH32_converge [acc1, acc2, acc3, acc4] = + (acc1 <<< 1) + (acc2 <<< 7) + (acc3 <<< 12) + (acc4 <<< 18) + + /** + * This function digests a four byte lane + */ + XXH32_digest4 : [32] -> [4][8] -> [32] + XXH32_digest4 acc lane = ((acc + toLE lane * PRIME32_3) <<< 17) * PRIME32_4 + + /** + * This function digests a single byte lane + */ + XXH32_digest1 : [32] -> [8] -> [32] + XXH32_digest1 acc lane = ((acc + (0 # lane) * PRIME32_5) <<< 11) * PRIME32_1 + + /** + * This function ensures that all input bits have a chance to impact + * any bit in the output digest, resulting in an unbiased + * distribution. + */ + XXH32_avalanche : [32] -> [32] + XXH32_avalanche acc0 = acc5 + where acc1 = acc0 ^ (acc0 >> 15) + acc2 = acc1 * PRIME32_2 + acc3 = acc2 ^ (acc2 >> 13) + acc4 = acc3 * PRIME32_3 + acc5 = acc4 ^ (acc4 >> 16) + + //64-bit xxHash helper functions + + //64-bit prime number constants + PRIME64_1 = 0x9E3779B185EBCA87 : [64] + PRIME64_2 = 0xC2B2AE3D27D4EB4F : [64] + PRIME64_3 = 0x165667B19E3779F9 : [64] + PRIME64_4 = 0x85EBCA77C2B2AE63 : [64] + PRIME64_5 = 0x27D4EB2F165667C5 : [64] + + /** + * The property shows that the hexadecimal representation of the + * PRIME64 constants is the same as the binary representation. + */ + property PRIME64s_as_bits_correct = + (PRIME64_1 == 0b1001111000110111011110011011000110000101111010111100101010000111) /\ + (PRIME64_2 == 0b1100001010110010101011100011110100100111110101001110101101001111) /\ + (PRIME64_3 == 0b0001011001010110011001111011000110011110001101110111100111111001) /\ + (PRIME64_4 == 0b1000010111101011110010100111011111000010101100101010111001100011) /\ + (PRIME64_5 == 0b0010011111010100111010110010111100010110010101100110011111000101) + + /** + * This function initializes the four internal accumulators of XXH64. 
+ */ + XXH64_init : [64] -> [4][64] + XXH64_init seed = [acc1, acc2, acc3, acc4] + where acc1 = seed + PRIME64_1 + PRIME64_2 + acc2 = seed + PRIME64_2 + acc3 = seed + 0 + acc4 = seed - PRIME64_1 + + /** + * This processes a single lane of the main round function of XXH64. + */ + XXH64_round : [64] -> [64] -> [64] + XXH64_round accN laneN = ((accN + laneN * PRIME64_2) <<< 31) * PRIME64_1 + + /** + * This is the main round function of XXH64 and processes a stripe, + * i.e. 4 lanes with 8 bytes each. + */ + XXH64_rounds : [4][64] -> [32][8] -> [4][64] + XXH64_rounds accs stripe = + [ XXH64_round accN (toLE laneN) | accN <- accs | laneN <- split stripe ] + + /** + * This is a helper function, used to merge the four lane accumulators. + */ + mergeAccumulator : [64] -> [64] -> [64] + mergeAccumulator acc accN = (acc ^ XXH64_round 0 accN) * PRIME64_1 + PRIME64_4 + + /** + * This function combines the four lane accumulators into a single + * 64-bit value. + */ + XXH64_converge : [4][64] -> [64] + XXH64_converge [acc1, acc2, acc3, acc4] = + foldl mergeAccumulator ((acc1 <<< 1) + (acc2 <<< 7) + (acc3 <<< 12) + (acc4 <<< 18)) [acc1, acc2, acc3, acc4] + + /** + * This function digests an eight byte lane + */ + XXH64_digest8 : [64] -> [8][8] -> [64] + XXH64_digest8 acc lane = ((acc ^ XXH64_round 0 (toLE lane)) <<< 27) * PRIME64_1 + PRIME64_4 + + /** + * This function digests a four byte lane + */ + XXH64_digest4 : [64] -> [4][8] -> [64] + XXH64_digest4 acc lane = ((acc ^ (0 # toLE lane) * PRIME64_1) <<< 23) * PRIME64_2 + PRIME64_3 + + /** + * This function digests a single byte lane + */ + XXH64_digest1 : [64] -> [8] -> [64] + XXH64_digest1 acc lane = ((acc ^ (0 # lane) * PRIME64_5) <<< 11) * PRIME64_1 + + /** + * This function ensures that all input bits have a chance to impact + * any bit in the output digest, resulting in an unbiased + * distribution. 
+ */ + XXH64_avalanche : [64] -> [64] + XXH64_avalanche acc0 = acc5 + where acc1 = acc0 ^ (acc0 >> 33) + acc2 = acc1 * PRIME64_2 + acc3 = acc2 ^ (acc2 >> 29) + acc4 = acc3 * PRIME64_3 + acc5 = acc4 ^ (acc4 >> 32) diff --git a/src/linker/third_party_ext/xxHash/doc/xxhash_spec.md b/src/linker/third_party_ext/xxHash/doc/xxhash_spec.md new file mode 100644 index 00000000..1a544e9b --- /dev/null +++ b/src/linker/third_party_ext/xxHash/doc/xxhash_spec.md @@ -0,0 +1,820 @@ +xxHash fast digest algorithm +====================== + +### Notices + +Copyright (c) Yann Collet + +Permission is granted to copy and distribute this document +for any purpose and without charge, +including translations into other languages +and incorporation into compilations, +provided that the copyright notice and this notice are preserved, +and that any substantive changes or deletions from the original +are clearly marked. +Distribution of this document is unlimited. + +### Version + +0.2.0 (29/06/23) + + +Table of Contents +--------------------- +- [Introduction](#introduction) +- [XXH32 algorithm description](#xxh32-algorithm-description) +- [XXH64 algorithm description](#xxh64-algorithm-description) +- [XXH3 algorithm description](#xxh3-algorithm-overview) + - [Small inputs](#xxh3-algorithm-description-for-small-inputs) + - [Medium inputs](#xxh3-algorithm-description-for-medium-inputs) + - [Large inputs](#xxh3-algorithm-description-for-large-inputs) +- [Performance considerations](#performance-considerations) +- [Reference Implementation](#reference-implementation) + + +Introduction +---------------- + +This document describes the xxHash digest algorithm for both 32-bit and 64-bit variants, named `XXH32` and `XXH64`. The algorithm takes as input a message of arbitrary length and an optional seed value, then produces a 32-bit or 64-bit output as its "fingerprint" or "digest". + +xxHash is primarily designed for speed.
It is labeled non-cryptographic, and is not meant to avoid intentional collisions (same digest for 2 different messages), or to prevent producing a message with a predefined digest. + +XXH32 is designed to be fast on 32-bit machines. +XXH64 is designed to be fast on 64-bit machines. +Both variants produce different output. +However, a given variant shall produce exactly the same output, irrespective of the cpu / os used. In particular, the result remains identical whatever the endianness and width of the cpu is. + +### Operation notations + +All operations are performed modulo {32,64} bits. Arithmetic overflows are expected. +`XXH32` uses 32-bit modular operations. +`XXH64` and `XXH3` use 64-bit modular operations. +When an operation ingests input or secret as multi-bytes values, it reads it using little-endian convention. + +- `+`: denotes modular addition +- `-`: denotes modular subtraction +- `*`: denotes modular multiplication + - **Exception:** In `XXH3`, if it is in the form `(u128)x * (u128)y`, it denotes 64-bit by 64-bit normal multiplication into a full 128-bit result. +- `X <<< s`: denotes the value obtained by circularly shifting (rotating) `X` left by `s` bit positions. +- `X >> s`: denotes the value obtained by shifting `X` right by s bit positions. Upper `s` bits become `0`. +- `X << s`: denotes the value obtained by shifting `X` left by s bit positions. Lower `s` bits become `0`. +- `X xor Y`: denotes the bit-wise XOR of `X` and `Y` (same width). +- `X | Y`: denotes the bit-wise OR of `X` and `Y` (same width). +- `~X`: denotes the bit-wise negation of `X`. + + +XXH32 Algorithm Description +------------------------------------- + +### Overview + +We begin by supposing that we have a message of any length `L` as input, and that we wish to find its digest. Here `L` is an arbitrary nonnegative integer; `L` may be zero. The following steps are performed to compute the digest of the message. 
+ +The algorithm collects and transforms input in _stripes_ of 16 bytes. The transforms are stored inside 4 "accumulators", each one storing an unsigned 32-bit value. Each accumulator can be processed independently in parallel, speeding up processing for cpu with multiple execution units. + +The algorithm uses 32-bit addition, multiplication, rotate, shift and xor operations. Many operations require some 32-bit prime number constants, all defined below: + +```c + static const u32 PRIME32_1 = 0x9E3779B1U; // 0b10011110001101110111100110110001 + static const u32 PRIME32_2 = 0x85EBCA77U; // 0b10000101111010111100101001110111 + static const u32 PRIME32_3 = 0xC2B2AE3DU; // 0b11000010101100101010111000111101 + static const u32 PRIME32_4 = 0x27D4EB2FU; // 0b00100111110101001110101100101111 + static const u32 PRIME32_5 = 0x165667B1U; // 0b00010110010101100110011110110001 +``` + +These constants are prime numbers, and feature a good mix of bits 1 and 0, neither too regular, nor too dissymmetric. These properties help dispersion capabilities. + +### Step 1. Initialize internal accumulators + +Each accumulator gets an initial value based on optional `seed` input. Since the `seed` is optional, it can be `0`. + +```c + u32 acc1 = seed + PRIME32_1 + PRIME32_2; + u32 acc2 = seed + PRIME32_2; + u32 acc3 = seed + 0; + u32 acc4 = seed - PRIME32_1; +``` + +#### Special case: input is less than 16 bytes + +When the input is too small (< 16 bytes), the algorithm will not process any stripes. Consequently, it will not make use of parallel accumulators. + +In this case, a simplified initialization is performed, using a single accumulator: + +```c + u32 acc = seed + PRIME32_5; +``` + +The algorithm then proceeds directly to step 4. + +### Step 2. Process stripes + +A stripe is a contiguous segment of 16 bytes. +It is evenly divided into 4 _lanes_, of 4 bytes each. +The first lane is used to update accumulator 1, the second lane is used to update accumulator 2, and so on.
+ +Each lane reads its associated 32-bit value using __little-endian__ convention. + +For each {lane, accumulator}, the update process is called a _round_, and applies the following formula: + +```c + accN = accN + (laneN * PRIME32_2); + accN = accN <<< 13; + accN = accN * PRIME32_1; +``` + +This shuffles the bits so that any bit from input _lane_ impacts several bits in output _accumulator_. All operations are performed modulo 2^32. + +Input is consumed one full stripe at a time. Step 2 is looped as many times as necessary to consume the whole input, except for the last remaining bytes which cannot form a stripe (< 16 bytes). +When that happens, move to step 3. + +### Step 3. Accumulator convergence + +All 4 lane accumulators from the previous steps are merged to produce a single remaining accumulator of the same width (32-bit). The associated formula is as follows: + +```c + acc = (acc1 <<< 1) + (acc2 <<< 7) + (acc3 <<< 12) + (acc4 <<< 18); +``` + +### Step 4. Add input length + +The input total length is presumed known at this stage. This step is just about adding the length to the accumulator, so that it participates in the final mixing. + +```c + acc = acc + (u32)inputLength; +``` + +Note that, if input length is so large that it requires more than 32 bits, only the lower 32 bits are added to the accumulator. + +### Step 5. Consume remaining input + +There may be up to 15 bytes remaining to consume from the input. +The final stage will digest them according to the following pseudo-code: + +```c + while (remainingLength >= 4) { + lane = read_32bit_little_endian(input_ptr); + acc = acc + lane * PRIME32_3; + acc = (acc <<< 17) * PRIME32_4; + input_ptr += 4; remainingLength -= 4; + } + + while (remainingLength >= 1) { + lane = read_byte(input_ptr); + acc = acc + lane * PRIME32_5; + acc = (acc <<< 11) * PRIME32_1; + input_ptr += 1; remainingLength -= 1; + } +``` + +This process ensures that all input bytes are present in the final mix. + +### Step 6.
Final mix (avalanche) + +The final mix ensures that all input bits have a chance to impact any bit in the output digest, resulting in an unbiased distribution. This is also called avalanche effect. + +```c + acc = acc xor (acc >> 15); + acc = acc * PRIME32_2; + acc = acc xor (acc >> 13); + acc = acc * PRIME32_3; + acc = acc xor (acc >> 16); +``` + +### Step 7. Output + +The `XXH32()` function produces an unsigned 32-bit value as output. + +For systems which require to store and/or display the result in binary or hexadecimal format, the canonical format is defined to reproduce the same value as the natural decimal format, hence follows __big-endian__ convention (most significant byte first). + + +XXH64 Algorithm Description +------------------------------------- + +### Overview + +`XXH64`'s algorithm structure is very similar to `XXH32` one. The major difference is that `XXH64` uses 64-bit arithmetic, speeding up memory transfer for 64-bit compliant systems, but also relying on cpu capability to efficiently perform 64-bit operations. + +The algorithm collects and transforms input in _stripes_ of 32 bytes. The transforms are stored inside 4 "accumulators", each one storing an unsigned 64-bit value. Each accumulator can be processed independently in parallel, speeding up processing for cpu with multiple execution units. + +The algorithm uses 64-bit addition, multiplication, rotate, shift and xor operations. 
Many operations require some 64-bit prime number constants, all defined below: + +```c + static const u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 + static const u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 + static const u64 PRIME64_3 = 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 + static const u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 + static const u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 +``` + +These constants are prime numbers, and feature a good mix of bits 1 and 0, neither too regular, nor too dissymmetric. These properties help dispersion capabilities. + +### Step 1. Initialize internal accumulators + +Each accumulator gets an initial value based on optional `seed` input. Since the `seed` is optional, it can be `0`. + +```c + u64 acc1 = seed + PRIME64_1 + PRIME64_2; + u64 acc2 = seed + PRIME64_2; + u64 acc3 = seed + 0; + u64 acc4 = seed - PRIME64_1; +``` + +#### Special case: input is less than 32 bytes + +When the input is too small (< 32 bytes), the algorithm will not process any stripes. Consequently, it will not make use of parallel accumulators. + +In this case, a simplified initialization is performed, using a single accumulator: + +```c + u64 acc = seed + PRIME64_5; +``` + +The algorithm then proceeds directly to step 4. + +### Step 2. Process stripes + +A stripe is a contiguous segment of 32 bytes. +It is evenly divided into 4 _lanes_, of 8 bytes each. +The first lane is used to update accumulator 1, the second lane is used to update accumulator 2, and so on. + +Each lane reads its associated 64-bit value using __little-endian__ convention.
+ +For each {lane, accumulator}, the update process is called a _round_, and applies the following formula: + +```c +round(accN,laneN): + accN = accN + (laneN * PRIME64_2); + accN = accN <<< 31; + return accN * PRIME64_1; +``` + +This shuffles the bits so that any bit from input _lane_ impacts several bits in output _accumulator_. All operations are performed modulo 2^64. + +Input is consumed one full stripe at a time. Step 2 is looped as many times as necessary to consume the whole input, except for the last remaining bytes which cannot form a stripe (< 32 bytes). +When that happens, move to step 3. + +### Step 3. Accumulator convergence + +All 4 lane accumulators from previous steps are merged to produce a single remaining accumulator of same width (64-bit). The associated formula is as follows. + +Note that accumulator convergence is more complex than 32-bit variant, and requires to define another function called _mergeAccumulator()_: + +```c +mergeAccumulator(acc,accN): + acc = acc xor round(0, accN); + acc = acc * PRIME64_1; + return acc + PRIME64_4; +``` + +which is then used in the convergence formula: + +```c + acc = (acc1 <<< 1) + (acc2 <<< 7) + (acc3 <<< 12) + (acc4 <<< 18); + acc = mergeAccumulator(acc, acc1); + acc = mergeAccumulator(acc, acc2); + acc = mergeAccumulator(acc, acc3); + acc = mergeAccumulator(acc, acc4); +``` + +### Step 4. Add input length + +The input total length is presumed known at this stage. This step is just about adding the length to accumulator, so that it participates to final mixing. + +```c + acc = acc + inputLength; +``` + +### Step 5. Consume remaining input + +There may be up to 31 bytes remaining to consume from the input. 
+The final stage will digest them according to following pseudo-code: + +```c + while (remainingLength >= 8) { + lane = read_64bit_little_endian(input_ptr); + acc = acc xor round(0, lane); + acc = (acc <<< 27) * PRIME64_1; + acc = acc + PRIME64_4; + input_ptr += 8; remainingLength -= 8; + } + + if (remainingLength >= 4) { + lane = read_32bit_little_endian(input_ptr); + acc = acc xor (lane * PRIME64_1); + acc = (acc <<< 23) * PRIME64_2; + acc = acc + PRIME64_3; + input_ptr += 4; remainingLength -= 4; + } + + while (remainingLength >= 1) { + lane = read_byte(input_ptr); + acc = acc xor (lane * PRIME64_5); + acc = (acc <<< 11) * PRIME64_1; + input_ptr += 1; remainingLength -= 1; + } +``` + +This process ensures that all input bytes are present in the final mix. + +### Step 6. Final mix (avalanche) + +The final mix ensures that all input bits have a chance to impact any bit in the output digest, resulting in an unbiased distribution. This is also called avalanche effect. + +```c + acc = acc xor (acc >> 33); + acc = acc * PRIME64_2; + acc = acc xor (acc >> 29); + acc = acc * PRIME64_3; + acc = acc xor (acc >> 32); +``` + +### Step 7. Output + +The `XXH64()` function produces an unsigned 64-bit value as output. + +For systems which require to store and/or display the result in binary or hexadecimal format, the canonical format is defined to reproduce the same value as the natural decimal format, hence follows __big-endian__ convention (most significant byte first). + +XXH3 Algorithm Overview +------------------------------------- + +XXH3 comes in two different versions: XXH3-64 and XXH3-128 (or XXH128), producing 64 and 128 bits of output, respectively. + +XXH3 uses different algorithms for small (0-16 bytes), medium (17-240 bytes), and large (241+ bytes) inputs. The algorithms for small and medium inputs are optimized for performance. The three algorithms are described in the following sections. 
+ +Many operations require some 64-bit prime number constants, which are mostly the same constants used in XXH32 and XXH64, all defined below: + +```c + static const u64 PRIME32_1 = 0x9E3779B1U; // 0b10011110001101110111100110110001 + static const u64 PRIME32_2 = 0x85EBCA77U; // 0b10000101111010111100101001110111 + static const u64 PRIME32_3 = 0xC2B2AE3DU; // 0b11000010101100101010111000111101 + static const u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 + static const u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 + static const u64 PRIME64_3 = 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 + static const u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 + static const u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 + static const u64 PRIME_MX1 = 0x165667919E3779F9ULL; // 0b0001011001010110011001111001000110011110001101110111100111111001 + static const u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; // 0b1001111110110010000111000110010100011110100110001101111100100101 +``` + +The `XXH3_64bits()` function produces an unsigned 64-bit value. +The `XXH3_128bits()` function produces a `XXH128_hash_t` struct containing `low64` and `high64` - the lower and higher 64-bit half values of the result, respectively. + +For systems requiring storing and/or displaying the result in binary or hexadecimal format, the canonical format is defined to reproduce the same value as the natural decimal format, hence following **big-endian** convention (most significant byte first). + +### Seed and Secret + +XXH3 provides seeded hashing by introducing two configurable constants used in the hashing process: the seed and the secret. 
The seed is an unsigned 64-bit value, and the secret is an array of bytes that is at least 136 bytes in size. The default seed is 0, and the default secret is the following 192-byte value: + +```c +static const u8 defaultSecret[192] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; +``` + +The seed and the secret can be optionally specified using the `*_withSecret` and `*_withSeed` versions of the hash function. + +The seed and the secret cannot be specified simultaneously (`*_withSecretAndSeed` is actually `*_withSeed` for short and medium inputs <= 240 bytes, and `*_withSecret` for large inputs). When one is specified, the other one uses the default value. 
+There is one exception though: when input is large (> 240 bytes) and a seed is given, a secret is derived from the seed value and the default secret using the following procedure: + +```c +deriveSecret(u64 seed): + u64 derivedSecret[24] = defaultSecret[0:192]; + for (i = 0; i < 12; i++) { + derivedSecret[i*2] += seed; + derivedSecret[i*2+1] -= seed; + } + return derivedSecret; // convert to u8[192] (little-endian) +``` + +The derivation treats the secrets as 24 64-bit values. In XXH3 algorithms, the secret is always read similarly by treating a contiguous segment of the array as one or more 32-bit or 64-bit values. **The secret values are always read using little-endian convention**. + +### Final Mixing Step (avalanche) + +To make sure that all input bits have a chance to impact any bit in the output digest (avalanche effect), the final step of the XXH3 algorithm is usually one of the two fixed operations that mix the bits in a 64-bit value. These operations are denoted `avalanche()` and `avalanche_XXH64()` in the following XXH3 description. + +```c +avalanche(u64 x): + x = x xor (x >> 37); + x = x * PRIME_MX1; + x = x xor (x >> 32); + return x; + +avalanche_XXH64(u64 x): + x = x xor (x >> 33); + x = x * PRIME64_2; + x = x xor (x >> 29); + x = x * PRIME64_3; + x = x xor (x >> 32); + return x; +``` + +XXH3 Algorithm Description (for small inputs) +------------------------------------- + +The algorithm for small inputs (0-16 bytes of input) is further divided into 4 cases: empty, 1-3 bytes, 4-8 bytes, and 9-16 bytes of input. + +The algorithm uses byte-swap operations. The byte-swap operation reverses the byte order in a 32-bit or 64-bit value. It is denoted `bswap32` and `bswap64` for its 32-bit and 64-bit versions, respectively. 
+ +### Empty input + +The hash of empty input is calculated from the seed and a segment of the secret: + +```c +XXH3_64_empty(): + u64 secretWords[2] = secret[56:72]; + return avalanche_XXH64(seed xor secretWords[0] xor secretWords[1]); + +XXH3_128_empty(): + u64 secretWords[4] = secret[64:96]; + return {avalanche_XXH64(seed xor secretWords[0] xor secretWords[1]), // lower half + avalanche_XXH64(seed xor secretWords[2] xor secretWords[3])}; // higher half +``` + +### 1-3 bytes of input + +The algorithm starts from a single 32-bit value combining the input bytes and its length: + +```c +u32 combined = (u32)input[inputLength-1] | ((u32)inputLength << 8) | + ((u32)input[0] << 16) | ((u32)input[inputLength>>1] << 24); +// LSB 8 16 24 MSB +// | last byte | length | first byte | middle-or-last byte | +``` + +Then the final output is calculated from the value and the first 8 bytes (XXH3-64) or 16 bytes (XXH3-128) of the secret to produce the final result. The secret here is read as 32-bit values instead of the usual 64-bit values. + +```c +XXH3_64_1to3(): + u32 secretWords[2] = secret[0:8]; + u64 value = ((u64)(secretWords[0] xor secretWords[1]) + seed) xor (u64)combined; + return avalanche_XXH64(value); + +XXH3_128_1to3(): + u32 secretWords[4] = secret[0:16]; + u64 low = ((u64)(secretWords[0] xor secretWords[1]) + seed) xor (u64)combined; + u64 high = ((u64)(secretWords[2] xor secretWords[3]) - seed) xor (u64)(bswap32(combined) <<< 13); + // note that the bswap32(combined) <<< 13 above is 32-bit rotate + return {avalanche_XXH64(low), // lower half + avalanche_XXH64(high)}; // higher half +``` + +Note that the XXH3-64 result is the lower half of XXH3-128 result. 
+ +### 4-8 bytes of input + +The algorithm starts from reading the first and last 4 bytes of the input as little-endian 32-bit values, and a modified seed: + +```c +u32 inputFirst = input[0:4]; +u32 inputLast = input[inputLength-4:inputLength]; +u64 modifiedSeed = seed xor ((u64)bswap32((u32)lowerHalf(seed)) << 32); +``` + +Again, these values are combined with a segment of the secret to produce the final value. + +```c +XXH3_64_4to8(): + u64 secretWords[2] = secret[8:24]; + u64 combined = (u64)inputLast | ((u64)inputFirst << 32); + u64 value = ((secretWords[0] xor secretWords[1]) - modifiedSeed) xor combined; + value = value xor (value <<< 49) xor (value <<< 24); + value = value * PRIME_MX2; + value = value xor ((value >> 35) + inputLength); + value = value * PRIME_MX2; + value = value xor (value >> 28); + return value; + +XXH3_128_4to8(): + u64 secretWords[2] = secret[16:32]; + u64 combined = (u64)inputFirst | ((u64)inputLast << 32); + u64 value = ((secretWords[0] xor secretWords[1]) + modifiedSeed) xor combined; + u128 mulResult = (u128)value * (u128)(PRIME64_1 + (inputLength << 2)); + u64 high = higherHalf(mulResult); // mulResult >> 64 + u64 low = lowerHalf(mulResult); // mulResult & 0xFFFFFFFFFFFFFFFF + high = high + (low << 1); + low = low xor (high >> 3); + low = low xor (low >> 35); + low = low * PRIME_MX2; + low = low xor (low >> 28); + high = avalanche(high); + return {low, high}; +``` + +### 9-16 bytes of input + +The algorithm starts from reading the first and last 8 bytes of the input as little-endian 64-bit values: + +```c +u64 inputFirst = input[0:8]; +u64 inputLast = input[inputLength-8:inputLength]; +``` + +Once again, these values are combined with a segment of the secret to produce the final value. 
+ +```c +XXH3_64_9to16(): + u64 secretWords[4] = secret[24:56]; + u64 low = ((secretWords[0] xor secretWords[1]) + seed) xor inputFirst; + u64 high = ((secretWords[2] xor secretWords[3]) - seed) xor inputLast; + u128 mulResult = (u128)low * (u128)high; + u64 value = inputLength + bswap64(low) + high + (u64)(lowerHalf(mulResult) xor higherHalf(mulResult)); + return avalanche(value); + +XXH3_128_9to16(): + u64 secretWords[4] = secret[32:64]; + u64 val1 = ((secretWords[0] xor secretWords[1]) - seed) xor inputFirst xor inputLast; + u64 val2 = ((secretWords[2] xor secretWords[3]) + seed) xor inputLast; + u128 mulResult = (u128)val1 * (u128)PRIME64_1; + u64 low = lowerHalf(mulResult) + ((u64)(inputLength - 1) << 54); + u64 high = higherHalf(mulResult) + ((u64)higherHalf(val2) << 32) + (u64)lowerHalf(val2) * PRIME32_2; + // the above line can also be simplified to higherHalf(mulResult) + val2 + (u64)lowerHalf(val2) * (PRIME32_2 - 1); + low = low xor bswap64(high); + // the following three lines are in fact a 128x64 -> 128 multiplication ({low,high} = (u128){low,high} * PRIME64_2) + u128 mulResult2 = (u128)low * (u128)PRIME64_2; + low = lowerHalf(mulResult2); + high = higherHalf(mulResult2) + high * PRIME64_2; + return {avalanche(low), // lower half + avalanche(high)}; // higher half +``` + + +XXH3 Algorithm Description (for medium inputs) +------------------------------------- + +This algorithm is used for medium inputs (17-240 bytes of input). Its internal hash state is stored inside 1 (XXH3-64) or 2 (XXH3-128) "accumulators", each storing an unsigned 64-bit value. + +### Step 1. Initialize internal accumulators + +The accumulator(s) are initialized based on the input length. + +```c +// For XXH3-64 +u64 acc = inputLength * PRIME64_1; + +// For XXH3-128 +u64 acc[2] = {inputLength * PRIME64_1, 0}; +``` + +### Step 2. Process the input + +This step is further divided into two cases: one for 17-128 bytes of input, and one for 129-240 bytes of input. 
+ +#### Mixing operation + +This step uses a mixing operation that mixes a 16-byte segment of data, a 16-byte segment of secret and the seed into a 64-bit value as a building block. This operation treats the segments of data and secret as little-endian 64-bit values. + +```c +mixStep(u8 data[16], size secretOffset, u64 seed): + u64 dataWords[2] = data[0:16]; + u64 secretWords[2] = secret[secretOffset:secretOffset+16]; + u128 mulResult = (u128)(dataWords[0] xor (secretWords[0] + seed)) * + (u128)(dataWords[1] xor (secretWords[1] - seed)); + return lowerHalf(mulResult) xor higherHalf(mulResult); +``` + +The mixing operation is always invoked in groups of two in XXH3-128, where two 16-byte segments of data are mixed with a 32-byte segment of secret, and the accumulators are updated accordingly. + +```c +mixTwoChunks(u8 data1[16], u8 data2[16], size secretOffset, u64 seed): + u64 dataWords1[2] = data1[0:16]; // again, little-endian conversion + u64 dataWords2[2] = data2[0:16]; + acc[0] = acc[0] + mixStep(data1, secretOffset, seed); + acc[1] = acc[1] + mixStep(data2, secretOffset + 16, seed); + acc[0] = acc[0] xor (dataWords2[0] + dataWords2[1]); + acc[1] = acc[1] xor (dataWords1[0] + dataWords1[1]); +``` + +The input is split into several 16-byte chunks and mixed, and the result is added to the accumulator(s). + +#### 17-128 bytes of input + +The input is read as *N* 16-byte chunks starting from the beginning and *N* chunks starting from the end, where *N* is the smallest number such that these 2*N* chunks cover the whole input. These chunks are paired up and mixed, and the results are accumulated into the accumulator(s).
+ +```c +// the loop variable `i` should be signed to avoid underflow in implementation +processInput_XXH3_64_17to128(): + u64 numRounds = ((inputLength - 1) >> 5) + 1; + for (i = numRounds - 1; i >= 0; i--) { + size offsetStart = i*16; + size offsetEnd = inputLength - i*16 - 16; + acc += mixStep(input[offsetStart:offsetStart+16], i*32, seed); + acc += mixStep(input[offsetEnd:offsetEnd+16], i*32+16, seed); + } + +processInput_XXH3_128_17to128(): + u64 numRounds = ((inputLength - 1) >> 5) + 1; + for (i = numRounds - 1; i >= 0; i--) { + size offsetStart = i*16; + size offsetEnd = inputLength - i*16 - 16; + mixTwoChunks(input[offsetStart:offsetStart+16], input[offsetEnd:offsetEnd+16], i*32, seed); + } +``` + +#### 129-240 bytes of input + +The input is split into 16-byte (XXH3-64) or 32-byte (XXH3-128) chunks. The first 128 bytes are first mixed chunk by chunk, followed by an intermediate avalanche operation. Then the remaining full chunks are processed, and finally the last 16/32 bytes are treated as a chunk to process. + +```c +processInput_XXH3_64_129to240(): + u64 numChunks = inputLength >> 4; + for (i = 0; i < 8; i++) { + acc += mixStep(input[i*16:i*16+16], i*16, seed); + } + acc = avalanche(acc); + for (i = 8; i < numChunks; i++) { + acc += mixStep(input[i*16:i*16+16], (i-8)*16 + 3, seed); + } + acc += mixStep(input[inputLength-16:inputLength], 119, seed); + +processInput_XXH3_128_129to240(): + u64 numChunks = inputLength >> 5; + for (i = 0; i < 4; i++) { + mixTwoChunks(input[i*32:i*32+16], input[i*32+16:i*32+32], i*32, seed); + } + acc[0] = avalanche(acc[0]); + acc[1] = avalanche(acc[1]); + for (i = 4; i < numChunks; i++) { + mixTwoChunks(input[i*32:i*32+16], input[i*32+16:i*32+32], (i-4)*32 + 3, seed); + } + // note that the half-chunk order and the seed is different here + mixTwoChunks(input[inputLength-16:inputLength], input[inputLength-32:inputLength-16], 103, (u64)0 - seed); +``` + +### Step 3. 
Finalization + +The final result is extracted from the accumulator(s). + +```c +XXH3_64_17to240(): + return avalanche(acc); + +XXH3_128_17to240(): + u64 low = acc[0] + acc[1]; + u64 high = (acc[0] * PRIME64_1) + (acc[1] * PRIME64_4) + (((u64)inputLength - seed) * PRIME64_2); + return {avalanche(low), // lower half + (u64)0 - avalanche(high)}; // higher half +``` + +XXH3 Algorithm Description (for large inputs) +------------------------------------- + +This algorithm is used for inputs larger than 240 bytes. The internal hash state is stored inside 8 "accumulators", each one storing an unsigned 64-bit value. + +### Step 1. Initialize internal accumulators + +The accumulators are initialized to fixed constants: + +```c +u64 acc[8] = { + PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1}; +``` + +### Step 2. Process blocks + +The input is consumed and processed one full block at a time. The size of the block depends on the length of the secret. Specifically, a block consists of several 64-byte stripes. The number of stripes per block is `floor((secretLength-64)/8)` . For the default 192-byte secret, there are 16 stripes in a block, and thus the block size is 1024 bytes. + +```c +secretLength = lengthInBytes(secret); // default 192; at least 136 +stripesPerBlock = (secretLength-64) / 8; // default 16; at least 9 +blockSize = 64 * stripesPerBlock; // default 1024; at least 576 +``` + +The process of processing a full block is called a *round*. It consists of the following two sub-steps: + +#### Step 2-1. Process stripes in the block + +A stripe is evenly divided into 8 lanes, of 8 bytes each. In an accumulation step, one stripe and a 64-byte contiguous segment of the secret are used to update the accumulators. Each lane reads its associated 64-bit value using little-endian convention. 
+ +The accumulation step applies the following procedure: + +```c +accumulate(u64 stripe[8], size secretOffset): + u64 secretWords[8] = secret[secretOffset:secretOffset+64]; + for (i = 0; i < 8; i++) { + u64 value = stripe[i] xor secretWords[i]; + acc[i xor 1] = acc[i xor 1] + stripe[i]; + acc[i] = acc[i] + (u64)lowerHalf(value) * (u64)higherHalf(value); + // (value and 0xFFFFFFFF) * (value >> 32) + } +``` + +The accumulation step is repeated for all stripes in a block, using different segments of the secret, starting from the first 64 bytes for the first stripe, and offset by 8 bytes for each following round: + +```c +round_accumulate(u8 block[blockSize]): + for (n = 0; n < stripesPerBlock; n++) { + u64 stripe[8] = block[n*64:n*64+64]; // 64 bytes = 8 u64s + accumulate(stripe, n*8); + } +``` + +#### Step 2-2. Scramble accumulators + +After the accumulation steps are finished for all stripes in the block, the accumulators are scrambled using the last 64 bytes of the secret. + +```c +round_scramble(): + u64 secretWords[8] = secret[secretLength-64:secretLength]; + for (i = 0; i < 8; i++) { + acc[i] = acc[i] xor (acc[i] >> 47); + acc[i] = acc[i] xor secretWords[i]; + acc[i] = acc[i] * PRIME32_1; + } +``` + +A round is thus a `round_accumulate` followed by a `round_scramble`: + +```c +round(u8 block[blockSize]): + round_accumulate(block); + round_scramble(); +``` + +Step 2 is looped to consume the input until there are less than or equal to `blockSize` bytes of input left. Note that we leave the last block to the next step even if it is a full block. + +### Step 3. Process the last block and the last 64 bytes + +Accumulation steps are run for the stripes in the last block, except for the last stripe (whether it is full or not). After that, run a final accumulation step by treating the last 64 bytes as a stripe. Note that the last 64 bytes might overlap with the second-to-last block. 
+ +```c +// len is the size of the last block (1 <= len <= blockSize) +lastRound(u8 block[], size len, u64 lastStripe[8]): + size nFullStripes = (len-1)/64; + for (n = 0; n < nFullStripes; n++) { + u64 stripe[8] = block[n*64:n*64+64]; + accumulate(stripe, n * 8); + } + accumulate(lastStripe, secretLength - 71); +``` + +### Step 4. Finalization + +In the finalization step, a merging procedure is used to extract a single 64-bit value from the accumulators, using an initial seed value and a 64-byte segment of the secret. + +```c +finalMerge(u64 initValue, size secretOffset): + u64 secretWords[8] = secret[secretOffset:secretOffset+64]; + u64 result = initValue; + for (i = 0; i < 4; i++) { + // 64-bit by 64-bit multiplication to 128-bit full result + u128 mulResult = (u128)(acc[i*2] xor secretWords[i*2]) * + (u128)(acc[i*2+1] xor secretWords[i*2+1]); + result = result + (lowerHalf(mulResult) xor higherHalf(mulResult)); + // (mulResult and 0xFFFFFFFFFFFFFFFF) xor (mulResult >> 64) + } + return avalanche(result); +``` + +XXH3-128 runs the merging procedure twice for the two halves of the result, using different secret segments and different initial values derived from the total input length. +The XXH3-64 result is just the lower half of the XXH3-128 result. + +```c +XXH3_64_large(): + return finalMerge((u64)inputLength * PRIME64_1, 11); + +XXH3_128_large(): + return {finalMerge((u64)inputLength * PRIME64_1, 11), // lower half + finalMerge(~((u64)inputLength * PRIME64_2), secretLength - 75)}; // higher half +``` + + +Performance considerations +---------------------------------- + +The xxHash algorithms are simple and compact to implement. They provide a system independent "fingerprint" or digest of a message of arbitrary length. + +The algorithm allows input to be streamed and processed in multiple steps. In such case, an internal buffer is needed to ensure data is presented to the algorithm in full stripes. 
+ +On 64-bit systems, the 64-bit variant `XXH64` is generally faster to compute, so it is a recommended variant, even when only 32-bit are needed. + +On 32-bit systems though, positions are reversed: `XXH64` performance is reduced, due to its usage of 64-bit arithmetic. `XXH32` becomes a faster variant. + +Finally, when vector operations are possible, `XXH3` is likely the faster variant. + + +Reference Implementation +---------------------------------------- + +A reference library written in C is available at https://www.xxhash.com. +The web page also links to multiple other implementations written in many different languages. +It links to the [github project page](https://github.com/Cyan4973/xxHash) where an [issue board](https://github.com/Cyan4973/xxHash/issues) can be used for further public discussions on the topic. + + +Version changes +-------------------- +v0.2.0: added XXH3 specification, by Adrien Wu +v0.1.1: added a note on rationale for selection of constants +v0.1.0: initial release diff --git a/src/linker/third_party_ext/xxHash/xxh3.h b/src/linker/third_party_ext/xxHash/xxh3.h new file mode 100644 index 00000000..7e3ce68e --- /dev/null +++ b/src/linker/third_party_ext/xxHash/xxh3.h @@ -0,0 +1,55 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file used to host the source code of XXH3_* variants + * during the development period. + * The source code is now properly integrated within xxhash.h. + * + * xxh3.h is no longer useful, + * but it is still provided for compatibility with source code + * which used to include it directly. + * + * Programs are now strongly discouraged from including xxh3.h. + * Include `xxhash.h` instead, which is the officially supported interface. + * + * In the future, xxh3.h will start to generate warnings, then errors, + * then it will be removed from source package and from include directory.
+ */ + +/* Simulate the same impact as including the old xxh3.h source file */ + +#define XXH_INLINE_ALL +#include "xxhash.h" diff --git a/src/linker/third_party_ext/xxHash/xxh_x86dispatch.c b/src/linker/third_party_ext/xxHash/xxh_x86dispatch.c new file mode 100644 index 00000000..2489e153 --- /dev/null +++ b/src/linker/third_party_ext/xxHash/xxh_x86dispatch.c @@ -0,0 +1,845 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2020-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/*! + * @file xxh_x86dispatch.c + * + * Automatic dispatcher code for the @ref XXH3_family on x86-based targets. + * + * Optional add-on. + * + * **Compile this file with the default flags for your target.** Do not compile + * with flags like `-mavx*`, `-march=native`, or `/arch:AVX*`, there will be + * an error. See @ref XXH_X86DISPATCH_ALLOW_AVX for details. + * + * @defgroup dispatch x86 Dispatcher + * @{ + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#if !(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)) +# error "Dispatching is currently only supported on x86 and x86_64." +#endif + +/*! @cond Doxygen ignores this part */ +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +# define XXH_HAS_INCLUDE(x) __has_include(x) +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif +/*! @endcond */ + +/*! + * @def XXH_X86DISPATCH_ALLOW_AVX + * @brief Disables the AVX sanity check. + * + * xxh_x86dispatch.c is intended to be compiled for the minimum target, and + * it selectively enables SSE2, AVX2, and AVX512 when it is needed. + * + * Compiling with options like `-mavx*`, `-march=native`, or `/arch:AVX*` + * _globally_ will always enable this feature, and therefore makes it + * undefined behavior to execute on any CPU without said feature. + * + * Even if the source code isn't directly using AVX intrinsics in a function, + * the compiler can still generate AVX code from autovectorization and by + * "upgrading" SSE2 intrinsics to use the VEX prefixes (a.k.a. AVX128). + * + * Define XXH_X86DISPATCH_ALLOW_AVX to ignore this check, + * thus accepting that the produced binary will not work correctly + * on any CPU with less features than the ones stated at compilation time. 
+ */ +#ifdef XXH_DOXYGEN +# define XXH_X86DISPATCH_ALLOW_AVX +#endif + +#if defined(__AVX__) && !defined(XXH_X86DISPATCH_ALLOW_AVX) +# error "Error: if xxh_x86dispatch.c is compiled with AVX enabled, the resulting binary will crash on sse2-only cpus !! " \ + "If you nonetheless want to do that, please enable the XXH_X86DISPATCH_ALLOW_AVX build variable" +#endif + +/*! + * @def XXH_DISPATCH_SCALAR + * @brief Enables/disables dispatching the scalar code path. + * + * If this is defined to 0, SSE2 support is assumed. This reduces code size + * when the scalar path is not needed. + * + * This is automatically defined to 0 when... + * - SSE2 support is enabled in the compiler + * - Targeting x86_64 + * - Targeting Android x86 + * - Targeting macOS + */ +#ifndef XXH_DISPATCH_SCALAR +# if defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) /* SSE2 on by default */ \ + || defined(__x86_64__) || defined(_M_X64) /* x86_64 */ \ + || defined(__ANDROID__) || defined(__APPLE__) /* Android or macOS */ +# define XXH_DISPATCH_SCALAR 0 /* disable */ +# else +# define XXH_DISPATCH_SCALAR 1 +# endif +#endif +/*! + * @def XXH_DISPATCH_AVX2 + * @brief Enables/disables dispatching for AVX2. + * + * This is automatically detected if it is not defined. + * - GCC 4.7 and later are known to support AVX2, but >4.9 is required + * to get the AVX2 intrinsics and typedefs without -mavx -mavx2. + * - Visual Studio 2013 Update 2 and later are known to support AVX2. + * - The GCC/Clang internal header `<avx2intrin.h>` is detected. While this is + * not allowed to be included directly, it still appears in the builtin + * include path and is detectable with `__has_include`.
+ * + * @see XXH_AVX2 + */ +#ifndef XXH_DISPATCH_AVX2 +# if (defined(__GNUC__) && (__GNUC__ > 4)) /* GCC 5.0+ */ \ + || (defined(_MSC_VER) && _MSC_VER >= 1900) /* VS 2015+ */ \ + || (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 180030501) /* VS 2013 Update 2 */ \ + || XXH_HAS_INCLUDE(<avx2intrin.h>) /* GCC/Clang internal header */ +# define XXH_DISPATCH_AVX2 1 /* enable dispatch towards AVX2 */ +# else +# define XXH_DISPATCH_AVX2 0 +# endif +#endif /* XXH_DISPATCH_AVX2 */ + +/*! + * @def XXH_DISPATCH_AVX512 + * @brief Enables/disables dispatching for AVX512. + * + * Automatically detected if one of the following conditions is met: + * - GCC 4.9 and later are known to support AVX512. + * - Visual Studio 2017 and later are known to support AVX512. + * - The GCC/Clang internal header `<avx512fintrin.h>` is detected. While this + * is not allowed to be included directly, it still appears in the builtin + * include path and is detectable with `__has_include`. + * + * @see XXH_AVX512 + */ +#ifndef XXH_DISPATCH_AVX512 +# if (defined(__GNUC__) \ + && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))) /* GCC 4.9+ */ \ + || (defined(_MSC_VER) && _MSC_VER >= 1910) /* VS 2017+ */ \ + || XXH_HAS_INCLUDE(<avx512fintrin.h>) /* GCC/Clang internal header */ +# define XXH_DISPATCH_AVX512 1 /* enable dispatch towards AVX512 */ +# else +# define XXH_DISPATCH_AVX512 0 +# endif +#endif /* XXH_DISPATCH_AVX512 */ + +/*! + * @def XXH_TARGET_SSE2 + * @brief Allows a function to be compiled with SSE2 intrinsics. + * + * Uses `__attribute__((__target__("sse2")))` on GCC to allow SSE2 to be used + * even with `-mno-sse2`. + * + * @def XXH_TARGET_AVX2 + * @brief Like @ref XXH_TARGET_SSE2, but for AVX2. + * + * @def XXH_TARGET_AVX512 + * @brief Like @ref XXH_TARGET_SSE2, but for AVX512.
+ * + */ +#if defined(__GNUC__) +# include /* SSE2 */ +# if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +# include /* AVX2, AVX512F */ +# endif +# define XXH_TARGET_SSE2 __attribute__((__target__("sse2"))) +# define XXH_TARGET_AVX2 __attribute__((__target__("avx2"))) +# define XXH_TARGET_AVX512 __attribute__((__target__("avx512f"))) +#elif defined(__clang__) && defined(_MSC_VER) /* clang-cl.exe */ +# include /* SSE2 */ +# if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +# include /* AVX2, AVX512F */ +# include +# include +# include +# include +# endif +# define XXH_TARGET_SSE2 __attribute__((__target__("sse2"))) +# define XXH_TARGET_AVX2 __attribute__((__target__("avx2"))) +# define XXH_TARGET_AVX512 __attribute__((__target__("avx512f"))) +#elif defined(_MSC_VER) +# include +# define XXH_TARGET_SSE2 +# define XXH_TARGET_AVX2 +# define XXH_TARGET_AVX512 +#else +# error "Dispatching is currently not supported for your compiler." +#endif + +/*! @cond Doxygen ignores this part */ +#ifdef XXH_DISPATCH_DEBUG +/* debug logging */ +# include +# define XXH_debugPrint(str) { fprintf(stderr, "DEBUG: xxHash dispatch: %s \n", str); fflush(NULL); } +#else +# define XXH_debugPrint(str) ((void)0) +# undef NDEBUG /* avoid redefinition */ +# define NDEBUG +#endif +/*! @endcond */ +#include + +#ifndef XXH_DOXYGEN +#define XXH_INLINE_ALL +#define XXH_X86DISPATCH +#include "xxhash.h" +#endif + +/*! @cond Doxygen ignores this part */ +#ifndef XXH_HAS_ATTRIBUTE +# ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(...) __has_attribute(__VA_ARGS__) +# else +# define XXH_HAS_ATTRIBUTE(...) 0 +# endif +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +#if XXH_HAS_ATTRIBUTE(constructor) +# define XXH_CONSTRUCTOR __attribute__((constructor)) +# define XXH_DISPATCH_MAYBE_NULL 0 +#else +# define XXH_CONSTRUCTOR +# define XXH_DISPATCH_MAYBE_NULL 1 +#endif +/*! @endcond */ + + +/*! 
@cond Doxygen ignores this part */ +/* + * Support both AT&T and Intel dialects + * + * GCC doesn't convert AT&T syntax to Intel syntax, and will error out if + * compiled with -masm=intel. Instead, it supports dialect switching with + * curly braces: { AT&T syntax | Intel syntax } + * + * Clang's integrated assembler automatically converts AT&T syntax to Intel if + * needed, making the dialect switching useless (it isn't even supported). + * + * Note: Comments are written in the inline assembly itself. + */ +#ifdef __clang__ +# define XXH_I_ATT(intel, att) att "\n\t" +#else +# define XXH_I_ATT(intel, att) "{" att "|" intel "}\n\t" +#endif +/*! @endcond */ + +/*! + * @private + * @brief Runs CPUID. + * + * @param eax , ecx The parameters to pass to CPUID, %eax and %ecx respectively. + * @param abcd The array to store the result in, `{ eax, ebx, ecx, edx }` + */ +static void XXH_cpuid(xxh_u32 eax, xxh_u32 ecx, xxh_u32* abcd) +{ +#if defined(_MSC_VER) + __cpuidex((int*)abcd, eax, ecx); +#else + xxh_u32 ebx, edx; +# if defined(__i386__) && defined(__PIC__) + __asm__( + "# Call CPUID\n\t" + "#\n\t" + "# On 32-bit x86 with PIC enabled, we are not allowed to overwrite\n\t" + "# EBX, so we use EDI instead.\n\t" + XXH_I_ATT("mov edi, ebx", "movl %%ebx, %%edi") + XXH_I_ATT("cpuid", "cpuid" ) + XXH_I_ATT("xchg edi, ebx", "xchgl %%ebx, %%edi") + : "=D" (ebx), +# else + __asm__( + "# Call CPUID\n\t" + XXH_I_ATT("cpuid", "cpuid") + : "=b" (ebx), +# endif + "+a" (eax), "+c" (ecx), "=d" (edx)); + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +/* + * Modified version of Intel's guide + * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + */ + +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +/*! + * @private + * @brief Runs `XGETBV`. + * + * While the CPU may support AVX2, the operating system might not properly save + * the full YMM/ZMM registers. 
+ * + * xgetbv is used for detecting this: Any compliant operating system will define + * a set of flags in the xcr0 register indicating how it saves the AVX registers. + * + * You can manually disable this flag on Windows by running, as admin: + * + * bcdedit.exe /set xsavedisable 1 + * + * and rebooting. Run the same command with 0 to re-enable it. + */ +static xxh_u64 XXH_xgetbv(void) +{ +#if defined(_MSC_VER) + return _xgetbv(0); /* min VS2010 SP1 compiler is required */ +#else + xxh_u32 xcr0_lo, xcr0_hi; + __asm__( + "# Call XGETBV\n\t" + "#\n\t" + "# Older assemblers (e.g. macOS's ancient GAS version) don't support\n\t" + "# the XGETBV opcode, so we encode it by hand instead.\n\t" + "# See for details.\n\t" + ".byte 0x0f, 0x01, 0xd0\n\t" + : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); + return xcr0_lo | ((xxh_u64)xcr0_hi << 32); /* combine the EDX:EAX halves into one 64-bit value */ +#endif +} +#endif + +/*! @cond Doxygen ignores this part */ +#define XXH_SSE2_CPUID_MASK (1 << 26) /* bit 26; tested against EDX (abcd[3]) of CPUID leaf 1 */ +#define XXH_OSXSAVE_CPUID_MASK ((1 << 26) | (1 << 27)) /* bits 26-27; tested against ECX (abcd[2]) of CPUID leaf 1 */ +#define XXH_AVX2_CPUID_MASK (1 << 5) /* bit 5; tested against EBX (abcd[1]) of CPUID leaf 7 */ +#define XXH_AVX2_XGETBV_MASK ((1 << 2) | (1 << 1)) /* bits 1-2 of the XXH_xgetbv() result */ +#define XXH_AVX512F_CPUID_MASK (1 << 16) /* bit 16; tested against EBX (abcd[1]) of CPUID leaf 7 */ +#define XXH_AVX512F_XGETBV_MASK ((7 << 5) | (1 << 2) | (1 << 1)) /* bits 1-2 and 5-7 of the XXH_xgetbv() result */ +/*! @endcond */ + +/*! + * @private + * @brief Returns the best XXH3 implementation. + * + * Runs various CPUID/XGETBV tests to try and determine the best implementation. + * + * @return The best @ref XXH_VECTOR implementation.
+ * @see XXH_VECTOR_TYPES + */ +static int XXH_featureTest(void) +{ + xxh_u32 abcd[4]; + xxh_u32 max_leaves; + int best = XXH_SCALAR; +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 + xxh_u64 xgetbv_val; +#endif +#if defined(__GNUC__) && defined(__i386__) + xxh_u32 cpuid_supported; + __asm__( + "# For the sake of ruthless backwards compatibility, check if CPUID\n\t" + "# is supported in the EFLAGS on i386.\n\t" + "# This is not necessary on x86_64 - CPUID is mandatory.\n\t" + "# The ID flag (bit 21) in the EFLAGS register indicates support\n\t" + "# for the CPUID instruction. If a software procedure can set and\n\t" + "# clear this flag, the processor executing the procedure supports\n\t" + "# the CPUID instruction.\n\t" + "# \n\t" + "#\n\t" + "# Routine is from .\n\t" + + "# Save EFLAGS\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# Store EFLAGS\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# Invert the ID bit in stored EFLAGS\n\t" + XXH_I_ATT("xor dword ptr[esp], 0x200000", "xorl $0x200000, (%%esp)") + "# Load stored EFLAGS (with ID bit inverted)\n\t" + XXH_I_ATT("popfd", "popfl" ) + "# Store EFLAGS again (ID bit may or not be inverted)\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# eax = modified EFLAGS (ID bit may or may not be inverted)\n\t" + XXH_I_ATT("pop eax", "popl %%eax" ) + "# eax = whichever bits were changed\n\t" + XXH_I_ATT("xor eax, dword ptr[esp]", "xorl (%%esp), %%eax" ) + "# Restore original EFLAGS\n\t" + XXH_I_ATT("popfd", "popfl" ) + "# eax = zero if ID bit can't be changed, else non-zero\n\t" + XXH_I_ATT("and eax, 0x200000", "andl $0x200000, %%eax" ) + : "=a" (cpuid_supported) :: "cc"); + + if (XXH_unlikely(!cpuid_supported)) { + XXH_debugPrint("CPUID support is not detected!"); + return best; + } + +#endif + /* Check how many CPUID pages we have */ + XXH_cpuid(0, 0, abcd); + max_leaves = abcd[0]; + + /* Shouldn't happen on hardware, but happens on some QEMU configs. 
*/ + if (XXH_unlikely(max_leaves == 0)) { + XXH_debugPrint("Max CPUID leaves == 0!"); + return best; + } + + /* Check for SSE2, OSXSAVE and xgetbv */ + XXH_cpuid(1, 0, abcd); + + /* + * Test for SSE2. The check is redundant on x86_64, but it doesn't hurt. + */ + if (XXH_unlikely((abcd[3] & XXH_SSE2_CPUID_MASK) != XXH_SSE2_CPUID_MASK)) + return best; + + XXH_debugPrint("SSE2 support detected."); + + best = XXH_SSE2; +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 + /* Make sure we have enough leaves */ + if (XXH_unlikely(max_leaves < 7)) + return best; + + /* Test for OSXSAVE and XGETBV */ + if ((abcd[2] & XXH_OSXSAVE_CPUID_MASK) != XXH_OSXSAVE_CPUID_MASK) + return best; + + /* CPUID check for AVX features */ + XXH_cpuid(7, 0, abcd); + + xgetbv_val = XXH_xgetbv(); +#if XXH_DISPATCH_AVX2 + /* Validate that AVX2 is supported by the CPU */ + if ((abcd[1] & XXH_AVX2_CPUID_MASK) != XXH_AVX2_CPUID_MASK) + return best; + + /* Validate that the OS supports YMM registers */ + if ((xgetbv_val & XXH_AVX2_XGETBV_MASK) != XXH_AVX2_XGETBV_MASK) { + XXH_debugPrint("AVX2 supported by the CPU, but not the OS."); + return best; + } + + /* AVX2 supported */ + XXH_debugPrint("AVX2 support detected."); + best = XXH_AVX2; +#endif +#if XXH_DISPATCH_AVX512 + /* Check if AVX512F is supported by the CPU */ + if ((abcd[1] & XXH_AVX512F_CPUID_MASK) != XXH_AVX512F_CPUID_MASK) { + XXH_debugPrint("AVX512F not supported by CPU"); + return best; + } + + /* Validate that the OS supports ZMM registers */ + if ((xgetbv_val & XXH_AVX512F_XGETBV_MASK) != XXH_AVX512F_XGETBV_MASK) { + XXH_debugPrint("AVX512F supported by the CPU, but not the OS."); + return best; + } + + /* AVX512F supported */ + XXH_debugPrint("AVX512F support detected."); + best = XXH_AVX512; +#endif +#endif + return best; +} + + +/* === Vector implementations === */ + +/*! @cond PRIVATE */ +/*! + * @private + * @brief Defines the various dispatch functions. + * + * TODO: Consolidate? 
+ * + * @param suffix The suffix for the functions, e.g. sse2 or scalar + * @param target XXH_TARGET_* or empty. + */ + +#define XXH_DEFINE_DISPATCH_FUNCS(suffix, target) \ + \ +/* === XXH3, default variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_default_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len) \ +{ \ + return XXH3_hashLong_64b_internal( \ + input, len, XXH3_kSecret, sizeof(XXH3_kSecret), \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH3, Seeded variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_seed_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, size_t len, \ + XXH64_hash_t seed) \ +{ \ + return XXH3_hashLong_64b_withSeed_internal( \ + input, len, seed, XXH3_accumulate_##suffix, \ + XXH3_scrambleAcc_##suffix, XXH3_initCustomSecret_##suffix \ + ); \ +} \ + \ +/* === XXH3, Secret variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_secret_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len, XXH_NOESCAPE const void* secret, \ + size_t secretLen) \ +{ \ + return XXH3_hashLong_64b_internal( \ + input, len, secret, secretLen, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH3 update variants === */ \ + \ +XXH_NO_INLINE target XXH_errorcode \ +XXH3_update_##suffix(XXH_NOESCAPE XXH3_state_t* state, \ + XXH_NOESCAPE const void* input, size_t len) \ +{ \ + return XXH3_update(state, (const xxh_u8*)input, len, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix); \ +} \ + \ +/* === XXH128 default variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_default_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len) \ +{ \ + return XXH3_hashLong_128b_internal( \ + input, len, XXH3_kSecret, sizeof(XXH3_kSecret), \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH128 Secret variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ 
+XXHL128_secret_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len, \ + XXH_NOESCAPE const void* XXH_RESTRICT secret, \ + size_t secretLen) \ +{ \ + return XXH3_hashLong_128b_internal( \ + input, len, (const xxh_u8*)secret, secretLen, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix); \ +} \ + \ +/* === XXH128 Seeded variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_seed_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, size_t len,\ + XXH64_hash_t seed) \ +{ \ + return XXH3_hashLong_128b_withSeed_internal(input, len, seed, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix, \ + XXH3_initCustomSecret_##suffix); \ +} + +/*! @endcond */ +/* End XXH_DEFINE_DISPATCH_FUNCS */ + +/*! @cond Doxygen ignores this part */ +#if XXH_DISPATCH_SCALAR +XXH_DEFINE_DISPATCH_FUNCS(scalar, /* nothing */) +#endif +XXH_DEFINE_DISPATCH_FUNCS(sse2, XXH_TARGET_SSE2) +#if XXH_DISPATCH_AVX2 +XXH_DEFINE_DISPATCH_FUNCS(avx2, XXH_TARGET_AVX2) +#endif +#if XXH_DISPATCH_AVX512 +XXH_DEFINE_DISPATCH_FUNCS(avx512, XXH_TARGET_AVX512) +#endif +#undef XXH_DEFINE_DISPATCH_FUNCS +/*! @endcond */ + +/* ==== Dispatchers ==== */ + +/*! 
@cond Doxygen ignores this part */ +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_default)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t); + +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_withSeed)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, XXH64_hash_t); + +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_withSecret)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, XXH_NOESCAPE const void* XXH_RESTRICT, size_t); + +typedef XXH_errorcode (*XXH3_dispatchx86_update)(XXH_NOESCAPE XXH3_state_t*, XXH_NOESCAPE const void*, size_t); + +typedef struct { + XXH3_dispatchx86_hashLong64_default hashLong64_default; + XXH3_dispatchx86_hashLong64_withSeed hashLong64_seed; + XXH3_dispatchx86_hashLong64_withSecret hashLong64_secret; + XXH3_dispatchx86_update update; +} XXH_dispatchFunctions_s; + +#define XXH_NB_DISPATCHES 4 +/*! @endcond */ + +/*! + * @private + * @brief Table of dispatchers for @ref XXH3_64bits(). + * + * @pre The indices must match @ref XXH_VECTOR_TYPE. + */ +static const XXH_dispatchFunctions_s XXH_kDispatch[XXH_NB_DISPATCHES] = { +#if XXH_DISPATCH_SCALAR + /* Scalar */ { XXHL64_default_scalar, XXHL64_seed_scalar, XXHL64_secret_scalar, XXH3_update_scalar }, +#else + /* Scalar */ { NULL, NULL, NULL, NULL }, +#endif + /* SSE2 */ { XXHL64_default_sse2, XXHL64_seed_sse2, XXHL64_secret_sse2, XXH3_update_sse2 }, +#if XXH_DISPATCH_AVX2 + /* AVX2 */ { XXHL64_default_avx2, XXHL64_seed_avx2, XXHL64_secret_avx2, XXH3_update_avx2 }, +#else + /* AVX2 */ { NULL, NULL, NULL, NULL }, +#endif +#if XXH_DISPATCH_AVX512 + /* AVX512 */ { XXHL64_default_avx512, XXHL64_seed_avx512, XXHL64_secret_avx512, XXH3_update_avx512 } +#else + /* AVX512 */ { NULL, NULL, NULL, NULL } +#endif +}; +/*! + * @private + * @brief The selected dispatch table for @ref XXH3_64bits(). + */ +static XXH_dispatchFunctions_s XXH_g_dispatch = { NULL, NULL, NULL, NULL }; + + +/*! 
@cond Doxygen ignores this part */ +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_default)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t); + +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_withSeed)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, XXH64_hash_t); + +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_withSecret)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, const void* XXH_RESTRICT, size_t); + +typedef struct { + XXH3_dispatchx86_hashLong128_default hashLong128_default; + XXH3_dispatchx86_hashLong128_withSeed hashLong128_seed; + XXH3_dispatchx86_hashLong128_withSecret hashLong128_secret; + XXH3_dispatchx86_update update; +} XXH_dispatch128Functions_s; +/*! @endcond */ + + +/*! + * @private + * @brief Table of dispatchers for @ref XXH3_128bits(). + * + * @pre The indices must match @ref XXH_VECTOR_TYPE. + */ +static const XXH_dispatch128Functions_s XXH_kDispatch128[XXH_NB_DISPATCHES] = { +#if XXH_DISPATCH_SCALAR + /* Scalar */ { XXHL128_default_scalar, XXHL128_seed_scalar, XXHL128_secret_scalar, XXH3_update_scalar }, +#else + /* Scalar */ { NULL, NULL, NULL, NULL }, +#endif + /* SSE2 */ { XXHL128_default_sse2, XXHL128_seed_sse2, XXHL128_secret_sse2, XXH3_update_sse2 }, +#if XXH_DISPATCH_AVX2 + /* AVX2 */ { XXHL128_default_avx2, XXHL128_seed_avx2, XXHL128_secret_avx2, XXH3_update_avx2 }, +#else + /* AVX2 */ { NULL, NULL, NULL, NULL }, +#endif +#if XXH_DISPATCH_AVX512 + /* AVX512 */ { XXHL128_default_avx512, XXHL128_seed_avx512, XXHL128_secret_avx512, XXH3_update_avx512 } +#else + /* AVX512 */ { NULL, NULL, NULL, NULL } +#endif +}; + +/*! + * @private + * @brief The selected dispatch table for @ref XXH3_128bits(). + */ +static XXH_dispatch128Functions_s XXH_g_dispatch128 = { NULL, NULL, NULL, NULL }; + +/*! + * @private + * @brief Runs a CPUID check and sets the correct dispatch tables.
+ */ +static XXH_CONSTRUCTOR void XXH_setDispatch(void) +{ + int vecID = XXH_featureTest(); + XXH_STATIC_ASSERT(XXH_AVX512 == XXH_NB_DISPATCHES-1); + assert(XXH_SCALAR <= vecID && vecID <= XXH_AVX512); +#if !XXH_DISPATCH_SCALAR + assert(vecID != XXH_SCALAR); +#endif +#if !XXH_DISPATCH_AVX512 + assert(vecID != XXH_AVX512); +#endif +#if !XXH_DISPATCH_AVX2 + assert(vecID != XXH_AVX2); +#endif + XXH_g_dispatch = XXH_kDispatch[vecID]; + XXH_g_dispatch128 = XXH_kDispatch128[vecID]; +} + + +/* ==== XXH3 public functions ==== */ +/*! @cond Doxygen ignores this part */ + +static XXH64_hash_t +XXH3_hashLong_64b_defaultSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.hashLong64_default == NULL) + XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_default(input, len); +} + +XXH64_hash_t XXH3_64bits_dispatch(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_defaultSecret_selection); +} + +static XXH64_hash_t +XXH3_hashLong_64b_withSeed_selection(const void* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.hashLong64_seed == NULL) + XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_seed(input, len, seed64); +} + +XXH64_hash_t XXH3_64bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed_selection); +} + +static XXH64_hash_t +XXH3_hashLong_64b_withSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* secret, size_t secretLen) +{ + (void)seed64; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.hashLong64_secret == NULL) + 
XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_secret(input, len, secret, secretLen); +} + +XXH64_hash_t XXH3_64bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretLen) +{ + return XXH3_64bits_internal(input, len, 0, secret, secretLen, XXH3_hashLong_64b_withSecret_selection); +} + +XXH_errorcode +XXH3_64bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.update == NULL) + XXH_setDispatch(); + + return XXH_g_dispatch.update(state, (const xxh_u8*)input, len); +} + +/*! @endcond */ + + +/* ==== XXH128 public functions ==== */ +/*! @cond Doxygen ignores this part */ + +static XXH128_hash_t +XXH3_hashLong_128b_defaultSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.hashLong128_default == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_default(input, len); +} + +XXH128_hash_t XXH3_128bits_dispatch(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_defaultSecret_selection); +} + +static XXH128_hash_t +XXH3_hashLong_128b_withSeed_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.hashLong128_seed == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_seed(input, len, seed64); +} + +XXH128_hash_t XXH3_128bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_withSeed_selection); +} + +static XXH128_hash_t +XXH3_hashLong_128b_withSecret_selection(const 
void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)seed64; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.hashLong128_secret == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_secret(input, len, secret, secretLen); +} + +XXH128_hash_t XXH3_128bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretLen) +{ + return XXH3_128bits_internal(input, len, 0, secret, secretLen, XXH3_hashLong_128b_withSecret_selection); +} + +XXH_errorcode +XXH3_128bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.update == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.update(state, (const xxh_u8*)input, len); +} + +/*! @endcond */ + +#if defined (__cplusplus) +} +#endif +/*! @} */ diff --git a/src/linker/third_party_ext/xxHash/xxh_x86dispatch.h b/src/linker/third_party_ext/xxHash/xxh_x86dispatch.h new file mode 100644 index 00000000..b87cea95 --- /dev/null +++ b/src/linker/third_party_ext/xxHash/xxh_x86dispatch.h @@ -0,0 +1,85 @@ +/* + * xxHash - XXH3 Dispatcher for x86-based targets + * Copyright (C) 2020-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +#ifndef XXH_X86DISPATCH_H_13563687684 +#define XXH_X86DISPATCH_H_13563687684 + +#include "xxhash.h" /* XXH64_hash_t, XXH3_state_t */ + +#if defined (__cplusplus) +extern "C" { +#endif + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_dispatch(XXH_NOESCAPE const void* input, size_t len); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretLen); +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len); + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_dispatch(XXH_NOESCAPE const void* input, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, 
XXH_NOESCAPE const void* secret, size_t secretLen); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len); + +#if defined (__cplusplus) +} +#endif + + +/* automatic replacement of XXH3 functions. + * can be disabled by setting XXH_DISPATCH_DISABLE_REPLACE */ +#ifndef XXH_DISPATCH_DISABLE_REPLACE + +# undef XXH3_64bits +# define XXH3_64bits XXH3_64bits_dispatch +# undef XXH3_64bits_withSeed +# define XXH3_64bits_withSeed XXH3_64bits_withSeed_dispatch +# undef XXH3_64bits_withSecret +# define XXH3_64bits_withSecret XXH3_64bits_withSecret_dispatch +# undef XXH3_64bits_update +# define XXH3_64bits_update XXH3_64bits_update_dispatch + +# undef XXH128 +# define XXH128 XXH3_128bits_withSeed_dispatch +# undef XXH3_128bits +# define XXH3_128bits XXH3_128bits_dispatch +# undef XXH3_128bits_withSeed +# define XXH3_128bits_withSeed XXH3_128bits_withSeed_dispatch +# undef XXH3_128bits_withSecret +# define XXH3_128bits_withSecret XXH3_128bits_withSecret_dispatch +# undef XXH3_128bits_update +# define XXH3_128bits_update XXH3_128bits_update_dispatch + +#endif /* XXH_DISPATCH_DISABLE_REPLACE */ + +#endif /* XXH_X86DISPATCH_H_13563687684 */ diff --git a/src/linker/third_party_ext/xxHash/xxhash.c b/src/linker/third_party_ext/xxHash/xxhash.c new file mode 100644 index 00000000..083b039d --- /dev/null +++ b/src/linker/third_party_ext/xxHash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/src/linker/third_party_ext/xxHash/xxhash.h b/src/linker/third_party_ext/xxHash/xxhash.h new file mode 100644 index 00000000..5e2c0ed2 --- /dev/null +++ b/src/linker/third_party_ext/xxHash/xxhash.h @@ -0,0 +1,6773 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2023 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. 
Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include <string.h> + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include <stdio.h> + * #include <assert.h> + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session.
+ * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ + +/*! + * @brief Gives access to internal definitions. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ + +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. 
+ * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline 
xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef 
XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. + */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. 
*/ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, 
XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define 
XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 2 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include <stddef.h> /* size_t */ +/*! + * @brief Exit code for the streaming API.
+ */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint32_t XXH32_hash_t; + +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * See @ref single_shot_example "Single Shot Example" for an example. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. 
However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit hash value. + * + * @see + * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * @see streaming_example at the top of @ref xxhash.h for an example. + */ + +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * Must be freed with XXH32_freeState(). + * @return An allocated XXH32_state_t on success, `NULL` on failure. 
+ */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * Must be allocated with XXH32_createState(). + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH32_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); + +/*! 
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash32 value from that state. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * This is the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + */ + +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +/*! @cond Doxygen ignores this part */ +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * C23 __STDC_VERSION__ number hasn't been specified yet. For now + * leave as `201711L` (C17 + 1). + * TODO: Update to correct value when its been specified. + */ +#define XXH_C23_VN 201711L +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_NOESCAPE for annotated pointers in public API. 
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape + * As of writing this, only supported by clang. + */ +#if XXH_HAS_ATTRIBUTE(noescape) +# define XXH_NOESCAPE __attribute__((noescape)) +#else +# define XXH_NOESCAPE +#endif +/*! @endcond */ + + +/*! + * @} + * @ingroup public + * @{ + */ + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> /* uint64_t */ + typedef uint64_t XXH64_hash_t; +#else +# include <limits.h> /* ULONG_MAX */ +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif + +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably.
+ * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * Must be freed with XXH64_freeState(). + * @return An allocated XXH64_state_t on success, `NULL` on failure. + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Frees an @ref XXH64_state_t. + * + * Must be allocated with XXH64_createState(). + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH64_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH64_update(). + * + * @param statePtr The state struct to reset. 
+ * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash64 value from that state. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. 
+ * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! + * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. 
For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief 64-bit unseeded variant of XXH3. + * + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see + * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief 64-bit seeded variant of XXH3 + * + * This variant generates a custom secret on the fly based on default secret + * altered using the `seed` value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). 
+ * + * @param input The data to hash + * @param length The length + * @param seed The 64-bit seed to alter the state. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief 64-bit variant of XXH3 with a custom "secret". + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing "XXH3_generateSecret()" instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. 
+ * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH3_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * This function resets `statePtr` and generate a secret with default parameters. Call it before @ref XXH3_64bits_update(). + * Digest will be equivalent to `XXH3_64bits()`. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * This function resets `statePtr` and generate a secret from `seed`. Call it before @ref XXH3_64bits_update(). + * Digest will be equivalent to `XXH3_64bits_withSeed()`. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the state. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); + +/*! + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, it _must outlive_ the hash streaming session. 
+ * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. + * + * @note + * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 64-bit hash value from that state. 
+ */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! + * @brief Unseeded 128-bit variant of XXH3 + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see + * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); +/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); +/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. 
+ * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have the same meaning as their 64-bit counterparts. + */ + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * This function resets `statePtr` and generates a secret with default parameters. Call it before @ref XXH3_128bits_update(). + * Digest will be equivalent to `XXH3_128bits()`. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * This function resets `statePtr` and generates a secret from `seed`. Call it before @ref XXH3_128bits_update(). + * Digest will be equivalent to `XXH3_128bits_withSeed()`. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the state. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_reset_withSecret(). */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. 
+ * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. + * + * @note + * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 128-bit hash value from that state. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* The following helper functions make it possible to compare XXH128_hash_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * @brief Compares two @ref XXH128_hash_t + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
+ * + * @return: >0 if *h128_1 > *h128_2 + * =0 if *h128_1 == *h128_2 + * <0 if *h128_1 < *h128_2 + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; + + +/*! + * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. + * + * @param dst The @ref XXH128_canonical_t pointer to be stored to. + * @param hash The @ref XXH128_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); + +/*! + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. + * + * @param src The @ref XXH128_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! 
+ * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v[4]; /*!< Accumulator lanes */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v[4]; /*!< Accumulator lanes */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. 
*/ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# include <stdalign.h> +# define XXH_ALIGN(n) alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64bits_update(), XXH3_128bits_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! + * @internal + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with a dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. 
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t useSeed; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number of stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char* extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed(). 
+ * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) \ + do { \ + XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ + tmp_xxh3_state_ptr->seed = 0; \ + tmp_xxh3_state_ptr->extSecret = NULL; \ + } while(0) + + +/*! + * simple alias to pre-selected XXH3_128bits variant + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * XXH3_generateSecret(): + * + * Derive a high-entropy secret from any user-defined content, named customSeed. + * The generated secret can be used in combination with `*_withSecret()` functions. + * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. + * + * The generated secret can then be used with any `*_withSecret()` variant. + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() + * are part of this list. They all accept a `secret` parameter + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) + * _and_ feature very high entropy (consist of random-looking bytes). 
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can + * be employed to ensure proper quality. + * + * @p customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch + * of zeroes. The resulting `secret` will nonetheless provide all required qualities. + * + * @pre + * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN + * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + * + * Example code: + * @code{.c} + * #include <stdio.h> + * #include <stdlib.h> + * #include <string.h> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Hashes argv[2] using the entropy from argv[1]. + * int main(int argc, char* argv[]) + * { + * char secret[XXH3_SECRET_SIZE_MIN]; + * if (argc != 3) { return 1; } + * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1])); + * XXH64_hash_t h = XXH3_64bits_withSecret( + * argv[2], strlen(argv[2]), + * secret, sizeof(secret) + * ); + * printf("%016llx\n", (unsigned long long) h); + * } + * @endcode + */ +XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize); + +/*! + * @brief Generate the same secret as the _withSeed() variants. + * + * The generated secret can be used in combination with + *`*_withSecret()` and `_withSecretandSeed()` variants. + * + * Example C++ `std::string` hash class: + * @code{.cpp} + * #include <string> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Slow, seeds each time + * class HashSlow { + * XXH64_hash_t seed; + * public: + * HashSlow(XXH64_hash_t s) : seed{s} {} + * size_t operator()(const std::string& x) const { + * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)}; + * } + * }; + * // Fast, caches the seeded secret for future uses. 
+ * class HashFast { + * unsigned char secret[XXH3_SECRET_SIZE_MIN]; + * public: + * HashFast(XXH64_hash_t s) { + * XXH3_generateSecret_fromSeed(secret, s); + * } + * size_t operator()(const std::string& x) const { + * return size_t{ + * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret)) + * }; + * } + * }; + * @endcode + * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes + * @param seed The seed to seed the state. + */ +XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed); + +/*! + * These variants generate hash values using either + * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes) + * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX). + * + * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. + * `_withSeed()` has to generate the secret on the fly for "large" keys. + * It's fast, but can be perceptible for "not so large" keys (< 1 KB). + * `_withSecret()` has to generate the masks on the fly for "small" keys, + * which requires more instructions than _withSeed() variants. + * Therefore, _withSecretandSeed variant combines the best of both worlds. + * + * When @p secret has been generated by XXH3_generateSecret_fromSeed(), + * this variant produces *exactly* the same results as `_withSeed()` variant, + * hence offering only a pure speed benefit on "large" input, + * by skipping the need to regenerate the secret for every large input. + * + * Another usage scenario is to hash the secret to a 64-bit hash value, + * for example with XXH3_64bits(), which then becomes the seed, + * and then employ both the seed and the secret in _withSecretandSeed(). + * On top of speed, an added benefit is that each bit in the secret + * has a 50% chance to swap each bit in the output, via its impact to the seed. 
+ * + * This is not guaranteed when using the secret directly in "small data" scenarios, + * because only portions of the secret are employed for small data. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed); +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#ifndef XXH_NO_STREAM +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#endif /* !XXH_NO_STREAM */ + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. 
+ * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. 
Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. This + * is mostly due to heavy usage to forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code. 
When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. + * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size. It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). 
+ */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH3_INLINE_SECRET + * @brief Determines whether to inline the XXH3 withSecret code. + * + * When the secret size is known, the compiler can improve the performance + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). + * + * However, if the secret size is not known, it doesn't have any benefit. This + * happens when xxHash is compiled into a global symbol. Therefore, if + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. + * + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers + * that are *sometimes* force inline on -Og, and it is impossible to automatically + * detect this optimization level. + */ +# define XXH3_INLINE_SECRET 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting is only possibly making a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! 
+ * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. + * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. + */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ +#endif /* XXH_DOXYGEN */ +/*! + * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. 
*/ +# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +#ifndef XXH_SIZE_OPT + /* default to 1 for -Os or -Oz */ +# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) +# define XXH_SIZE_OPT 1 +# else +# define XXH_SIZE_OPT 0 +# endif +#endif + +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ + /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ +# if XXH_SIZE_OPT >= 1 || \ + defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +#ifndef XXH_NO_INLINE_HINTS +# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +#ifndef XXH3_INLINE_SECRET +# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ + || !defined(XXH_INLINE_ALL) +# define XXH3_INLINE_SECRET 0 +# else +# define XXH3_INLINE_SECRET 1 +# endif +#endif + +#ifndef XXH32_ENDJMP +/* generally preferable for performance */ +# define XXH32_ENDJMP 0 +#endif + +/*! + * @defgroup impl Implementation + * @{ + */ + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +#if defined(XXH_NO_STREAM) +/* nothing */ +#elif defined(XXH_NO_STDLIB) + +/* When requesting to disable any mention of stdlib, + * the library loses the ability to invoke malloc / free. + * In practice, it means that functions like `XXH*_createState()` + * will always fail, and return NULL. + * This flag is useful in situations where + * xxhash.h is integrated into some kernel, embedded or limited environment + * without access to dynamic allocation.
+ */ + +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } +static void XXH_free(void* p) { (void)p; } + +#else + +/* + * Modify the local functions below should you wish to use + * different memory routines for malloc() and free() + */ +#include <stdlib.h> /* malloc(), free() */ + +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } + +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ +static void XXH_free(void* p) { free(p); } + +#endif /* XXH_NO_STDLIB */ + +#include <string.h> /* memcpy(), memset() */ + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy(). + */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((unused)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) +# define XXH_NO_INLINE static __attribute__((noinline)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + +#if
XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif + + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. + */ +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif +#endif + +#if (XXH_DEBUGLEVEL>=1) +# include <assert.h> /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# if defined(__INTEL_COMPILER) +# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) +# else +# define XXH_ASSERT(c) XXH_ASSUME(c) +# endif +#endif + +/* note: use after variable declarations */ +#ifndef XXH_STATIC_ASSERT +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) +# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# else +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) +# endif +# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) +#endif + +/*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere.
+ * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). + */ +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) +#else +# define XXH_COMPILER_GUARD(var) ((void)0) +#endif + +/* Specifically for NEON vectors which use the "w" constraint, on + * Clang. */ +#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) +# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) +#else +# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) +#endif + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> /* uint8_t */ + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + +#ifdef XXH_OLD_NAMES +# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif + +/* *** Memory access *** */ + +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. 
+ */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + + + +/* + * C23 and future versions have standard "unreachable()". + * Once it has been implemented reliably we can add it as an + * additional case: + * + * ``` + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) + * # include <stddef.h> + * # ifdef unreachable + * # define XXH_UNREACHABLE() unreachable() + * # endif + * #endif + * ``` + * + * Note C++23 also has std::unreachable() which can be detected + * as follows: + * ``` + * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) + * # include <utility> + * # define XXH_UNREACHABLE() std::unreachable() + * #endif + * ``` + * NB: `__cpp_lib_unreachable` is defined in the `<version>` header. + * We don't use that as including `<utility>` in `extern "C"` blocks + * doesn't work on GCC12 + */ + +#if XXH_HAS_BUILTIN(__builtin_unreachable) +# define XXH_UNREACHABLE() __builtin_unreachable() + +#elif defined(_MSC_VER) +# define XXH_UNREACHABLE() __assume(0) + +#else +# define XXH_UNREACHABLE() +#endif + +#if XXH_HAS_BUILTIN(__builtin_assume) +# define XXH_ASSUME(c) __builtin_assume(c) +#else +# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result.
+ */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. 
+ */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. 
+ * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. 
+ * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing + * the loop. NEON is only faster on the A53, and with the newer cores, it is less + * than half the speed. + * + * Additionally, this is used on WASM SIMD128 because it JITs to the same + * SIMD instructions and has the same issue. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize(). 
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! 
+ * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const 
xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); + } else { + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @ingroup XXH32_family + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64(xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ +/*! + * @} + * @defgroup XXH64_impl XXH64 implementation + * @ingroup impl + * + * Details on the XXH64 implementation. + * @{ + */ +/* #define rather that static const, to be used as initializers */ +#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ +#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ +#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ +#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ +#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +#ifdef XXH_OLD_NAMES +# define PRIME64_1 XXH_PRIME64_1 +# define PRIME64_2 XXH_PRIME64_2 +# define PRIME64_3 XXH_PRIME64_3 +# define PRIME64_4 XXH_PRIME64_4 +# define PRIME64_5 XXH_PRIME64_5 +#endif + +/*! @copydoc XXH32_round */ +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +/*! 
+ * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). + */ +static XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + if (ptr==NULL) XXH_ASSERT(len == 0); + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) { + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) { + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; + --len; + } + return XXH64_avalanche(hash); +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. 
+ */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + statePtr->v[1] = seed + XXH_PRIME64_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME64_1; + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + + do { + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); + } else { + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup XXH3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. 
Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. + */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +# define XXH_HAS_INCLUDE(x) __has_include(x) +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include +# endif +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* WASM SIMD128 via SIMDe */ +# define inline __inline__ /* circumvent a clang bug */ +# include +# undef inline +# elif defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# endif +#endif + +#if defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. 
+ * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. 
+ * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * internal macro XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< + * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 + * via the SIMDeverywhere polyfill provided with the + * Emscripten SDK. 
+ */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +# define XXH_SVE 6 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* wasm simd128 via SIMDe */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." 
+# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((may_alias)) +#else +# define XXH_ALIASING /* nothing */ +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. 
+ * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + +#if XXH_VECTOR == XXH_NEON + +/* + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. + * + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif + +/*! + * @internal + * @brief `vmlal_u32` on low and high halves of a vector. + * + * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with + * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` + * with `vmlal_u32`. 
+ */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* Inline assembly is the only way */ + __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); + return acc; +} +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* This intrinsic works as expected */ + return vmlal_high_u32(acc, lhs, rhs); +} +#else +/* Portable intrinsic versions */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); +} +/*! @copydoc XXH_vmlal_low_u32 + * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); +} +#endif + +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * This can be set to 2, 4, 6, or 8. + * + * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those + * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU + * bandwidth. + * + * This is even more noticeable on the more advanced cores like the Cortex-A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes + * and 2 scalar lanes, which is chosen by default. + * + * This does not apply to Apple processors or 32-bit processors, which run better with + * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes. 
+ * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. + * + * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning + * it effectively becomes worse 4. + * + * @see XXH3_accumulate_512_neon() + */ +# ifndef XXH3_NEON_LANES +# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ + && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 +# define XXH3_NEON_LANES 6 +# else +# define XXH3_NEON_LANES XXH_ACC_NB +# endif +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. 
*/ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + +# if defined(__s390x__) +# include +# else +# include +# endif + +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +/* + * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. + */ +typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +/*! + * A polyfill for POWER9's vec_revb(). + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/*! + * Performs an unaligned vector load and byte swaps it on big endian. 
+ */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ + /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum 
XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. XXH_SECRET_DEFAULT_SIZE bytes, declared with 64-byte alignment. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. 
+ * + * Wraps `__emulu` on MSVC x86 because MSVC tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t`, `_umul128` or `__umulh` if available, otherwise uses a + * portable scalar version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t + * (low 64 bits in `low64`, high 64 bits in `high64`). + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * On wasm, a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. 
This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one, hence the + * `!defined(__wasm__)` test below. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to a single-operand MUL on x64. + * + * NOTE(review): the `#pragma intrinsic` below (and the one for __umulh further + * down) sits under `#ifndef _MSC_VER`, i.e. it is only emitted when _MSC_VER + * is NOT defined -- confirm this is intended. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. 
+ * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
*/ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Returns v64 ^ (v64 >> shift); @p shift is asserted to be in [0, 63]. Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed: + * xorshift by 37, multiply by PRIME_MX1, xorshift by 32. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= PRIME_MX1; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed. + * The len argument is added in between the two multiplications by PRIME_MX2. 
+ */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= PRIME_MX2; + h64 ^= (h64 >> 35) + len ; + h64 *= PRIME_MX2; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. 
They used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single-shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit targets. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * Bytes of `combined`, listed from least to most significant: + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} +/* 4..8 bytes: reads the first and the last 4 bytes (they overlap when len < 8), packs them into one 64-bit word, XORs it with a bitflip built from secret+8 / secret+16 and the modified seed, then finishes with XXH3_rrmxmx(). */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} +/* 9..16 bytes: reads the first and the last 8 bytes (they overlap when len < 16), XORs each with its own secret/seed bitflip, and avalanches len + swap64(lo) + hi + the folded 128-bit product of lo and hi. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + 
xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} +/* 0..16 bytes: dispatches on len to the 9to16, 4to8 or 1to3 routine; an empty input hashes only the seed and secret bytes 56..71. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. 
+ */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable this hack, like the XXH32 one */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is of interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. + * 17..128 bytes: 16-byte chunks are taken in pairs, one from the front and one + * from the back of the input (one pair per started 32 bytes of length); each + * chunk is mixed with its own 16 bytes of secret and the sum is avalanched. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); +#endif + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 +/* 129..XXH3_MIDSIZE_MAX bytes: the first eight 16-byte chunks are mixed into acc (which is then avalanched); the last 16 bytes and every further whole chunk are mixed into acc_end; the result is the avalanche of acc + acc_end. */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 /* extra secret byte offset used for chunks 8 and up */ + #define XXH3_MIDSIZE_LASTOFFSET 17 /* the last 16 input bytes use secret + XXH3_SECRET_SIZE_MIN - 17 */ + + { xxh_u64 acc = len * XXH_PRIME64_1; + xxh_u64 acc_end; + unsigned int const nbRounds = (unsigned int)len / 16; + unsigned int i; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + /* last bytes */ + acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + XXH_ASSERT(nbRounds >= 8); + acc = XXH3_avalanche(acc); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * Everywhere else, it uses scalar code. 
+ * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but such cases are usually relatively minor and/or not + * worth fixing. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + /* + * Prevents clang from unrolling the acc loop and interleaving it with this one. + */ + XXH_COMPILER_GUARD(acc); + acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + return XXH3_avalanche(acc + acc_end); + } +} + + +/* ======= Long Keys ======= */ + +#define XXH_STRIPE_LEN 64 /* nb of input bytes consumed at each accumulation */ +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST: byte distance ahead of the current stripe that is handed to XXH_PREFETCH() */ + +/* + * This macro generates an XXH3_accumulate_<name>() function. + * Its single argument selects the name suffix; inline and target attributes + * are written in front of the macro invocation. + * + * The name of the generated symbol is XXH3_accumulate_<name>() and it calls + * XXH3_accumulate_512_<name>() once per stripe. + * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function. 
+ */ +#define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + +/* Writes v64 to dst through XXH_memcpy(), passing it through XXH_swap64() first when XXH_CPU_LITTLE_ENDIAN is false. */ +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + XXH_memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as an argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define the __int64 type, + * requiring a workaround: xxh_i64 is int64_t in C++ and C99 or later (except on VMS), + * and long long otherwise. + */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. 
+ * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif +/* AVX512 accumulation of one stripe: a single 512-bit vector covers the whole XXH_STRIPE_LEN-byte stripe (see the static assert); acc is asserted to be 64-byte aligned, input and secret are read with unaligned loads. */ +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; (the upper half of each 64-bit lane, moved down) */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) /* defines XXH3_accumulate_avx512() */ + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. 
+ * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; (64-bit lanes, built from two 32x32->64 multiplies) */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} +/* Fills customSecret (asserted 64-byte aligned) with XXH3_kSecret plus a per-lane seed: seed64 in the even 64-bit lanes and 0 - seed64 in the odd ones (mask 0xAA). */ +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); /* takes the address of XXH_writeLE64 and discards it -- presumably to avoid an unused-function warning; TODO confirm */ + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) 
XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif /* XXH_VECTOR == XXH_AVX512 || XXH_DISPATCH_AVX512 */ + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif +/* AVX2 accumulation of one stripe, processed as XXH_STRIPE_LEN/sizeof(__m256i) 256-bit vectors; acc is asserted to be 32-byte aligned. */ +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; (the upper half of each 64-bit lane, moved down) */ + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) /* defines XXH3_accumulate_avx2() */ + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; (64-bit lanes, built from two 32x32->64 multiplies) */ + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} +/* Fills customSecret with XXH3_kSecret plus a per-lane seed: seed64 in the even 64-bit lanes and 0 - seed64 in the odd ones. */ +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler to: + * - not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these registers onto the stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 needs the loop to be unrolled manually (6 rounds, see the static assert above) */ + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = 
_mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif /* XXH_VECTOR == XXH_AVX2 || XXH_DISPATCH_AVX2 */ + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; (done with a 32-bit shuffle: the odd 32-bit elements land in the even positions, the only ones _mm_mul_epu32 reads) */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) /* defines XXH3_accumulate_sse2() */ + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; (64-bit lanes, built from two 32x32->64 multiplies) */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} +/* Fills customSecret with XXH3_kSecret plus a per-lane seed: seed64 in the low 64-bit lane and 0 - seed64 in the high one of every 128-bit vector. */ +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dst16' as modified will cause the compiler to: + * - not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these registers onto the stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + 
XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif /* XXH_VECTOR == XXH_SSE2 || XXH_X86DISPATCH */ + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON and WASM SIMD128. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + * + * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit + * integers instead of the other platforms which mask full 64-bit vectors, + * so the setup is more complicated than just shifting right. + * + * Additionally, there is an optimization for 4 lanes at once noted below. + * + * Since, as stated, the most optimal amount of lanes for Cortexes is 6, + * there needs to be *three* versions of the accumulate operation used + * for the remaining 2 lanes. + * + * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap + * nearly perfectly. 
 */

XXH_FORCE_INLINE void
XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
    {   /* GCC for darwin arm64 does not like aliasing here */
        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* xinput = (const uint8_t *) input;
        uint8_t const* xsecret = (const uint8_t *) secret;

        size_t i;
#ifdef __wasm_simd128__
        /*
         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
         * is constant propagated, which results in it converting it to this
         * inside the loop:
         *
         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
         *    ...
         *
         * This requires a full 32-bit address immediate (and therefore a 6 byte
         * instruction) as well as an add for each offset.
         *
         * Putting an asm guard prevents it from folding (at the cost of losing
         * the alignment hint), and uses the free offset in `v128.load` instead
         * of adding secret_offset each time which overall reduces code size by
         * about a kilobyte and improves performance.
         */
        XXH_COMPILER_GUARD(xsecret);
#endif
        /* Scalar lanes use the normal scalarRound routine */
        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
            XXH3_scalarRound(acc, input, secret, i);
        }
        /* From here on, i indexes 128-bit vectors (two 64-bit lanes each). */
        i = 0;
        /* 4 NEON lanes at a time. */
        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
            /* data_vec = xinput[i]; */
            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16));
            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16));
            /* key_vec = xsecret[i]; */
            uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16));
            uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16));
            /* data_swap = swap(data_vec) */
            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);

            /*
             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
             * get one vector with the low 32 bits of each lane, and one vector
             * with the high 32 bits of each lane.
             *
             * The intrinsic returns a double vector because the original ARMv7-a
             * instruction modified both arguments in place. AArch64 and SIMD128 emit
             * two instructions from this intrinsic.
             *
             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
             */
            uint32x4x2_t unzipped = vuzpq_u32(
                vreinterpretq_u32_u64(data_key_1),
                vreinterpretq_u32_u64(data_key_2)
            );
            /* data_key_lo = data_key & 0xFFFFFFFF */
            uint32x4_t data_key_lo = unzipped.val[0];
            /* data_key_hi = data_key >> 32 */
            uint32x4_t data_key_hi = unzipped.val[1];
            /*
             * Then, we can split the vectors horizontally and multiply which, as for most
             * widening intrinsics, have a variant that works on both high half vectors
             * for free on AArch64. A similar instruction is available on SIMD128.
             *
             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
             */
            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
            /*
             * Clang reorders
             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
             *    c += a;         // add     acc.2d, acc.2d, swap.2d
             * to
             *    c += a;         // add     acc.2d, acc.2d, swap.2d
             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
             *
             * While it would make sense in theory since the addition is faster,
             * for reasons likely related to umlal being limited to certain NEON
             * pipelines, this is worse. A compiler guard fixes this.
             */
            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
            /* xacc[i] = acc_vec + sum; */
            xacc[i] = vaddq_u64(xacc[i], sum_1);
            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
        }
        /* Operate on the remaining NEON lanes 2 at a time. */
        for (; i < XXH3_NEON_LANES / 2; i++) {
            /* data_vec = xinput[i]; */
            uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
            /* data_swap = swap(data_vec) */
            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
            /* For two lanes, just use VMOVN and VSHRN. */
            /* data_key_lo = data_key & 0xFFFFFFFF; */
            uint32x2_t data_key_lo = vmovn_u64(data_key);
            /* data_key_hi = data_key >> 32; */
            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
            /* Same Clang workaround as before */
            XXH_COMPILER_GUARD_CLANG_NEON(sum);
            /* xacc[i] = acc_vec + sum; */
            xacc[i] = vaddq_u64 (xacc[i], sum);
        }
    }
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)

/*
 * NEON scramble step. Lanes at index XXH3_NEON_LANES and above go through
 * XXH3_scalarScrambleRound(); the lanes below are processed two per vector:
 * xorshift by 47, XOR with the secret, multiply by XXH_PRIME32_1.
 *
 * `acc` must be 16-byte aligned (asserted).
 */
XXH_FORCE_INLINE void
XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc;
        uint8_t const* xsecret = (uint8_t const*) secret;

        size_t i;
        /* WASM uses operator overloads and doesn't need these. */
#ifndef __wasm_simd128__
        /* { prime32_1, prime32_1 } */
        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
        /* { 0, prime32_1, 0, prime32_1 } */
        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
#endif

        /* AArch64 uses both scalar and neon at the same time */
        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
            XXH3_scalarScrambleRound(acc, secret, i);
        }
        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            uint64x2_t acc_vec = xacc[i];
            uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);

            /* xacc[i] ^= xsecret[i]; */
            uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
            /* xacc[i] *= XXH_PRIME32_1 */
#ifdef __wasm_simd128__
            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
            xacc[i] = data_key * XXH_PRIME32_1;
#else
            /*
             * Expanded version with portable NEON intrinsics
             *
             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
             *
             * prod_hi = hi(data_key) * lo(prime) << 32
             *
             * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector
             * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits
             * and avoid the shift.
             */
            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
            /* Extract low bits for vmlal_u32 */
            uint32x2_t data_key_lo = vmovn_u64(data_key);
            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
#endif
        }
    }
}
#endif

#if (XXH_VECTOR == XXH_VSX)

/*
 * VSX accumulate step: folds one XXH_STRIPE_LEN-byte stripe of `input`,
 * keyed by `secret`, into the accumulators at `acc`, two 64-bit lanes per
 * iteration. `input` and `secret` are read with unaligned loads.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    /* presumed aligned */
    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
    xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */
    xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */
    xxh_u64x2 const v32 = { 32, 32 };
    size_t i;
    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
        /* data_vec = xinput[i]; */
        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
        /* key_vec = xsecret[i]; */
        xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
        xxh_u64x2 const data_key = data_vec ^ key_vec;
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
        xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
        /* acc_vec = xacc[i]; */
        xxh_u64x2 acc_vec = xacc[i];
        acc_vec += product;

        /* swap high and low halves */
#ifdef __s390x__
        acc_vec += vec_permi(data_vec, data_vec, 2);
#else
        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
#endif
        xacc[i] = acc_vec;
    }
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)

/*
 * VSX scramble step: xorshift each accumulator lane by 47, XOR with the
 * secret, then multiply by XXH_PRIME32_1 using two 32-bit widening products.
 *
 * `acc` must be 16-byte aligned (asserted); `secret` is read unaligned.
 */
XXH_FORCE_INLINE void
XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
        const xxh_u8* const xsecret = (const xxh_u8*) secret;
        /* constants */
        xxh_u64x2 const v32 = { 32, 32 };
        xxh_u64x2 const v47 = { 47, 47 };
        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
        size_t i;
        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            xxh_u64x2 const acc_vec = xacc[i];
            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);

            /* xacc[i] ^= xsecret[i]; */
            xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
            xxh_u64x2 const data_key = data_vec ^ key_vec;

            /* xacc[i] *= XXH_PRIME32_1 */
            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */
            xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
            xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
            xacc[i] = prod_odd + (prod_even << v32);
    }   }
}

#endif

#if (XXH_VECTOR == XXH_SVE)

/*
 * SVE accumulate step for a single stripe. The eight 64-bit accumulators are
 * loaded as one, two or four vectors depending on the element count reported
 * by svcntd(), run through ACCRND, and stored back.
 *
 * NOTE(review): ACCRND is a macro defined outside this chunk; it appears to
 * pick up xinput, xsecret, kSwap and mask from the enclosing scope -- confirm
 * against its definition.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
                   const void* XXH_RESTRICT input,
                   const void* XXH_RESTRICT secret)
{
    uint64_t *xacc = (uint64_t *)acc;
    const uint64_t *xinput = (const uint64_t *)(const void *)input;
    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
    /* Index vector {0,1,2,...} with the low bit flipped: {1,0,3,2,...}. */
    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
    uint64_t element_count = svcntd();
    if (element_count >= 8) {
        svbool_t mask = svptrue_pat_b64(SV_VL8);
        svuint64_t vacc = svld1_u64(mask, xacc);
        ACCRND(vacc, 0);
        svst1_u64(mask, xacc, vacc);
    } else if (element_count == 2) { /* sve128 */
        svbool_t mask = svptrue_pat_b64(SV_VL2);
        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
        ACCRND(acc0, 0);
        ACCRND(acc1, 2);
        ACCRND(acc2, 4);
        ACCRND(acc3, 6);
        svst1_u64(mask, xacc + 0, acc0);
        svst1_u64(mask, xacc + 2, acc1);
        svst1_u64(mask, xacc + 4, acc2);
        svst1_u64(mask, xacc + 6, acc3);
    } else {
        svbool_t mask = svptrue_pat_b64(SV_VL4);
        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
        ACCRND(acc0, 0);
        ACCRND(acc1, 4);
        svst1_u64(mask, xacc + 0, acc0);
        svst1_u64(mask, xacc + 4, acc1);
    }
}

/*
 * SVE multi-stripe accumulate: same dispatch on svcntd() as
 * XXH3_accumulate_512_sve(), but the accumulators stay in vector registers
 * across all `nbStripes` stripes and are stored back once at the end.
 * Each iteration advances xinput by 8 and xsecret by 1 uint64_t element and
 * issues a prefetch at xinput + 128. Does nothing when nbStripes is 0.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
               const xxh_u8* XXH_RESTRICT input,
               const xxh_u8* XXH_RESTRICT secret,
               size_t nbStripes)
{
    if (nbStripes != 0) {
        uint64_t *xacc = (uint64_t *)acc;
        const uint64_t *xinput = (const uint64_t *)(const void *)input;
        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
        uint64_t element_count = svcntd();
        if (element_count >= 8) {
            svbool_t mask = svptrue_pat_b64(SV_VL8);
            svuint64_t vacc = svld1_u64(mask, xacc + 0);
            do {
                /* svprfd(svbool_t, void *, enum svfprop); */
                svprfd(mask, xinput + 128, SV_PLDL1STRM);
                ACCRND(vacc, 0);
                xinput += 8;
                xsecret += 1;
                nbStripes--;
           } while (nbStripes != 0);

           svst1_u64(mask, xacc + 0, vacc);
        } else if (element_count == 2) { /* sve128 */
            svbool_t mask = svptrue_pat_b64(SV_VL2);
            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
            do {
                svprfd(mask, xinput + 128, SV_PLDL1STRM);
                ACCRND(acc0, 0);
                ACCRND(acc1, 2);
                ACCRND(acc2, 4);
                ACCRND(acc3, 6);
                xinput += 8;
                xsecret += 1;
                nbStripes--;
           } while (nbStripes != 0);

           svst1_u64(mask, xacc + 0, acc0);
           svst1_u64(mask, xacc + 2, acc1);
           svst1_u64(mask, xacc + 4, acc2);
           svst1_u64(mask, xacc + 6, acc3);
        } else {
            svbool_t mask = svptrue_pat_b64(SV_VL4);
            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
            do {
                svprfd(mask, xinput + 128, SV_PLDL1STRM);
                ACCRND(acc0, 0);
                ACCRND(acc1, 4);
                xinput += 8;
                xsecret += 1;
                nbStripes--;
           } while (nbStripes != 0);

           svst1_u64(mask, xacc + 0, acc0);
           svst1_u64(mask, xacc + 4, acc1);
       }
    }
}

#endif

/* scalar variants - universal */

#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
/*
 * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
 * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
 *
 * While this might not seem like much, as AArch64 is a 64-bit architecture, only
 * big Cortex designs have a full 64-bit multiplier.
 *
 * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
 * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
 * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
 *
 * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
 * not have this penalty and does the mask automatically.
 */
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
{
    xxh_u64 ret;
    /* note: %x = 64-bit register, %w = 32-bit register */
    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
    return ret;
}
#else
/*
 * Portable fallback: multiplies the low 32 bits of `lhs` and `rhs` into a
 * 64-bit product and adds `acc`.
 */
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
{
    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
}
#endif

/*!
 * @internal
 * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
 *
 * This is extracted to its own function because the NEON path uses a combination
 * of NEON and scalar.
+ */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */ +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__GNUC__) && defined(__aarch64__) + /* + * UGLY HACK: + * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes the compiler to assume + * that XXH3_kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * + * See XXH3_NEON_LANES for details on the pipsline. 
+ * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes the compiler to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + 
+#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret 
is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. + * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + 
XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); +} + +/* + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier to the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. + */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. 
+ */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); +} + + +/* === XXH3 streaming === */ +#ifndef XXH_NO_STREAM +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. 
This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ *
+ * Returns NULL when XXH_malloc() fails. The stored offset is always in
+ * [1, align], which is why the 128 upper bound matters: it must fit in the
+ * single xxh_u8 written at ptr[-1].
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             * (An already-aligned base yields offset == align, not 0.)
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ * A NULL argument is accepted and ignored.
+ */
+static void XXH_alignedFree(void* p)
+{
+    xxh_u8* const aligned = (xxh_u8*)p;
+    if (aligned == NULL) return;
+    /* The byte just before the aligned pointer holds the distance back to
+     * the pointer originally returned by XXH_malloc(). */
+    XXH_free(aligned - aligned[-1]);
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocate an @ref XXH3_state_t.
+ *
+ * The state comes from XXH_alignedMalloc() with a 64-byte alignment and is
+ * passed through XXH3_INITSTATE() before being returned.
+ *
+ * Must be freed with XXH3_freeState().
+ * @return An allocated XXH3_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state != NULL) {
+        XXH3_INITSTATE(state);
+    }
+    return state;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Frees an @ref XXH3_state_t.
+ *
+ * Must be allocated with XXH3_createState().
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ *                 `NULL` is tolerated (XXH_alignedFree() ignores it).
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree((void*)statePtr);
+    return XXH_OK;
+}
+
+/*!
@ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) +{ + XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const void* secret, size_t secretSize) +{ + size_t const initStart = offsetof(XXH3_state_t, bufferedSize); + size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); + XXH_ASSERT(statePtr != NULL); + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + memset((char*)statePtr + initStart, 0, initLength); + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; + statePtr->seed = seed; + statePtr->useSeed = (seed != 0); + statePtr->extSecret = (const unsigned char*)secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! 
@ingroup XXH3_family
+ * Resets `statePtr` for a new seeded hash.
+ * A seed of 0 is forwarded to XXH3_64bits_reset().
+ * @return XXH_ERROR if `statePtr` is NULL, XXH_OK otherwise.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    /* Must run before the reset below: it inspects the PREVIOUS seed/extSecret
+     * and skips regenerating customSecret when the same seed was already in use. */
+    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
+        XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); /* extSecret = NULL: update/digest then use customSecret */
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family
+ * Resets `statePtr` with both a caller-provided secret and a seed.
+ * Arguments are validated before the state is modified.
+ * @return XXH_ERROR if `statePtr` or `secret` is NULL or if
+ *         `secretSize < XXH3_SECRET_SIZE_MIN`, XXH_OK otherwise.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    statePtr->useSeed = 1; /* always, even if seed64==0 */
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block*
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset of the last block in @p secret
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; /* resume in the secret where the current block left off */
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0; /* a block boundary was just crossed */
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ *
+ * Input is accumulated in state->buffer; the accumulators are only advanced
+ * once more than XXH3_INTERNALBUFFER_SIZE bytes are pending, and at least one
+ * byte is always left buffered for the digest step.
+ * A NULL `input` is accepted (len is asserted to be 0) and returns XXH_OK.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating accumulators directly into state.
+         * Operating into stack space seems to enable proper optimization.
+         * clang, on the other hand, doesn't seem to need this trick */
+        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+        XXH_memcpy(acc, state->acc, sizeof(acc));
+#else
+        xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+        state->totalLen += len;
+        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+        /* small input : just fill in tmp buffer */
+        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK; /* accumulators untouched, so no write-back of the stack copy is needed */
+        }
+
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; /* the "- 1" keeps at least one byte unconsumed */
+            input = XXH3_consumeStripes(acc,
+                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                       input, nbStripes,
+                                       secret, state->secretLimit,
+                                       f_acc, f_scramble);
+            /* keep the last consumed stripe at the END of the buffer: XXH3_digest_long() reads it from there */
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+
+        }
+        /* Some remaining input (always) : buffer it */
+        XXH_ASSERT(input < bEnd);
+        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+        XXH_ASSERT(state->bufferedSize == 0);
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* save stack accumulators into state */
+        XXH_memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family
+ * Feeds `len` bytes of `input` into the streaming state (see XXH3_update()).
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/* Writes into `acc` the accumulators a long-input digest should merge; `state` is read-only here. */
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        size_t nbStripesSoFar = state->nbStripesSoFar; /* local copy: the state's counter must not advance */
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        /* Copy to temp buffer */
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        /* the missing leading bytes come from the tail of the buffer, where XXH3_update() saved the last consumed stripe */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        lastStripePtr = lastStripe;
+    }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+}
+
+/*!
@ingroup XXH3_family
+ * Returns the 64-bit hash of everything fed to `state` so far.
+ * `state` is not modified, so more input may be added afterwards.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_mergeAccs(acc,
+                              secret + XXH_SECRET_MERGEACCS_START,
+                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
+    }
+    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+    if (state->useSeed) /* useSeed (not seed != 0): reset_withSecretandSeed sets it even for seed 0 */
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                  secret, state->secretLimit + XXH_STRIPE_LEN); /* secretLimit + XXH_STRIPE_LEN == original secretSize */
+}
+#endif /* !XXH_NO_STREAM */
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* A doubled version of 1to3_64b with different constants. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
+                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+        XXH128_hash_t h128;
+        h128.low64 = XXH64_avalanche(keyed_lo);
+        h128.high64 = XXH64_avalanche(keyed_hi);
+        return h128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4); /* the two 4-byte reads overlap when len < 8 */
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64 ^= (m128.high64 >> 3);
+
+        m128.low64 = XXH_xorshift64(m128.low64, 35);
+        m128.low64 *= PRIME_MX2;
+        m128.low64 = XXH_xorshift64(m128.low64, 28);
+        m128.high64 = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64 input_hi = XXH_readLE64(input + len - 8); /* overlaps input_lo when len < 16 */
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             *    a + (b * c)
+             * Inverse Property: x + y - x == y
+             *    a + (b * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    a + (b * 1) + (b * (c - 1))
+             * Identity Property: x * 1 == x
+             *    a + b + (b * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64 ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64 = XXH3_avalanche(h128.low64);
+            h128.high64 = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ *
+ * Dispatches on `len` (9-16, 4-8, 1-3); the empty input is hashed inline
+ * from the seed and secret bytes 64..95 only.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+        {   XXH128_hash_t h128;
+            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
+            return h128;
+    }   }
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* each half mixes one 16-byte input and is then XORed with the raw bytes of the OTHER input */
+    acc.low64 += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+    return acc;
+}
+
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+
+#if XXH_SIZE_OPT >= 1
+        {
+            /* Smaller, but slightly slower. */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        /* unrolled form: pairs a 16-byte chunk from the front with one from the back, outermost pair mixed last */
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64 = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64 * XXH_PRIME64_1)
+                        + (acc.high64 * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64 = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        /*
+         * We set as `i` as offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) { /* first 128 bytes, always present since len > 128 */
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + i - 32,
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                seed);
+        }
+        /* last bytes */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            (XXH64_hash_t)0 - seed); /* note: negated seed and swapped input order for the final mix */
+
+        {   XXH128_hash_t h128;
+            h128.low64 = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64 * XXH_PRIME64_1)
+                        + (acc.high64 * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64 = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {   XXH128_hash_t h128;
+        /* same accumulators merged twice: low half keyed from the start of the secret, high half from its end */
+        h128.low64 = XXH3_mergeAccs(acc,
+                                    secret + XXH_SECRET_MERGEACCS_START,
+                                    (xxh_u64)len * XXH_PRIME64_1);
+        h128.high64 = XXH3_mergeAccs(acc,
+                                     secret + secretSize
+                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                     ~((xxh_u64)len * XXH_PRIME64_2));
+        return h128;
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
+ *
+ * seed64, secret and secretLen are ignored (cast to void below): this variant
+ * always hashes with XXH3_kSecret.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance to pass @p secretLen (when it's static)
+ * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ *
+ * seed64 is ignored (cast to void below).
+ */
+XXH3_WITH_SECRET_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                     XXH64_hash_t seed64,
+                                     XXH3_f_accumulate f_acc,
+                                     XXH3_f_scrambleAcc f_scramble,
+                                     XXH3_f_initCustomSecret f_initSec)
+{
+    if (seed64 == 0) /* seed 0: hash with XXH3_kSecret directly (unlike the 64-bit variant, not gated on XXH_SIZE_OPT) */
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed64);
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen; /* ignored: present only to match XXH3_hashLong128_f */
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+/* Signature shared by the long-input (len > XXH3_MIDSIZE_MAX) 128-bit hashers passed to XXH3_128bits_internal(). */
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     *
+     * Dispatch is purely on `len`; `f_hl128` is only called when
+     * len > XXH3_MIDSIZE_MAX.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);
+}
+
+
+/* === Public XXH128 API === */
+
+/*! @ingroup XXH3_family
+ * One-shot 128-bit hash of `len` bytes at `input`, using seed 0 and XXH3_kSecret.
+ */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup XXH3_family
+ * One-shot 128-bit hash using a caller-provided secret and seed 0.
+ * `secretSize` is only checked by XXH_ASSERT in XXH3_128bits_internal().
+ */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*!
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_64bits_update(state, input, len); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + state->secretLimit + XXH_STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ +/* 128-bit utility functions */ + +#include /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). 
+ * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + + + +/* ========================================== + * Secret generators + * ========================================== + */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; npool; + + Arena *arena = NULL; + if (pool->worker_arena) { + arena = pool->worker_arena->v[worker->id]; + } + + for (;;) { + // get task id + U64 next_task_id = ins_atomic_u64_inc_eval(&pool->next_task_id); + if (next_task_id > pool->task_count) { + break; + } + + // invoke task func + U64 task_id = next_task_id - 1; + pool->task_func(arena, worker->id, task_id, pool->task_data); + } +} + +internal void +tp_worker_main(void *raw_worker) +{ + TCTX tctx_; + tctx_init_and_equip(&tctx_); + + TP_Worker *worker = (TP_Worker *)raw_worker; + TP_Context *pool = worker->pool; + + while (pool->is_live) { + if (os_semaphore_take(pool->task_semaphore, max_U64)) { + tp_execute_tasks(worker); + + // before last worker takes semaphore wake up main worker + U64 take_count = ins_atomic_u64_dec_eval(&pool->take_count); + if (take_count == 1) { + 
os_semaphore_drop(pool->main_semaphore); + } + } else { + Assert(!"time out"); + } + } +} + +internal TP_Context * +tp_alloc(Arena *arena, U32 worker_count) +{ + ProfBeginDynamic("Alloc Thread Pool [Worker Count: %u]", worker_count); + Assert(worker_count > 0); + + // init pool + TP_Context *pool = push_array(arena, TP_Context, 1); + if (worker_count > 1) { + pool->task_semaphore = os_semaphore_alloc(0, worker_count - 1, str8(0,0)); + pool->main_semaphore = os_semaphore_alloc(0, 1, str8(0,0)); + } + pool->is_live = 1; + pool->worker_count = worker_count; + pool->worker_arr = push_array(arena, TP_Worker, worker_count); + + // init worker data + for (U64 i = 0; i < worker_count; i += 1) { + TP_Worker *worker = &pool->worker_arr[i]; + worker->id = i; + worker->pool = pool; + } + + // launch worker threads + for (U64 i = 1; i < worker_count; i += 1) { + TP_Worker *worker = &pool->worker_arr[i]; + worker->handle = os_thread_launch(tp_worker_main, worker, 0); + } + + ProfEnd(); + return pool; +} + +internal void +tp_release(TP_Context *pool) +{ + pool->is_live = 0; + os_semaphore_release(pool->task_semaphore); + os_semaphore_release(pool->main_semaphore); + for (U64 i = 1; i < pool->worker_count; i += 1) { + os_thread_detach(pool->worker_arr[i].handle); + } + MemoryZeroStruct(pool); +} + +internal TP_Arena * +tp_arena_alloc(TP_Context *pool) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(0,0); + Arena **arr = push_array(scratch.arena, Arena *, pool->worker_count); + for (U64 i = 0; i < pool->worker_count; ++i) { + arr[i] = arena_alloc(); + } + Arena **dst = push_array(arr[0], Arena *, pool->worker_count); + MemoryCopy(dst, arr, sizeof(Arena*) * pool->worker_count); + TP_Arena *worker_arena_arr = push_array(arr[0], TP_Arena, 1); + worker_arena_arr->count = pool->worker_count; + worker_arena_arr->v = dst; + scratch_end(scratch); + ProfEnd(); + return worker_arena_arr; +} + +internal void +tp_arena_release(TP_Arena **arena_ptr) +{ + ProfBeginFunction(); + for (U64 
i = 1; i < (*arena_ptr)->count; ++i) { + arena_release((*arena_ptr)->v[i]); + } + arena_release((*arena_ptr)->v[0]); + *arena_ptr = NULL; + ProfEnd(); +} + +internal TP_Temp +tp_temp_begin(TP_Arena *arena) +{ + ProfBeginFunction(); + + Temp first_temp = temp_begin(arena->v[0]); + + TP_Temp temp; + temp.count = arena->count; + temp.v = push_array_no_zero(first_temp.arena, Temp, arena->count); + + temp.v[0] = first_temp; + + for (U64 arena_idx = 1; arena_idx < arena->count; arena_idx += 1) { + temp.v[arena_idx] = temp_begin(arena->v[arena_idx]); + } + + ProfEnd(); + return temp; +} + +internal void +tp_temp_end(TP_Temp temp) +{ + ProfBeginFunction(); + for (U64 temp_idx = temp.count - 1; temp_idx > 0; temp_idx -= 1) { + temp_end(temp.v[temp_idx]); + } + ProfEnd(); +} + +internal void +tp_for_parallel(TP_Context *pool, TP_Arena *arena, U64 task_count, TP_TaskFunc *task_func, void *task_data) +{ + Assert(!arena || arena->count == pool->worker_count); + + // setup pool state + pool->worker_arena = arena; + pool->task_count = task_count; + pool->task_func = task_func; + pool->task_data = task_data; + pool->next_task_id = 0; + pool->take_count = 0; + + // do we have enough work for other workers? 
+ pool->take_count = Min(pool->task_count, pool->worker_count); + U64 drop_count = pool->take_count; + for (U64 worker_idx = 1; worker_idx < drop_count; worker_idx += 1) { + os_semaphore_drop(pool->task_semaphore); + } + + // execute tasks on main worker too + TP_Worker *main_worker = &pool->worker_arr[0]; + tp_execute_tasks(main_worker); + + if (drop_count > 1) { + // wait for workers to finish assigned tasks + os_semaphore_take(pool->main_semaphore, max_U64); + } +} + +internal Rng1U64 * +tp_divide_work(Arena *arena, U64 item_count, U32 worker_count) +{ + U64 per_count = CeilIntegerDiv(item_count, worker_count); + Rng1U64 *range_arr = push_array_no_zero(arena, Rng1U64, worker_count + 1); + for (U64 i = 0; i < worker_count; i += 1) { + range_arr[i] = rng_1u64(Min(item_count, i * per_count), + Min(item_count, i * per_count + per_count)); + } + + // thread_pool_dummy_range: + range_arr[worker_count] = rng_1u64(item_count, item_count); + + return range_arr; +} diff --git a/src/linker/thread_pool/thread_pool.h b/src/linker/thread_pool/thread_pool.h new file mode 100644 index 00000000..4ed6e5c2 --- /dev/null +++ b/src/linker/thread_pool/thread_pool.h @@ -0,0 +1,51 @@ +// Copyright (c) 2024 Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +#define THREAD_POOL_TASK_FUNC(name) void name(Arena *arena, U64 worker_id, U64 task_id, void *raw_task) +typedef THREAD_POOL_TASK_FUNC(TP_TaskFunc); + +typedef struct TP_Arena +{ + U64 count; + Arena **v; +} TP_Arena; + +typedef struct TP_Temp +{ + U64 count; + Temp *v; +} TP_Temp; + +typedef struct TP_Worker +{ + U64 id; + struct TP_Context *pool; + OS_Handle handle; +} TP_Worker; + +typedef struct TP_Context +{ + OS_Handle task_semaphore; + OS_Handle main_semaphore; + B32 is_live; + U32 worker_count; + TP_Worker *worker_arr; + TP_Arena *worker_arena; + U64 task_count; + TP_TaskFunc *task_func; + void *task_data; + volatile U64 next_task_id; + volatile U64 take_count; +} 
TP_Context; + +internal TP_Context * tp_alloc(Arena *arena, U32 worker_count); +internal void tp_release(TP_Context *pool); +internal TP_Arena * tp_arena_alloc(TP_Context *pool); +internal void tp_arena_release(TP_Arena **arena_ptr); +internal TP_Temp tp_temp_begin(TP_Arena *arena); +internal void tp_temp_end(TP_Temp temp); +internal void tp_for_parallel(TP_Context *pool, TP_Arena *arena, U64 task_count, TP_TaskFunc *task_func, void *task_data); +internal Rng1U64 * tp_divide_work(Arena *arena, U64 item_count, U32 worker_count); +