diff --git a/demo.str_cache.c b/demo.str_cache.c index a1faf30..3fe6c8a 100644 --- a/demo.str_cache.c +++ b/demo.str_cache.c @@ -40,7 +40,8 @@ int main() // Demo selection // #define DEMO__STR_SLICE // #define DEMO__FILE_READ_CONTENTS_V1 -#define DEMO__WATL_LEX_V1 +// #define DEMO__WATL_LEX_V1 +#define DEMO__WATL_PARSE_V1 /* The above makes use of the following core concepts to achieve its net result: @@ -302,7 +303,7 @@ void slice_zero(SliceMem mem) { // Now for our "Version 1" -#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1) +#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1) struct FileOpResult { @@ -452,12 +453,12 @@ struct WATL_SliceTok { SSIZE len; }; -Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) { - SSIZE start = cast(SSIZE, toks->ptr); +Str8 watl_tok_str8(WATL_SliceTok toks, WATL_Tok* tok) { + SSIZE start = cast(SSIZE, toks.ptr); SSIZE curr = cast(SSIZE, tok->code); SSIZE offset = curr - start; - SSIZE left = toks->len - offset; - B32 last_tok = (start + toks->len) == (curr + left); + SSIZE left = toks.len - offset; + B32 last_tok = (start + toks.len) == (curr + left); Str8 text = {0}; text.ptr = tok->code; text.len = last_tok ? @@ -494,8 +495,8 @@ void farena_reset (FArena* arena); void farena_rewind (FArena* arena, ArenaSP savepoint); ArenaSP farena_save (FArena arena); -#define farena_push(arena, type) farena__push(& arena, size_of(type), 1, lit(stringify(type))) -#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type))) +#define farena_push(arena, type) cast(type*, farena__push(& arena, size_of(type), 1, lit(stringify(type))).ptr) +#define farena_push_array(arena, type, amount) pcast(Slice ## type, farena__push(& arena, size_of(type), amount, lit(stringify(type))) ) inline void api_farena_init(FArena* arena, SliceMem mem) { @@ -529,7 +530,7 @@ inline void farena_reset(FArena* arena) { arena->used = 0; } inline ArenaSP farena_save (FArena arena) { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; } #pragma endregion FArena -#ifdef DEMO__WATL_LEX_V1 +#if defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1) struct WATL_LexInfo { // For now just the tokens @@ -569,7 +570,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts) case WATL_Tok_Tab: { if (* prev != * cursor) { - tok = farena_push(arena, WATL_Tok).ptr; + tok = farena_push(arena, WATL_Tok); tok->code = cursor; was_text = false; } @@ -582,8 +583,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts) // Assumes next is line feed. cursor += 1; } - case WATL_Tok_LineFeed: - { + case WATL_Tok_LineFeed: { cursor += 1; code = * cursor; } @@ -593,7 +593,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts) break; } if (! was_text) { - tok = farena_push(arena, WATL_Tok).ptr; + tok = farena_push(arena, WATL_Tok); tok->code = cursor; was_text = true; } @@ -653,6 +653,277 @@ int main() /* Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree. * The tree will be top-level organized by lines consisting of linked slices of visble and non-visible tokens. - -We'll preserve whether or not a non-visible chunk is a tab or series of spaces. */ + +typedef struct WATL_ParseInfo WATL_ParseInfo; +typedef struct Opts__watl_parse Opts__watl_parse; +void api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts); +WATL_ParseInfo watl__parse ( WATL_SliceTok tokens, Opts__watl_parse* opts); +#define watl_parse(tokens, ...) watl__parse(tokens, & (Opts__watl_parse) {__VA_ARGS__}) + +typedef struct WATL_Node WATL_Node; +struct WATL_Node { + WATL_Node* next; + Str8 entry; +}; + +typedef struct WATL_Line WATL_Line; +struct WATL_Line { + WATL_Line* next; + WATL_Node* ptr; +}; + +typedef struct WATL_SliceLine WATL_SliceLine; +struct WATL_SliceLine { + WATL_Line* ptr; + SSIZE len; +}; + +/* +For the sake of the exercise, we'll be eliminating the association with the file's strings and we'll need to instead cache them. +*/ +#pragma region Str8Cache +typedef struct Str8Cache Str8Cache; +void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table); +Str8Cache str8cache_init ( SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table); + +// A cache like this relies on tabling string entires utiliszing an index derived from a hashed ID. +// For these strings we'll be using a hash called djb8: + +// Introducing a slice iterator: +#define slice_iter(container) typeof(container.ptr) iter = container.ptr; iter != (container.ptr + container.len); ++ iter + +inline +void hash64_djb8(U64* hash, SliceByte bytes) { + for (slice_iter(bytes)) { + *hash = (((*hash) << 8) + (*hash)) + (*iter); + } +} + +// For a library or codebase its recommended to setup a metaprogram to generate hash utilizing containers +// Or other containers that cannot be sufficiently lifted to general runtime paths without losing ergonomic debug type info or type-constraint enforcements. +// Unlike with the template markup C++ uses, you can strike a balance between how many definitions are redundantly made or optimized for collapsing to a general path +// based on target optimization and debugability. + +// For this V1 example, we'll be hand-rolling a fixed sized table with excess slot chaining for colliding slots. +// Its a relatively simple implementation to hand-roll. These things tend to become unyeilding with more advanced variants. + +typedef struct Str8Cache_Slot Str8Cache_Slot; +struct Str8Cache_Slot { + Str8Cache_Slot* prev; + Str8Cache_Slot* next; + Str8 value; + U64 key; + B32 occupied; +}; + +typedef struct Str8Cache_SliceSlot Str8Cache_SliceSlot; +struct Str8Cache_SliceSlot { + Str8Cache_Slot* ptr; + SSIZE len; +}; + +struct Str8Cache { + FArena a_str; + Str8Cache_SliceSlot pool; + Str8Cache_Slot* vacant; + Str8Cache_SliceSlot table; +}; + +Str8Cache str8cache_init(SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) { Str8Cache cache; api_str8cache_init(& cache, mem_strs, mem_slots, mem_table); return cache; } +inline +void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) { + assert(cache != nullptr); + slice_assert(mem_strs); + slice_assert(mem_slots); + slice_assert(mem_table); + cache->a_str = farena_init(mem_strs); + cache->pool = pcast(Str8Cache_SliceSlot, mem_slots); + cache->table = pcast(Str8Cache_SliceSlot, mem_table); +} + +void str8cache_clear(Str8Cache* cache) +{ + for (slice_iter(cache->table)) + { + if (iter == nullptr) { + continue; + } + for (Str8Cache_Slot* probe_slot = iter->next; probe_slot != nullptr; probe_slot = probe_slot->next) { + iter->occupied = false; + } + iter->occupied = false; + } +} + +// We don't introduce a remove option because we're not tracking fixed sized entities. +// Strings take up non-determistic sizes of their backing arena. So the only thing that can be done with the cache is wiping it and recaching all strings. + +/* +When storing a hash of a slot, we can almost never utilize the full width of a key, +so we must truncate the key via module to get a "good enough" unique ID to place in the table. +*/ +inline +U64 str8cache_slot_id(Str8Cache cache, U64 key) { + U64 hash_index = key % cast(U64, cache.table.len); + return hash_index; +} + +Str8* str8cache_get(Str8Cache cache, U64 key) +{ + U64 hash_index = str8cache_slot_id(cache, key); + Str8Cache_Slot* surface_slot = & cache.table.ptr[hash_index]; + if (surface_slot == nullptr) { + return nullptr; + } + if (surface_slot->occupied && surface_slot->key == key) { + return & surface_slot->value; + } + for (Str8Cache_Slot* slot = surface_slot->next; slot != nullptr; slot = slot->next) + { + if (slot->occupied && slot->key == key) { + return & slot->value; + } + } + return nullptr; +} + +Str8* str8cache_set(Str8Cache* cache, U64 key, Str8 value) +{ + U64 hash_index = str8cache_slot_id(*cache, key); + Str8Cache_Slot* surface_slot = & cache->table.ptr[hash_index]; + if (! surface_slot->occupied || surface_slot->key == key) + { + if (value.ptr != surface_slot->value.ptr) { + SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8")); + slice_copy(mem, pcast(SliceMem, value)); + surface_slot->value = pcast(Str8, mem); + } + surface_slot->key = key; + surface_slot->occupied = true; + return & surface_slot->value; + } + Str8Cache_Slot* slot = surface_slot; + for (;; slot = slot->next) + { + if (slot->next == nullptr) + { + // We had a collision, we need to grab a vacant slot from the pool and utilize it instead. + slot->next = cache->vacant; + * slot->next = (Str8Cache_Slot){0}; + slot->next->prev = slot; + + Str8Cache_Slot* next_vacant = cache->vacant + 1; + assert(next_vacant < cache->pool.ptr + cache->pool.len ); + // If the above fails we ran out of extra slots. + cache->vacant = cache->vacant + 1; + } + if ( ! slot->next->occupied || slot->next->key == key) + { + if (value.ptr != slot->next->value.ptr) { + SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8")); + slice_copy(mem, pcast(SliceMem, value)); + slot->next->value = pcast(Str8, mem); + } + slot->next->value = value; + slot->next->key = key; + slot->next->occupied = true; + return & slot->next->value; + } + // We keep traversing till we find a match or we find a vacancy for this list in the table. + // Make sure to tune the size of the table so it does this less! + // Note: Tables sized by prime values collide less aswell. + // You can use a closest prime number lookup table to derive what length to expose to the cache's table for hash ID resolution. + } + return nullptr; +} +#pragma endregion Str8Cache + +// Finally our abstracted cache interface: +Str8 cache_str8(Str8Cache* cache, Str8 str) +{ + U64 key = 0; hash64_djb8(& key, pcast(SliceByte, str)); + Str8* result = str8cache_set(cache, key, str); + assert(result != nullptr); + return * result; +} + +#ifdef DEMO__WATL_PARSE_V1 + +struct Opts__watl_parse { + SliceMem backing_nodes; + SliceMem backing_lines; + Str8Cache* str_cache; +}; + +struct WATL_ParseInfo { + WATL_SliceLine lines; +}; + +void api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts) +{ + assert(info != nullptr); + slice_assert(tokens); + assert(opts != nullptr); + + FArena a_lines = farena_init(opts->backing_lines); + FArena a_nodes = farena_init(opts->backing_nodes); + + WATL_Line* line = farena_push(a_lines, WATL_Line); + WATL_Node* curr = line->ptr->entry; + for (slice_iter(tokens)) + { + switch (* iter->code) + { + case WATL_Tok_Space: + case WATL_Tok_Tab: { + line->ptr->entry = cache_str8(opts->str_cache, watl_tok_str8(tokens, iter)); + continue; + } + break; + case WATL_Tok_CarriageReturn: { + ++ iter; + } + case WATL_Tok_LineFeed: { + WATL_Line* new_line = farena_push(a_lines, WATL_Line); + line = new_line; + continue; + } + + default: + break; + } + + + } +} + +#endif DEMO__WATL_PARSE_V1 + +WATL_ParseInfo watl__parse(WATL_SliceTok tokens, Opts__watl_parse* opts) { WATL_ParseInfo info; api_watl_parse(& info, tokens, opts); return info; } + +#ifdef DEMO__WATL_PARSE_V1 +int main() +{ + // This will limit for our V1 read to 64kb at most. + FMem_64KB read_mem = {0}; + FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) ); + + // This will limit our V1 lex to only 8 megs worth of token tracking on a file. + SliceMem mem_toks = slicemem_alloc(MEGABYTES(8)); + WATL_LexInfo lex_res = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks); + + SliceMem mem_cache_strs = slicemem_alloc(MEGABYTES(16)); + SliceMem mem_cache_slots = slicemem_alloc(KILOBTYES(512)); + SliceMem mem_cache_table = slicemem_alloc(KILOBTYES(64)); + Str8Cache str_cache = str8cache_init(mem_cache_strs, mem_cache_slots, mem_cache_table); + + SliceMem mem_parse_nodes = slicemem_alloc(MEGABYTES(4)); + SliceMem mem_parse_lines = slicemem_alloc(MEGABYTES(4)); + WATL_ParseInfo parse_res = watl_parse(lex_res.tokens, .backing_nodes = mem_parse_nodes, .backing_lines = mem_parse_lines, .str_cache = & str_cache); + + // unnecessary in this case but if you want to explicitly: + slicemem_free(mem_toks); + return 0; +} +#endif