progress on making the basic parser.

This commit is contained in:
Edward R. Gonzalez 2025-05-05 22:29:22 -04:00
parent 490cb76d41
commit 567ec7d6fd

View File

@ -40,7 +40,8 @@ int main()
// Demo selection
// #define DEMO__STR_SLICE
// #define DEMO__FILE_READ_CONTENTS_V1
#define DEMO__WATL_LEX_V1
// #define DEMO__WATL_LEX_V1
#define DEMO__WATL_PARSE_V1
/*
The above makes use of the following core concepts to achieve its net result:
@ -302,7 +303,7 @@ void slice_zero(SliceMem mem) {
// Now for our "Version 1"
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1)
struct FileOpResult
{
@ -452,12 +453,12 @@ struct WATL_SliceTok {
SSIZE len;
};
Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) {
SSIZE start = cast(SSIZE, toks->ptr);
Str8 watl_tok_str8(WATL_SliceTok toks, WATL_Tok* tok) {
SSIZE start = cast(SSIZE, toks.ptr);
SSIZE curr = cast(SSIZE, tok->code);
SSIZE offset = curr - start;
SSIZE left = toks->len - offset;
B32 last_tok = (start + toks->len) == (curr + left);
SSIZE left = toks.len - offset;
B32 last_tok = (start + toks.len) == (curr + left);
Str8 text = {0};
text.ptr = tok->code;
text.len = last_tok ?
@ -494,8 +495,8 @@ void farena_reset (FArena* arena);
void farena_rewind (FArena* arena, ArenaSP savepoint);
ArenaSP farena_save (FArena arena);
#define farena_push(arena, type) farena__push(& arena, size_of(type), 1, lit(stringify(type)))
#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
#define farena_push(arena, type) cast(type*, farena__push(& arena, size_of(type), 1, lit(stringify(type))).ptr)
#define farena_push_array(arena, type, amount) pcast(Slice ## type, farena__push(& arena, size_of(type), amount, lit(stringify(type))) )
inline
void api_farena_init(FArena* arena, SliceMem mem) {
@ -529,7 +530,7 @@ inline void farena_reset(FArena* arena) { arena->used = 0; }
inline ArenaSP farena_save (FArena arena) { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; }
#pragma endregion FArena
#ifdef DEMO__WATL_LEX_V1
#if defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1)
struct WATL_LexInfo {
// For now just the tokens
@ -569,7 +570,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
case WATL_Tok_Tab:
{
if (* prev != * cursor) {
tok = farena_push(arena, WATL_Tok).ptr;
tok = farena_push(arena, WATL_Tok);
tok->code = cursor;
was_text = false;
}
@ -582,8 +583,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
// Assumes next is line feed.
cursor += 1;
}
case WATL_Tok_LineFeed:
{
case WATL_Tok_LineFeed: {
cursor += 1;
code = * cursor;
}
@ -593,7 +593,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
break;
}
if (! was_text) {
tok = farena_push(arena, WATL_Tok).ptr;
tok = farena_push(arena, WATL_Tok);
tok->code = cursor;
was_text = true;
}
@ -653,6 +653,277 @@ int main()
/*
Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
* The tree will be top-level organized by lines consisting of linked slices of visible and non-visible tokens.
We'll preserve whether or not a non-visible chunk is a tab or series of spaces.
*/
// Parser interface: api_watl_parse fills a WATL_ParseInfo from lexed tokens;
// watl__parse is the value-returning form, and the watl_parse macro lets
// callers pass options as designated initializers.
typedef struct WATL_ParseInfo WATL_ParseInfo;
typedef struct Opts__watl_parse Opts__watl_parse;
void           api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts);
WATL_ParseInfo watl__parse   (                      WATL_SliceTok tokens, Opts__watl_parse* opts);
#define watl_parse(tokens, ...) watl__parse(tokens, & (Opts__watl_parse) {__VA_ARGS__})
// One chunk of a line: a singly-linked node carrying a (cache-interned) string.
typedef struct WATL_Node WATL_Node;
struct WATL_Node {
	WATL_Node* next;  // next chunk on the same line, or nullptr
	Str8       entry; // the chunk's text
};
// One line of the document: linked to the following line, pointing at its
// first node.
typedef struct WATL_Line WATL_Line;
struct WATL_Line {
	WATL_Line* next; // next line, or nullptr
	WATL_Node* ptr;  // head node of this line
};
// Contiguous view over parsed lines (pointer + count), mirroring the other
// Slice* types in this file.
typedef struct WATL_SliceLine WATL_SliceLine;
struct WATL_SliceLine {
	WATL_Line* ptr;
	SSIZE      len;
};
/*
For the sake of the exercise, we'll be eliminating the association with the file's strings and we'll need to instead cache them.
*/
#pragma region Str8Cache
typedef struct Str8Cache Str8Cache;
void      api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
Str8Cache str8cache_init    (                  SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
// A cache like this relies on tabling string entries utilizing an index derived from a hashed ID.
// For these strings we'll be using a hash called djb8:
// Introducing a slice iterator:
// NOTE: expands to a for-loop header declaring `iter`; `container` is
// evaluated multiple times, so only pass plain lvalues (as all uses here do).
#define slice_iter(container) typeof(container.ptr) iter = container.ptr; iter != (container.ptr + container.len); ++ iter
// djb8 rolling hash: folds each byte as h = h * 257 + byte
// (((h << 8) + h) == h * 257). The seed comes in through *hash,
// so chained calls keep accumulating into the same key.
inline
void hash64_djb8(U64* hash, SliceByte bytes) {
	typeof(bytes.ptr) cursor = bytes.ptr;
	typeof(bytes.ptr) end    = bytes.ptr + bytes.len;
	while (cursor != end) {
		* hash = ((* hash) << 8) + (* hash) + (* cursor);
		++ cursor;
	}
}
// For a library or codebase its recommended to setup a metaprogram to generate hash utilizing containers
// Or other containers that cannot be sufficiently lifted to general runtime paths without losing ergonomic debug type info or type-constraint enforcements.
// Unlike with the template markup C++ uses, you can strike a balance between how many definitions are redundantly made or optimized for collapsing to a general path
// based on target optimization and debugability.
// For this V1 example, we'll be hand-rolling a fixed sized table with excess slot chaining for colliding slots.
// It's a relatively simple implementation to hand-roll. These things tend to become unyielding with more advanced variants.
// One table slot. Surface slots live in `table`; colliding keys chain through
// `next` into slots claimed from `pool`.
typedef struct Str8Cache_Slot Str8Cache_Slot;
struct Str8Cache_Slot {
	Str8Cache_Slot* prev;     // previous slot in the collision chain
	Str8Cache_Slot* next;     // next slot in the collision chain, or nullptr
	Str8            value;    // interned string (bytes owned by a_str)
	U64             key;      // full-width djb8 key (not the truncated index)
	B32             occupied; // false == vacant / cleared
};
typedef struct Str8Cache_SliceSlot Str8Cache_SliceSlot;
struct Str8Cache_SliceSlot {
	Str8Cache_Slot* ptr;
	SSIZE           len;
};
struct Str8Cache {
	FArena              a_str;  // arena owning all interned string bytes
	Str8Cache_SliceSlot pool;   // spare slots handed out on collisions
	Str8Cache_Slot*     vacant; // next unused slot in pool
	Str8Cache_SliceSlot table;  // fixed-size surface table, indexed by key % len
};
Str8Cache str8cache_init(SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) { Str8Cache cache; api_str8cache_init(& cache, mem_strs, mem_slots, mem_table); return cache; }
// Wires a Str8Cache to its three backing memories: string arena, spare-slot
// pool, and the surface hash table.
inline
void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) {
	assert(cache != nullptr);
	slice_assert(mem_strs);
	slice_assert(mem_slots);
	slice_assert(mem_table);
	cache->a_str  = farena_init(mem_strs);
	cache->pool   = pcast(Str8Cache_SliceSlot, mem_slots);
	// Fix: seed the vacancy cursor. str8cache_set dereferences cache->vacant
	// on the first collision, and no other code path ever initializes it.
	cache->vacant = cache->pool.ptr;
	cache->table  = pcast(Str8Cache_SliceSlot, mem_table);
	// NOTE(review): the table's `occupied` flags must start false — this
	// assumes the backing allocation is zeroed; confirm slicemem_alloc does,
	// or slice_zero(mem_table) here.
}
// Marks every slot — surface and chained — vacant. The interned bytes stay
// in a_str; since strings are non-uniform in size, the only reclamation
// model for this cache is wipe-everything and recache (see note below).
void str8cache_clear(Str8Cache* cache)
{
	for (slice_iter(cache->table))
	{
		// Fix: the original cleared iter->occupied inside this inner loop,
		// leaving every chained slot still marked occupied. Clear the probe.
		for (Str8Cache_Slot* probe_slot = iter->next; probe_slot != nullptr; probe_slot = probe_slot->next) {
			probe_slot->occupied = false;
		}
		iter->occupied = false;
	}
}
// We don't introduce a remove option because we're not tracking fixed sized entities.
// Strings take up non-deterministic sizes of their backing arena. So the only thing that can be done with the cache is wiping it and recaching all strings.
/*
When storing a hash of a slot, we can almost never utilize the full width of a key,
so we must truncate the key via modulo to get a "good enough" unique ID to place in the table.
*/
// Truncates a full-width key to a surface-table index via modulo.
inline
U64 str8cache_slot_id(Str8Cache cache, U64 key) {
	return key % cast(U64, cache.table.len);
}
// Lookup by full-width key: resolve the surface slot from the truncated
// index, then walk it and its collision chain for an occupied slot whose
// key matches. Returns the interned string, or nullptr when absent.
Str8* str8cache_get(Str8Cache cache, U64 key)
{
	U64             hash_index = str8cache_slot_id(cache, key);
	Str8Cache_Slot* slot       = & cache.table.ptr[hash_index];
	for (; slot != nullptr; slot = slot->next)
	{
		if (slot->occupied && slot->key == key) {
			return & slot->value;
		}
	}
	return nullptr;
}
// Interns `value` under `key`. On first insertion the bytes are copied into
// the cache's arena so the entry outlives the caller's buffer; on a key match
// the existing entry is returned. Result points at cache-owned storage.
Str8* str8cache_set(Str8Cache* cache, U64 key, Str8 value)
{
	U64             hash_index   = str8cache_slot_id(*cache, key);
	Str8Cache_Slot* surface_slot = & cache->table.ptr[hash_index];
	if (! surface_slot->occupied || surface_slot->key == key)
	{
		// Only copy when this isn't already the cache's own storage.
		if (value.ptr != surface_slot->value.ptr) {
			SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8"));
			slice_copy(mem, pcast(SliceMem, value));
			surface_slot->value = pcast(Str8, mem);
		}
		surface_slot->key      = key;
		surface_slot->occupied = true;
		return & surface_slot->value;
	}
	Str8Cache_Slot* slot = surface_slot;
	for (;; slot = slot->next)
	{
		if (slot->next == nullptr)
		{
			// We had a collision: claim a vacant slot from the pool and chain it.
			slot->next       = cache->vacant;
			* slot->next     = (Str8Cache_Slot){0};
			slot->next->prev = slot;
			Str8Cache_Slot* next_vacant = cache->vacant + 1;
			assert(next_vacant < cache->pool.ptr + cache->pool.len );
			// If the above fails we ran out of extra slots.
			cache->vacant = next_vacant;
		}
		if ( ! slot->next->occupied || slot->next->key == key)
		{
			if (value.ptr != slot->next->value.ptr) {
				SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8"));
				slice_copy(mem, pcast(SliceMem, value));
				slot->next->value = pcast(Str8, mem);
			}
			// Fix: the original re-assigned slot->next->value = value here,
			// overwriting the arena copy just made with the caller's transient
			// pointer — defeating the interning. The surface path (above)
			// correctly keeps the copy; the chain path now matches it.
			slot->next->key      = key;
			slot->next->occupied = true;
			return & slot->next->value;
		}
		// We keep traversing till we find a match or a vacancy for this chain.
		// Make sure to tune the size of the table so it does this less!
		// Note: Tables sized by prime values collide less as well.
		// You can use a closest prime number lookup table to derive what length to expose to the cache's table for hash ID resolution.
	}
	return nullptr; // unreachable: the loop always matches or claims a slot
}
#pragma endregion Str8Cache
// Finally our abstracted cache interface:
// Interns `str` and returns the cache-owned copy of its contents.
Str8 cache_str8(Str8Cache* cache, Str8 str)
{
	U64 key = 0;
	hash64_djb8(& key, pcast(SliceByte, str));
	Str8* interned = str8cache_set(cache, key, str);
	assert(interned != nullptr);
	return * interned;
}
#ifdef DEMO__WATL_PARSE_V1
// Options for watl_parse: backing memory for node/line arenas plus the
// string cache that will own all interned text.
struct Opts__watl_parse {
	SliceMem   backing_nodes;
	SliceMem   backing_lines;
	Str8Cache* str_cache;
};
// Parse result: the document as a slice of lines.
struct WATL_ParseInfo {
	WATL_SliceLine lines;
};
// WIP (per this commit): parse lexed WATL tokens into lines of cache-interned
// string chunks, preserving whitespace runs.
void api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts)
{
	assert(info != nullptr);
	slice_assert(tokens);
	assert(opts != nullptr);
	// Separate fixed arenas for line headers and string nodes.
	FArena a_lines = farena_init(opts->backing_lines);
	FArena a_nodes = farena_init(opts->backing_nodes);
	WATL_Line* line = farena_push(a_lines, WATL_Line);
	// NOTE(review): line->ptr is dereferenced before any node has been pushed
	// from a_nodes, and `entry` is a Str8 assigned to a WATL_Node* (`curr` is
	// never read afterward) — unfinished node wiring; confirm intent.
	WATL_Node* curr = line->ptr->entry;
	for (slice_iter(tokens))
	{
		switch (* iter->code)
		{
			case WATL_Tok_Space:
			case WATL_Tok_Tab: {
				// Whitespace chunks are interned so their tab/space identity survives.
				line->ptr->entry = cache_str8(opts->str_cache, watl_tok_str8(tokens, iter));
				continue;
			}
			break;
			case WATL_Tok_CarriageReturn: {
				// CRLF: consume the CR, fall through to the LF handling.
				++ iter;
			}
			case WATL_Tok_LineFeed: {
				WATL_Line* new_line = farena_push(a_lines, WATL_Line);
				line = new_line;
				continue;
			}
			// NOTE(review): visible-text tokens currently fall through and are
			// dropped, and info->lines is never written before returning —
			// presumably both are pending work in this WIP commit.
			default:
			break;
		}
	}
}
// Value-returning wrapper for api_watl_parse (target of the watl_parse macro).
// Fix: moved inside the #ifdef — api_watl_parse is only defined when
// DEMO__WATL_PARSE_V1 is enabled, so defining this wrapper unconditionally
// fails to link with the demo off. Also fixed: bare tokens after #endif are
// non-conforming C; the guard name is now a comment.
WATL_ParseInfo watl__parse(WATL_SliceTok tokens, Opts__watl_parse* opts) {
	WATL_ParseInfo info;
	api_watl_parse(& info, tokens, opts);
	return info;
}
#endif // DEMO__WATL_PARSE_V1
#ifdef DEMO__WATL_PARSE_V1
// V1 parse demo: read this source file, lex it, then parse the tokens into
// lines using a string cache for all text.
int main()
{
	// This will limit for our V1 read to 64kb at most.
	FMem_64KB   read_mem = {0};
	FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
	// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
	SliceMem mem_toks = slicemem_alloc(MEGABYTES(8));
	WATL_LexInfo lex_res = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
	// Backing memory for the string cache: interned bytes, spare chain slots, hash table.
	// NOTE(review): "KILOBTYES" — assuming this matches the macro's actual
	// (possibly misspelled) name in the header; verify before renaming either side.
	SliceMem mem_cache_strs  = slicemem_alloc(MEGABYTES(16));
	SliceMem mem_cache_slots = slicemem_alloc(KILOBTYES(512));
	SliceMem mem_cache_table = slicemem_alloc(KILOBTYES(64));
	Str8Cache str_cache = str8cache_init(mem_cache_strs, mem_cache_slots, mem_cache_table);
	SliceMem mem_parse_nodes = slicemem_alloc(MEGABYTES(4));
	SliceMem mem_parse_lines = slicemem_alloc(MEGABYTES(4));
	// parse_res is currently unused — the parser is WIP in this commit.
	WATL_ParseInfo parse_res = watl_parse(lex_res.tokens, .backing_nodes = mem_parse_nodes, .backing_lines = mem_parse_lines, .str_cache = & str_cache);
	// unnecessary in this case but if you want to explicitly:
	slicemem_free(mem_toks);
	return 0;
}
#endif