progress on making the basic parser.

2025-05-05 22:29:22 -04:00 · 2025-05-05 22:29:22 -04:00 · 567ec7d6fd
commit 567ec7d6fd
parent 490cb76d41
1 changed files with 286 additions and 15 deletions
--- a/demo.str_cache.c
+++ b/demo.str_cache.c
@ -40,7 +40,8 @@ int main()
 // Demo selection
 // #define DEMO__STR_SLICE
 // #define DEMO__FILE_READ_CONTENTS_V1
-#define DEMO__WATL_LEX_V1
+// #define DEMO__WATL_LEX_V1
 #define DEMO__WATL_PARSE_V1
 /*
 The above makes use of the following core concepts to achieve its net result:
@ -302,7 +303,7 @@ void slice_zero(SliceMem mem) {
 // Now for our "Version 1"
-#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
+#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1)
 struct FileOpResult
 {
@ -452,12 +453,12 @@ struct WATL_SliceTok {
 	SSIZE     len;
 };
-Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) {
+Str8 watl_tok_str8(WATL_SliceTok toks, WATL_Tok* tok) {
-	SSIZE start    = cast(SSIZE, toks->ptr);
+	SSIZE start    = cast(SSIZE, toks.ptr);
 	SSIZE curr     = cast(SSIZE, tok->code);
 	SSIZE offset   = curr - start;
-	SSIZE left     = toks->len - offset;
+	SSIZE left     = toks.len - offset;
-	B32   last_tok = (start + toks->len) == (curr + left);
+	B32   last_tok = (start + toks.len) == (curr + left);
 	Str8  text     = {0};
 	text.ptr = tok->code;
 	text.len = last_tok ? 
@ -494,8 +495,8 @@ void     farena_reset   (FArena* arena);
 void     farena_rewind  (FArena* arena, ArenaSP savepoint);
 ArenaSP  farena_save    (FArena  arena);
-#define  farena_push(arena, type)               farena__push(& arena, size_of(type), 1,      lit(stringify(type)))
+#define  farena_push(arena, type)                cast(type*,         farena__push(& arena, size_of(type), 1,      lit(stringify(type))).ptr)
-#define  farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
+#define  farena_push_array(arena, type, amount) pcast(Slice ## type, farena__push(& arena, size_of(type), amount, lit(stringify(type)))    )
 inline
 void api_farena_init(FArena* arena, SliceMem mem) {
@ -529,7 +530,7 @@ inline void    farena_reset(FArena* arena) { arena->used = 0; }
 // Produces a savepoint for `arena` by value; the savepoint's `ptr` is set to `arena.start`.
 // NOTE(review): the arena's `used` offset is not captured here, so a later rewind presumably cannot
 // restore a mid-arena position — confirm against farena_rewind.
 inline ArenaSP farena_save (FArena  arena) { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; } 
 #pragma endregion FArena
-#ifdef DEMO__WATL_LEX_V1
+#if defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1)
 struct WATL_LexInfo {
 	// For now just the tokens
@ -569,7 +570,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
 			case WATL_Tok_Tab:
 			{
 				if (* prev != * cursor) {
-					tok       = farena_push(arena, WATL_Tok).ptr;
+					tok       = farena_push(arena, WATL_Tok);
 					tok->code = cursor;
 					was_text  = false;
 				}
@ -582,8 +583,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
 				// Assumes next is line feed.
 				cursor   += 1;
 			}
-			case WATL_Tok_LineFeed:
+			case WATL_Tok_LineFeed: {
 			{
 				cursor   += 1;
 				code      = * cursor;
 			}
@ -593,7 +593,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
 			break;
 		}
 		if (! was_text) {
-			tok       = farena_push(arena, WATL_Tok).ptr;
+			tok       = farena_push(arena, WATL_Tok);
 			tok->code = cursor;
 			was_text  = true;
 		}
@ -653,6 +653,277 @@ int main()
 /*
 Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
 * The tree will be top-level organized by lines consisting of linked slices of visible and non-visible tokens.
 We'll preserve whether or not a non-visible chunk is a tab or series of spaces.
 */
 // Forward declarations for the parser's result and options types; both structs are defined later in the file.
 typedef struct WATL_ParseInfo WATL_ParseInfo;
 typedef struct Opts__watl_parse Opts__watl_parse;
 // api_watl_parse receives the result destination as `info`; watl__parse wraps it and returns the info by value.
 void           api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts);
 WATL_ParseInfo watl__parse   (                      WATL_SliceTok tokens, Opts__watl_parse* opts);
 // Convenience wrapper: the variadic arguments become the initializer list of a temporary Opts__watl_parse
 // (e.g. designated initializers such as `.str_cache = & cache`) whose address is passed as `opts`.
 #define watl_parse(tokens, ...) watl__parse(tokens, & (Opts__watl_parse) {__VA_ARGS__})
 // One element of a singly linked chain of nodes.
 typedef struct WATL_Node WATL_Node;
 struct WATL_Node {
 	// Following node in the chain.
 	WATL_Node* next;
 	// Text carried by this node.
 	Str8       entry;
 };
 // One line of the parsed document; lines are chained through `next`.
 typedef struct WATL_Line WATL_Line;
 struct WATL_Line {
 	// Following line.
 	WATL_Line* next;
 	// Node associated with this line (presumably the head of the line's node chain — confirm against the parser).
 	WATL_Node* ptr;
 };
 // Pointer + length pair over WATL_Line, following the same {ptr, len} layout as the file's other slice types.
 typedef struct WATL_SliceLine WATL_SliceLine;
 struct WATL_SliceLine {
 	WATL_Line* ptr;
 	SSIZE      len;
 };
 /*
 For the sake of the exercise, we'll be eliminating the association with the file's strings and we'll need to instead cache them.
 */
 #pragma region Str8Cache
 // Forward declaration; the struct itself is defined after the slot types it depends on.
 typedef struct Str8Cache Str8Cache;
 // Two initializer flavours: api_str8cache_init fills a caller-provided cache, str8cache_init returns one by value.
 // Each takes three separate backing regions: string storage, the slot pool, and the table.
 void      api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
 Str8Cache str8cache_init    (                  SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
 // A cache like this relies on tabling string entries utilizing an index derived from a hashed ID.
 // For these strings we'll be using a hash called djb8:
 // Introducing a slice iterator:
 // Expands to the three clauses of a `for` header that walks a {ptr, len} slice, exposing the current
 // element pointer as `iter`. Usage: for (slice_iter(my_slice)) { use(*iter); }
 // The argument is parenthesized so non-trivial lvalue expressions (e.g. `*slice_ptr`) expand correctly.
 // Note: `container` is evaluated several times, so it must not have side effects.
 #define slice_iter(container) typeof((container).ptr) iter = (container).ptr; iter != ((container).ptr + (container).len); ++ iter
 // Folds every byte of `bytes` into the running hash stored at `hash`.
 // The caller seeds `*hash`; the result is accumulated in place so calls can be chained over several slices.
 inline
 void hash64_djb8(U64* hash, SliceByte bytes) {
 	for (SSIZE idx = 0; idx < bytes.len; ++ idx) {
 		// (h << 8) + h is h * 257 in wrapping unsigned arithmetic.
 		*hash = (*hash) * 257 + bytes.ptr[idx];
 	}
 }
 // For a library or codebase it's recommended to set up a metaprogram to generate hash-utilizing containers
 // Or other containers that cannot be sufficiently lifted to general runtime paths without losing ergonomic debug type info or type-constraint enforcements.
 // Unlike with the template markup C++ uses, you can strike a balance between how many definitions are redundantly made or optimized for collapsing to a general path
 // based on target optimization and debugability.
 // For this V1 example, we'll be hand-rolling a fixed sized table with excess slot chaining for colliding slots.
 // It's a relatively simple implementation to hand-roll. These things tend to become unwieldy with more advanced variants.
 // One entry of the string cache. A slot lives either directly in the table or in the overflow pool,
 // in which case it is linked behind the table slot it collided with.
 typedef struct Str8Cache_Slot Str8Cache_Slot;
 struct Str8Cache_Slot {
 	// Neighbours in the collision chain; `next` is followed during lookup and insertion.
 	Str8Cache_Slot* prev;
 	Str8Cache_Slot* next;
 	// The cached string and the full (untruncated) hash it was stored under.
 	Str8 value;
 	U64  key;
 	// False while the slot holds no entry; lookups skip such slots and insertion may claim them.
 	B32  occupied;
 };
 // Pointer + length pair over Str8Cache_Slot, following the file's {ptr, len} slice layout.
 typedef struct Str8Cache_SliceSlot Str8Cache_SliceSlot;
 struct Str8Cache_SliceSlot {
 	Str8Cache_Slot* ptr;
 	SSIZE           len;
 };
 // Fixed-size hash table of strings with chained overflow slots.
 struct Str8Cache {
 	// Arena the cached string bytes are copied into.
 	FArena               a_str;
 	// Overflow slots handed out when a table slot is already taken by a different key.
 	Str8Cache_SliceSlot  pool;
 	// Next overflow slot to hand out from `pool`.
 	Str8Cache_Slot*      vacant;
 	// Surface slots, indexed by `key % table.len`.
 	Str8Cache_SliceSlot  table;
 };
 // Value-returning convenience wrapper around api_str8cache_init.
 // The cache is zero-initialized first so that any member the initializer does not assign starts out as
 // a known value (null/zero) instead of whatever happened to be on the stack.
 Str8Cache str8cache_init(SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) {
 	Str8Cache cache = {0};
 	api_str8cache_init(& cache, mem_strs, mem_slots, mem_table);
 	return cache;
 }
 // Initializes `cache` over three caller-provided backing regions:
 //   mem_strs  - storage for the cached string bytes
 //   mem_slots - overflow slots used when two keys land on the same table index
 //   mem_table - the surface table indexed by the truncated hash
 // The backing lengths are byte counts, so they are converted to slot counts here; reinterpreting the
 // byte length as a slot count would let `key % table.len` index far past the end of the backing memory.
 // Both slot regions are zeroed so every slot starts unoccupied with null chain links.
 inline
 void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) {
 	assert(cache != nullptr);
 	slice_assert(mem_strs);
 	slice_assert(mem_slots);
 	slice_assert(mem_table);
 	SSIZE slot_size  = cast(SSIZE, size_of(Str8Cache_Slot));
 	SSIZE pool_slots  = cast(SSIZE, mem_slots.len) / slot_size;
 	SSIZE table_slots = cast(SSIZE, mem_table.len) / slot_size;
 	// A zero-length table would make the slot-id modulo divide by zero.
 	assert(pool_slots  > 0);
 	assert(table_slots > 0);
 	slice_zero(mem_slots);
 	slice_zero(mem_table);
 	cache->a_str     = farena_init(mem_strs);
 	cache->pool.ptr  = cast(Str8Cache_Slot*, mem_slots.ptr);
 	cache->pool.len  = pool_slots;
 	// The first overflow slot to hand out is the start of the pool.
 	cache->vacant    = cache->pool.ptr;
 	cache->table.ptr = cast(Str8Cache_Slot*, mem_table.ptr);
 	cache->table.len = table_slots;
 }
 // Marks every entry of the cache as unoccupied: each table slot and every slot chained behind it.
 // Chain links are left intact, so the overflow slots are reused by later insertions on the same chain.
 // NOTE(review): the string arena is not reset here, so previously cached bytes are not reclaimed.
 void str8cache_clear(Str8Cache* cache)
 {
 	assert(cache != nullptr);
 	for (slice_iter(cache->table))
 	{
 		iter->occupied = false;
 		// Walk the collision chain and clear each chained slot itself (not the surface slot again).
 		for (Str8Cache_Slot* probe_slot = iter->next; probe_slot != nullptr; probe_slot = probe_slot->next) {
 			probe_slot->occupied = false;
 		}
 	}
 }
 // We don't introduce a remove option because we're not tracking fixed sized entities.
 // Strings take up non-deterministic sizes of their backing arena. So the only thing that can be done with the cache is wiping it and recaching all strings.
 /*
 When storing a hash of a slot, we can almost never utilize the full width of a key,
 so we must truncate the key via modulo to get a "good enough" unique ID to place in the table.
 */
 // Maps a full-width hash key onto a table index by reducing it modulo the table length.
 inline
 U64 str8cache_slot_id(Str8Cache cache, U64 key) {
 	return key % cast(U64, cache.table.len);
 }
 // Looks up `key` and returns a pointer to the stored string, or nullptr when no occupied slot holds that key.
 // The search starts at the table slot the key maps to and then follows that slot's collision chain.
 Str8* str8cache_get(Str8Cache cache, U64 key)
 {
 	Str8Cache_Slot* slot = cache.table.ptr + str8cache_slot_id(cache, key);
 	while (slot != nullptr)
 	{
 		if (slot->occupied && slot->key == key) {
 			return & slot->value;
 		}
 		slot = slot->next;
 	}
 	return nullptr;
 }
 Str8* str8cache_set(Str8Cache* cache, U64 key, Str8 value)
 {
 	U64             hash_index   = str8cache_slot_id(*cache, key);
 	Str8Cache_Slot* surface_slot = & cache->table.ptr[hash_index];
 	if (! surface_slot->occupied || surface_slot->key == key)
 	{
 		if (value.ptr != surface_slot->value.ptr) {
 			SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8"));
 			slice_copy(mem, pcast(SliceMem, value));
 			surface_slot->value = pcast(Str8, mem);
 		}
 		surface_slot->key      = key;
 		surface_slot->occupied = true;
 		return & surface_slot->value;
 	}
 	Str8Cache_Slot* slot = surface_slot;
 	for (;; slot = slot->next)
 	{
 		if (slot->next == nullptr)
 		{
 			// We had a collision, we need to grab a vacant slot from the pool and utilize it instead.
 			slot->next       = cache->vacant;
 			* slot->next     = (Str8Cache_Slot){0};
 			slot->next->prev = slot;
 			Str8Cache_Slot* next_vacant = cache->vacant + 1;
 			assert(next_vacant < cache->pool.ptr + cache->pool.len );
 			// If the above fails we ran out of extra slots.
 			cache->vacant = cache->vacant + 1;
 		}
 		if ( ! slot->next->occupied || slot->next->key == key)
 		{
 			if (value.ptr != slot->next->value.ptr) {
 				SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8"));
 				slice_copy(mem, pcast(SliceMem, value));
 				slot->next->value = pcast(Str8, mem);
 			}
 			slot->next->value = value;
 			slot->next->key      = key;
 			slot->next->occupied = true;
 			return & slot->next->value;
 		}
 		// We keep traversing till we find a match or we find a vacancy for this list in the table.
 		// Make sure to tune the size of the table so it does this less! 
 		// Note: Tables sized by prime values collide less aswell. 
 		// You can use a closest prime number lookup table to derive what length to expose to the cache's table for hash ID resolution.
 	}
 	return nullptr;
 }
 #pragma endregion Str8Cache
 // Finally our abstracted cache interface:
 // Returns the cache's own copy of `str`, inserting it on first sight.
 // The key is the djb8 hash of the string's bytes. On a hit whose stored text matches byte-for-byte the
 // existing copy is returned, so repeated strings do not consume additional arena space. If the key is
 // present but the text differs (a hash collision), the entry is replaced via str8cache_set.
 Str8 cache_str8(Str8Cache* cache, Str8 str)
 {
 	assert(cache != nullptr);
 	U64 key = 0; hash64_djb8(& key, pcast(SliceByte, str));
 	Str8* cached = str8cache_get(* cache, key);
 	if (cached != nullptr && cached->len == str.len)
 	{
 		SSIZE matched = 0;
 		while (matched < str.len && cached->ptr[matched] == str.ptr[matched]) {
 			++ matched;
 		}
 		if (matched == str.len) {
 			return * cached;
 		}
 	}
 	Str8* result = str8cache_set(cache, key, str);
 	assert(result != nullptr);
 	return * result;
 }
 #ifdef DEMO__WATL_PARSE_V1
 // Options for the parser, normally built through the watl_parse(...) macro's designated initializers.
 struct Opts__watl_parse {
 	// Backing memory the parser turns into its node arena.
 	SliceMem   backing_nodes;
 	// Backing memory the parser turns into its line arena.
 	SliceMem   backing_lines;
 	// Cache that token text is interned into.
 	Str8Cache* str_cache;
 };
 // Result of a parse: the slice of parsed lines.
 struct WATL_ParseInfo {
 	WATL_SliceLine lines;
 };
 // Parses `tokens` into lines of linked nodes and writes the result to `info`.
 //   info   - receives the slice of lines (must not be null)
 //   tokens - token slice produced by the lexer
 //   opts   - backing memory for lines and nodes, plus the string cache node text is interned into
 // Every non-line-break token (visible text, spaces, tabs) becomes one WATL_Node whose `entry` is the cached
 // token text; nodes are appended to the current line in token order. A line feed, or a carriage return
 // optionally followed by a line feed token, ends the current line and starts a new, initially empty one.
 // Lines are both chained through `next` and reported as a slice; the slice view assumes consecutive
 // pushes on the line arena are contiguous — TODO confirm against farena__push.
 void api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts)
 {
 	assert(info != nullptr);
 	slice_assert(tokens);
 	assert(opts != nullptr);
 	assert(opts->str_cache != nullptr);
 	FArena a_lines = farena_init(opts->backing_lines);
 	FArena a_nodes = farena_init(opts->backing_nodes);
 	WATL_Line* line = farena_push(a_lines, WATL_Line);
 	* line          = (WATL_Line){0};
 	// Last node appended to the current line; nullptr while the line is still empty.
 	WATL_Node* tail = nullptr;
 	info->lines.ptr = line;
 	info->lines.len = 1;
 	for (slice_iter(tokens))
 	{
 		switch (* iter->code)
 		{
 			case WATL_Tok_CarriageReturn: {
 				// Treat CR LF as a single line break, but never step past the final token.
 				if ((iter + 1) != (tokens.ptr + tokens.len) && * (iter + 1)->code == WATL_Tok_LineFeed) {
 					++ iter;
 				}
 			}
 			// fallthrough
 			case WATL_Tok_LineFeed: {
 				WATL_Line* new_line = farena_push(a_lines, WATL_Line);
 				* new_line       = (WATL_Line){0};
 				line->next       = new_line;
 				line             = new_line;
 				tail             = nullptr;
 				info->lines.len += 1;
 				continue;
 			}
 			default:
 			break;
 		}
 		WATL_Node* node = farena_push(a_nodes, WATL_Node);
 		* node          = (WATL_Node){0};
 		node->entry     = cache_str8(opts->str_cache, watl_tok_str8(tokens, iter));
 		if (tail == nullptr) {
 			line->ptr = node;
 		}
 		else {
 			tail->next = node;
 		}
 		tail = node;
 	}
 }
 // Value-returning wrapper around api_watl_parse (this is what the watl_parse(...) macro calls).
 // It stays inside the DEMO__WATL_PARSE_V1 guard opened above, because WATL_ParseInfo and api_watl_parse
 // are only defined within that guard; the guard is closed after main.
 // The result is zero-initialized so it is well-defined even if the parser writes nothing.
 WATL_ParseInfo watl__parse(WATL_SliceTok tokens, Opts__watl_parse* opts) {
 	WATL_ParseInfo info = {0};
 	api_watl_parse(& info, tokens, opts);
 	return info;
 }
 // Demo driver: read this source file, lex it, then parse the tokens while interning their text in a string cache.
 int main()
 {
 	// This will limit for our V1 read to 64kb at most.
 	FMem_64KB    read_mem = {0};
 	FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
 	// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
 	SliceMem     mem_toks = slicemem_alloc(MEGABYTES(8));
 	WATL_LexInfo lex_res  = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
 	// Backing memory for the string cache: string bytes, overflow slots, and the table.
 	SliceMem mem_cache_strs  = slicemem_alloc(MEGABYTES(16));
 	SliceMem mem_cache_slots = slicemem_alloc(KILOBTYES(512));
 	SliceMem mem_cache_table = slicemem_alloc(KILOBTYES(64));
 	Str8Cache str_cache      = str8cache_init(mem_cache_strs, mem_cache_slots, mem_cache_table);
 	// Backing memory for the parser's nodes and lines.
 	SliceMem mem_parse_nodes = slicemem_alloc(MEGABYTES(4));
 	SliceMem mem_parse_lines = slicemem_alloc(MEGABYTES(4));
 	WATL_ParseInfo parse_res = watl_parse(lex_res.tokens, .backing_nodes = mem_parse_nodes, .backing_lines = mem_parse_lines, .str_cache = & str_cache);
 	// unnecessary in this case but if you want to explicitly:
 	// every allocation is released, in reverse order of acquisition.
 	slicemem_free(mem_parse_lines);
 	slicemem_free(mem_parse_nodes);
 	slicemem_free(mem_cache_table);
 	slicemem_free(mem_cache_slots);
 	slicemem_free(mem_cache_strs);
 	slicemem_free(mem_toks);
 	return 0;
 }
 #endif