diff --git a/demo.str_cache.c b/demo.str_cache.c
index cbd0eab..a1faf30 100644
--- a/demo.str_cache.c
+++ b/demo.str_cache.c
@@ -1,8 +1,6 @@
 /*
 An introduction to C11 with a str cache demo.
 Attempting to showcase better conventions and constructs in C;
 discovered as of 2025 from scouring the internet.
-
-"C is old and flawed, but your use of it is most likely more flawed. You must have calouses to write with barbed syntax & semantics."
 */
 /*
@@ -25,7 +23,7 @@ int main()
 	StrCache cache = strcache_init(varena_ainfo(cache));
 	VArena file_arena; varena_init(file_arena);
-	Str  path_text = lit("../demo.strcache.c");
+	Str8 path_text = lit("../demo.strcache.c");
 	FileContent text_file = file_read_contents(varena_ainfo(file_arena), path_text);
 	Arena ast_arena; arena_init(ast_arena);
@@ -39,6 +37,11 @@ int main()
 }
 #endif
+// Demo selection
+// #define DEMO__STR_SLICE
+// #define DEMO__FILE_READ_CONTENTS_V1
+#define DEMO__WATL_LEX_V1
+
 /*
 The above makes use of the following core concepts to achieve its net result:
 * Slices
@@ -85,7 +88,6 @@ typedef signed __int64 S64;
 typedef size_t    USIZE;
 typedef ptrdiff_t SSIZE;
-
 enum {
 	false,
 	true,
@@ -96,14 +98,31 @@ typedef S8 B8;
 typedef S16 B16;
 typedef S32 B32;
+// Common macros we'll use throughout this.
+
+#define assert_bounds(point, start, end) do { \
+	SSIZE pos_point = cast(SSIZE, point);     \
+	SSIZE pos_start = cast(SSIZE, start);     \
+	SSIZE pos_end   = cast(SSIZE, end);       \
+	assert(pos_start <= pos_point);           \
+	assert(pos_point <= pos_end);             \
+} while(0)
+
 // Functional style cast
-#define cast(type, data) ((type)(data))
+#define cast(type, data)  ((type)(data))
+#define pcast(type, data) * cast(type*, & data)
 #define nullptr cast(void*, 0)
+#define glue_(A, B) A ## B
+#define glue(A, B)  glue_(A, B)
+
 // Enforces that size querying uses the SSIZE type.
 #define size_of(data) cast(SSIZE, sizeof(data))
+#define stringify_(S) #S
+#define stringify(S)  stringify_(S)
+
 /*
 The first construct we'll utilize is a String Slice.
 With the memory sizes utilized in modern programming, it is more ergonomic to track the length of a string alongside its pointer.
@@ -120,7 +139,6 @@ struct Str8 {
 #define lit(string_literal) (Str8){ string_literal, size_of(string_literal) - 1 }
 
 // For now this string can be visualized using a debugger.
-// #define DEMO__STR_SLICE
 #ifdef DEMO__STR_SLICE
 int main()
 {
@@ -147,7 +165,6 @@ typedef struct FileOpResult FileOpResult;
 typedef struct Opts__read_file_contents Opts__read_file_contents;
 void         api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_contents* opts);
 FileOpResult file__read_contents   (                      Str8 path, Opts__read_file_contents* opts);
-
 #define file_read_contents(path, ...) file__read_contents(path, & (Opts__read_file_contents){__VA_ARGS__} )
 /*
@@ -285,8 +302,7 @@ void slice_zero(SliceMem mem) {
 // Now for our "Version 1"
-#define DEMO__FILE_READ_CONTENTS_V1
-#ifdef  DEMO__FILE_READ_CONTENTS_V1
+#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
 struct FileOpResult
 {
@@ -310,6 +326,7 @@ void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_con
 	// Backing is required at this point
 	slice_assert(opts->backing);
+	// This will limit a path for V1 to 16kb worth of codepoints.
 	FMem_16KB   scratch   = {0};
 	char const* path_cstr = str8_to_cstr_capped(path, fmem_slice(scratch) );
@@ -385,8 +402,257 @@ FileOpResult file__read_contents(Str8 path, Opts__read_file_contents* opts) {
 #ifdef DEMO__FILE_READ_CONTENTS_V1
 int main()
 {
+	// This will limit our V1 read to 64kb at most.
 	FMem_64KB    read_mem = {0};
 	FileOpResult res      = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
 	return 0;
 }
+#endif // DEMO__FILE_READ_CONTENTS_V1
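+
+/*
+An aside on the options pattern: file_read_contents is a macro so call sites get named,
+optional arguments via a compound literal. Illustratively, the call above expands to:
+
+	FileOpResult res = file__read_contents( lit("demo.str_cache.c"),
+		& (Opts__read_file_contents){ .backing = fmem_slice(read_mem) } );
+*/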
+
+/*
+Now that we have file reading done, we need to be able to process the content.
+
+First we want to do lexical analysis. So we'll create a token listing that delimits the aspects of the text file relevant to us.
+For our data structure, we are going for a Whitespace-Aware Text Layout, where we'll track text and the formatting around it.
+
+Just like with the read file contents operation, we'll define an interface for performing this analysis.
+It will be called watl_lex and take the SliceMem from the file as a Str8 slice and some Opts__watl_lex,
+returning a WATL_LexInfo that provides user info on how the operation went.
+*/
+
+typedef struct WATL_LexInfo   WATL_LexInfo;
+typedef struct Opts__watl_lex Opts__watl_lex;
+
+void         api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts);
+WATL_LexInfo watl__lex   (                    Str8 source, Opts__watl_lex* opts);
+#define watl_lex(source, ...) watl__lex(source, &(Opts__watl_lex){__VA_ARGS__})
+
+/*
+Token identification will be done using a WATL_TokKind enumeration.
+The token itself just carries a ptr to the start of its text within the source slice; its kind is identified by the codepoint at that ptr.
+We can resolve the width of a token by its delta to the next token.
+If it's the last token, its width is determined by its offset to the end of the Str8 slice.
+*/
+
+typedef U32 WATL_TokKind;
+enum WATL_TokKind {
+	WATL_Tok_Space          = ' ',
+	WATL_Tok_Tab            = '\t',
+	WATL_Tok_CarriageReturn = '\r',
+	WATL_Tok_LineFeed       = '\n',
+};
+
+typedef struct WATL_Tok WATL_Tok;
+struct WATL_Tok {
+	char const* code;
+};
+
+typedef struct WATL_SliceTok WATL_SliceTok;
+struct WATL_SliceTok {
+	WATL_Tok* ptr;
+	SSIZE     len;
+};
+
+// Resolving the width of the last token requires the end of the source string,
+// so the source slice is passed alongside the token listing.
+Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok, Str8 source) {
+	WATL_Tok* last    = toks->ptr + toks->len - 1;
+	B32       is_last = (tok == last);
+	Str8 text = {0};
+	text.ptr  = tok->code;
+	text.len  = is_last ?
+		// The last token's width is its delta to the end of the source.
+		cast(SSIZE, (source.ptr + source.len) - tok->code)
+		// Otherwise it's the next token's start minus this token's start.
+	:	cast(SSIZE, (tok + 1)->code - tok->code);
+	return text;
+}
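+
+/*
+A sketch of how the above composes (illustrative only; assumes a populated listing
+named toks and the source slice it was lexed from):
+
+	for (WATL_Tok* tok = toks.ptr; tok != toks.ptr + toks.len; ++ tok) {
+		Str8 text = watl_tok_str8(& toks, tok, source);
+		// text.ptr & text.len now delimit this token's codepoints within source.
+	}
+*/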
+
+/*
+Tokens are allocated to a backing slice of memory defined by the user. This pool of memory will ideally not be constrained to a fixed size on the stack.
+So for V1 we'll allocate 8 megs of heap memory to act as a pool for the tokens. We'll keep track of how much of the pool we used via a new memory tracking construct:
+the fixed-size arena.
+
+A basic fixed-size arena only has three components, which can vary depending on the convention the user prefers.
+In our case we'll track its capacity, its starting address, and how much has been committed.
+*/
+
+// We use this in conjunction with arenas to save a point that's safe for the user to rewind to.
+typedef struct ArenaSP ArenaSP;
+struct ArenaSP { void* ptr; };
+
+#pragma region FArena
+typedef struct FArena FArena;
+struct FArena {
+	void* start;
+	SSIZE capacity;
+	SSIZE used;
+};
+void     api_farena_init(FArena* arena, SliceMem mem);
+FArena   farena_init    (SliceMem mem);
+SliceMem farena__push   (FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename);
+void     farena_reset   (FArena* arena);
+void     farena_rewind  (FArena* arena, ArenaSP savepoint);
+ArenaSP  farena_save    (FArena  arena);
+
+#define farena_push(arena, type)               farena__push(& arena, size_of(type), 1,      lit(stringify(type)))
+#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
+
+inline
+void api_farena_init(FArena* arena, SliceMem mem) {
+	arena->start    = mem.ptr;
+	arena->capacity = mem.len;
+	arena->used     = 0;
+}
+inline FArena farena_init(SliceMem mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
+
+inline
+SliceMem farena__push(FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename) {
+	SSIZE to_commit = type_size * amount;
+	SSIZE unused    = arena->capacity - arena->used;
+	assert(to_commit <= unused);
+
+	SliceMem result = {0};
+	result.ptr   = cast(void*, cast(SSIZE, arena->start) + arena->used);
+	result.len   = to_commit;
+	arena->used += to_commit;
+	return result;
+}
+
+inline
+void farena_rewind(FArena* arena, ArenaSP savepoint) {
+	void* end = cast(void*, cast(SSIZE, arena->start) + arena->used);
+	assert_bounds(savepoint.ptr, arena->start, end);
+	// Rewinding sets used to the savepoint's offset from the arena's start.
+	arena->used = cast(SSIZE, savepoint.ptr) - cast(SSIZE, arena->start);
+}
+
+inline void    farena_reset(FArena* arena) { arena->used = 0; }
+// The savepoint captures the arena's current top, not its start.
+inline ArenaSP farena_save (FArena  arena) { ArenaSP savepoint; savepoint.ptr = cast(void*, cast(SSIZE, arena.start) + arena.used); return savepoint; }
+#pragma endregion FArena
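+
+/*
+A minimal sketch of the arena lifecycle (illustrative only; the backing slice here is
+assumed to come from the heap helper defined further below):
+
+	FArena    arena = farena_init(slicemem_alloc(MEGABYTES(1)));
+	WATL_Tok* tok   = farena_push(arena, WATL_Tok).ptr;  // bump-allocate one token
+	ArenaSP   sp    = farena_save(arena);                // remember the current top
+	farena_push_array(arena, WATL_Tok, 64);              // 64 more, strictly adjacent
+	farena_rewind(& arena, sp);                          // discard everything after sp
+	farena_reset (& arena);                              // discard everything
+*/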
+
+#ifdef DEMO__WATL_LEX_V1
+
+struct WATL_LexInfo {
+	// For now, just the tokens.
+	WATL_SliceTok tokens;
+};
+
+struct Opts__watl_lex {
+/*
+	For this operation we'll enforce that the arena must linearly allocate each token, forming a strictly adjacent set of elements in an array.
+	This is not necessary, and an equivalent process could be done where the tokens are instead semi-contiguously organized into a linked list with a chained arena, or the tokens are sparsely cached,
+	where their position in their originating string is not preserved. In this case we're keeping it simple: tokens are in the same block of memory and they don't use a string cache.
+*/
+	SliceMem pool_toks;
+};
+
+// We are assuming everything is utf8-ascii.
+void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
+{
+	assert(info != nullptr);
+	slice_assert(source);
+	assert(opts != nullptr);
+
+	FArena arena = farena_init(opts->pool_toks);
+
+	char const* end    = source.ptr + source.len;
+	char const* cursor = source.ptr;
+	char const* prev   = source.ptr;
+
+	B32       was_text = false;
+	WATL_Tok* tok      = nullptr;
+	for (; cursor < end;)
+	{
+		switch (* cursor)
+		{
+			case WATL_Tok_Space:
+			case WATL_Tok_Tab:
+			{
+				// Runs of the same whitespace codepoint coalesce into a single token.
+				if (tok == nullptr || * prev != * cursor) {
+					tok = farena_push(arena, WATL_Tok).ptr;
+					tok->code = cursor;
+					was_text  = false;
+				}
+				prev    = cursor;
+				cursor += 1;
+			}
+			continue;
+
+			case WATL_Tok_CarriageReturn: {
+				// Assumes the next codepoint is a line feed.
+				cursor += 1;
+			}
+			case WATL_Tok_LineFeed:
+			{
+				tok = farena_push(arena, WATL_Tok).ptr;
+				tok->code = cursor;
+				was_text  = false;
+				prev      = cursor;
+				cursor   += 1;
+			}
+			continue;
+
+			default:
+				break;
+		}
+		if (! was_text) {
+			tok = farena_push(arena, WATL_Tok).ptr;
+			tok->code = cursor;
+			was_text  = true;
+		}
+		prev    = cursor;
+		cursor += 1;
+	}
+	info->tokens.ptr = arena.start;
+	info->tokens.len = arena.used / size_of(WATL_Tok);
+}
+
+#endif // DEMO__WATL_LEX_V1
+
+inline
+WATL_LexInfo watl__lex(Str8 source, Opts__watl_lex* opts) {
+	WATL_LexInfo result = {0};
+	api_watl_lex(& result, source, opts);
+	return result;
+}
+
+/*
+To allocate onto the heap we'll make a basic slicemem_alloc; we'll make a corresponding slicemem_free as well.
+However, we don't need to use it for the V1 example. The OS will clean up the pages used by the process during its termination.
+*/
+
+SliceMem slicemem_alloc(SSIZE amount)
+{
+	assert(amount > KILOBYTES(4));
+	void* result = malloc(amount);
+	assert(result != nullptr);
+	SliceMem mem = {
+		.ptr = result,
+		.len = amount
+	};
+	return mem;
+}
+void slicemem_free(SliceMem mem) {
+	free(mem.ptr);
+}
+
+#ifdef DEMO__WATL_LEX_V1
+int main()
+{
+	// This will limit our V1 read to 64kb at most.
+	FMem_64KB    read_mem = {0};
+	FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
+
+	// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
+	SliceMem     mem_toks = slicemem_alloc(MEGABYTES(8));
+	WATL_LexInfo lex_res  = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
+	// Unnecessary in this case, but to be explicit:
+	slicemem_free(mem_toks);
+	return 0;
+}
+#endif // DEMO__WATL_LEX_V1
+
+/*
+Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
+* The tree will be organized top-level by lines, consisting of linked slices of visible and non-visible tokens.
+
+We'll preserve whether or not a non-visible chunk is a tab or a series of spaces.
+*/
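+
+/*
+To anchor the idea, one possible (hypothetical, V1-flavored) shape for that tree;
+the actual definitions come with the parser in the next step:
+
+	typedef struct WATL_Line WATL_Line;
+	struct WATL_Line {
+		WATL_Tok* ptr; // First token of the line.
+		SSIZE     len; // Token count up to (and including) its line feed.
+	};
+
+	typedef struct WATL_SliceLine WATL_SliceLine;
+	struct WATL_SliceLine {
+		WATL_Line* ptr;
+		SSIZE      len;
+	};
+*/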