V1 lex done
This commit is contained in:
parent
76fbeff084
commit
490cb76d41
284
demo.str_cache.c
284
demo.str_cache.c
@ -1,8 +1,6 @@
|
||||
/*
|
||||
An introduction to C11 with a str cache demo.
|
||||
Attempting to showcase better conventions and constructs in C; discovered by me as of 2025 while scouring the internet.
|
||||
|
||||
"C is old and flawed, but your use of it is most likely more flawed. You must have calluses to write with barbed syntax & semantics."
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -25,7 +23,7 @@ int main()
|
||||
StrCache cache = strcache_init(varena_ainfo(cache));
|
||||
|
||||
VArena file_arena; varena_init(file_arena);
|
||||
Str path_text = lit("../demo.strcache.c");
|
||||
Str8 path_text = lit("../demo.strcache.c");
|
||||
FileContent text_file = file_read_contents(varena_ainfo(file_arena), path_text);
|
||||
|
||||
Arena ast_arena; arena_init(ast_arena);
|
||||
@ -39,6 +37,11 @@ int main()
|
||||
}
|
||||
#endif
|
||||
|
||||
// Demo selection
|
||||
// #define DEMO__STR_SLICE
|
||||
// #define DEMO__FILE_READ_CONTENTS_V1
|
||||
#define DEMO__WATL_LEX_V1
|
||||
|
||||
/*
|
||||
The above makes use of the following core concepts to achieve its net result:
|
||||
* Slices
|
||||
@ -85,7 +88,6 @@ typedef signed __int64 S64;
|
||||
typedef size_t USIZE;
|
||||
typedef ptrdiff_t SSIZE;
|
||||
|
||||
|
||||
enum {
|
||||
false,
|
||||
true,
|
||||
@ -96,14 +98,31 @@ typedef S8 B8;
|
||||
typedef S16 B16;
|
||||
typedef S32 B32;
|
||||
|
||||
// Common macros we'll use throughout this.

// Asserts that `point` lies within [start, end] (inclusive on both ends).
// All three are converted to SSIZE so any pointer/integer mix compares uniformly.
// Wrapped in do/while(0) so it behaves as a single statement at the callsite.
#define assert_bounds(point, start, end) do { \
	SSIZE pos_point = cast(SSIZE, point); \
	SSIZE pos_start = cast(SSIZE, start); \
	SSIZE pos_end = cast(SSIZE, end); \
	assert(pos_start <= pos_point); \
	assert(pos_point <= pos_end); \
} while(0)
|
||||
|
||||
// Functional style cast
|
||||
#define cast(type, data) ((type)(data))
|
||||
#define cast(type, data) ((type)(data))
|
||||
#define pcast(type, data) * cast(type*, & data)
|
||||
|
||||
#define nullptr cast(void*, 0)
|
||||
|
||||
#define glue_(A, B) A ## B
|
||||
#define glue(A, B) glue_(A,B)
|
||||
|
||||
// Enforces size querying uses SSIZE type.
|
||||
#define size_of(data) cast(SSIZE, sizeof(data))
|
||||
|
||||
#define stringify_(S) #S
|
||||
#define stringify(S) stringify_(S)
|
||||
|
||||
/*
|
||||
The first construct we'll utilize is a String Slice.
|
||||
In modern programming with the memory sizes utilized, it is more ergonomic to track the length of strings with their pointer.
|
||||
@ -120,7 +139,6 @@ struct Str8 {
|
||||
#define lit(string_literal) (Str8){ string_literal, size_of(string_literal) - 1 }
|
||||
|
||||
// For now this string can visualized using a debugger.
|
||||
// #define DEMO__STR_SLICE
|
||||
#ifdef DEMO__STR_SLICE
|
||||
int main()
|
||||
{
|
||||
@ -147,7 +165,6 @@ typedef struct FileOpResult FileOpResult;
|
||||
typedef struct Opts__read_file_contents Opts__read_file_contents;
|
||||
void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_contents* opts);
|
||||
FileOpResult file__read_contents ( Str8 path, Opts__read_file_contents* opts);
|
||||
|
||||
#define file_read_contents(path, ...) file__read_contents(path, & (Opts__read_file_contents){__VA_ARGS__} )
|
||||
|
||||
/*
|
||||
@ -285,8 +302,7 @@ void slice_zero(SliceMem mem) {
|
||||
|
||||
// Now for our "Version 1"
|
||||
|
||||
#define DEMO__FILE_READ_CONTENTS_V1
|
||||
#ifdef DEMO__FILE_READ_CONTENTS_V1
|
||||
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
|
||||
|
||||
struct FileOpResult
|
||||
{
|
||||
@ -310,6 +326,7 @@ void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_con
|
||||
// Backing is required at this point
|
||||
slice_assert(opts->backing);
|
||||
|
||||
// This will limit a path for V1 to be 16kb worth of codepoints.
|
||||
FMem_16KB scratch = {0};
|
||||
char const* path_cstr = str8_to_cstr_capped(path, fmem_slice(scratch) );
|
||||
|
||||
@ -385,8 +402,257 @@ FileOpResult file__read_contents(Str8 path, Opts__read_file_contents* opts) {
|
||||
#ifdef DEMO__FILE_READ_CONTENTS_V1
// V1 demo entry point: read this source file into a fixed stack buffer.
int main()
{
	// This will limit for our V1 read to 64kb at most.
	FMem_64KB read_mem = {0};
	FileOpResult res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
	return 0;
}
// Fix: C11 (6.10) forbids extra tokens after #endif — the macro name must be
// inside a comment.
#endif // DEMO__FILE_READ_CONTENTS_V1
|
||||
|
||||
/*
|
||||
Now that we have file reading done we need to be able to process the content.
|
||||
|
||||
First we want to do lexical analysis. So we'll create a token listing delimiting aspects of the text file relevant to us.
|
||||
For our data structure, we are going for a Whitespace-Aware Text Layout; where we'll track text and the formatting around them.
|
||||
|
||||
Just like with the read file contents operation, we'll define an interface to performing this analysis.
|
||||
It will be called watl_lex and take the SliceMem from the file as a Str8 slice and some Opts__watl_lex;
|
||||
returning a WATL_LexInfo for providing user info on how the operation went.
|
||||
*/
|
||||
|
||||
// Lexing interface: takes the file's contents as a Str8 slice and reports
// the result of the analysis through a WATL_LexInfo.
typedef struct WATL_LexInfo WATL_LexInfo;
typedef struct Opts__watl_lex Opts__watl_lex;

// Out-parameter variant; `info` and `opts` must be non-null (asserted).
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts);
// Value-returning wrapper over api_watl_lex.
WATL_LexInfo watl__lex ( Str8 source, Opts__watl_lex* opts);
// Callsite sugar: options given as designated initializers,
// e.g. watl_lex(src, .pool_toks = mem).
#define watl_lex(source, ...) watl__lex(source, &(Opts__watl_lex){__VA_ARGS__})
|
||||
|
||||
/*
|
||||
Token identification will be done using a WATL_TokKind enumeration.
|
||||
The token type itself will be the id along with a ptr to its start of the slice. We can resolve the width of the token by its delta to the next token.
|
||||
If its the last token, then its delta is determined by its offset to the end of the Str8 slice.
|
||||
*/
|
||||
|
||||
// Token ids are the delimiting codepoint itself; anything not listed below is
// treated as visible text by the lexer.
typedef U32 WATL_TokKind;
enum WATL_TokKind {
	WATL_Tok_Space = ' ',
	WATL_Tok_Tab = '\t',
	WATL_Tok_CarriageReturn = '\r',
	WATL_Tok_LineFeed = '\n',
};

// A token only records where it starts in the source; its width is resolved
// as the delta to the next token (or to the end of the source for the last).
typedef struct WATL_Tok WATL_Tok;
struct WATL_Tok {
	char const* code; // Start of this token's text within the source slice.
};

// Slice (pointer + length) over a linearly-allocated array of tokens.
typedef struct WATL_SliceTok WATL_SliceTok;
struct WATL_SliceTok {
	WATL_Tok* ptr;
	SSIZE len; // Number of tokens.
};
|
||||
|
||||
// Resolves the string slice a token covers: intended as "delta to the next
// token's code pointer, or remainder of the source for the last token".
// NOTE(review): this function mixes two unrelated address spaces — `toks->ptr`
// addresses the token ARRAY while `tok->code` addresses the SOURCE text — and
// `toks->len` is a token count, not a byte count. The arithmetic below looks
// broken; confirm against the design described above watl_lex.
Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) {
	SSIZE start = cast(SSIZE, toks->ptr);
	SSIZE curr = cast(SSIZE, tok->code);
	SSIZE offset = curr - start;
	SSIZE left = toks->len - offset;
	// NOTE(review): algebraically (curr + left) == (start + toks->len) always
	// holds given the definitions above, so `last_tok` is a tautology — the
	// ternary's false branch is unreachable. Verify intent.
	B32 last_tok = (start + toks->len) == (curr + left);
	Str8 text = {0};
	text.ptr = tok->code;
	text.len = last_tok ?
		left
	// Otherwise it's the next token's start minus the current.
	// NOTE(review): `(tok + 1) - curr` is WATL_Tok* minus integer (pointer
	// arithmetic), not the code-pointer delta `(tok + 1)->code - tok->code`
	// the comment above describes — confirm which was meant.
		: cast(SSIZE, (tok + 1) - curr);
	return text;
}
|
||||
|
||||
/*
|
||||
Tokens are allocated to a backing slice of memory defined by the user. This pool of memory will ideally not be constrained to a fixed size on the stack.
|
||||
So for V1 we'll allocate 8 megs of heap memory to act as a pool for the tokens. We'll keep track of how much of the pool we used via a new memory tracking construct:
|
||||
The fixed-sized arena.
|
||||
|
||||
A basic fixed size arena only has three components which can vary depending on the convention the user prefers.
|
||||
In our case we'll track its capacity, its starting address, and how much has been committed.
|
||||
*/
|
||||
|
||||
// We use this in conjunction with Arenas to save a point that's safe to rewind to by the user.
typedef struct ArenaSP ArenaSP;
// Raw address inside an arena's backing buffer, captured at save time.
struct ArenaSP { void* ptr; };
|
||||
|
||||
#pragma region FArena
// Fixed-size arena: a linear (bump) allocator over one user-provided block.
typedef struct FArena FArena;
struct FArena {
	void* start;    // Base address of the backing memory.
	SSIZE capacity; // Total bytes available.
	SSIZE used;     // Bytes handed out so far (the allocation watermark).
};
void api_farena_init(FArena* arena, SliceMem mem);
FArena farena_init (SliceMem mem);
// Bumps the watermark by type_size * amount; dbg_typename is debug metadata only.
SliceMem farena__push (FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename);
void farena_reset (FArena* arena);
void farena_rewind (FArena* arena, ArenaSP savepoint);
ArenaSP farena_save (FArena arena);

// Callsite sugar: stringifies the pushed type for the debug name.
#define farena_push(arena, type) farena__push(& arena, size_of(type), 1, lit(stringify(type)))
#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
|
||||
|
||||
inline
|
||||
void api_farena_init(FArena* arena, SliceMem mem) {
|
||||
arena->start = mem.ptr;
|
||||
arena->capacity = mem.len;
|
||||
arena->used = 0;
|
||||
}
|
||||
inline FArena farena_init(SliceMem mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
|
||||
|
||||
inline
|
||||
SliceMem farena__push(FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename) {
|
||||
SSIZE to_commit = type_size * amount;
|
||||
SSIZE unused = arena->capacity - arena->used;
|
||||
assert(to_commit <= unused);
|
||||
|
||||
SliceMem result = {0};
|
||||
result.ptr = cast(void*, cast(SSIZE, arena->start) + arena->used);
|
||||
result.len = to_commit;
|
||||
arena->used += to_commit;
|
||||
return result;
|
||||
}
|
||||
|
||||
inline
|
||||
void farena_rewind(FArena* arena, ArenaSP savepoint) {
|
||||
void* end = cast(void*, cast(SSIZE, arena->start) + arena->used);
|
||||
assert_bounds(savepoint.ptr, arena->start, end);
|
||||
arena->used -= cast(SSIZE, savepoint.ptr) - cast(SSIZE, arena->start);
|
||||
}
|
||||
|
||||
inline void farena_reset(FArena* arena) { arena->used = 0; }
|
||||
inline ArenaSP farena_save (FArena arena) { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; }
|
||||
#pragma endregion FArena
|
||||
|
||||
#ifdef DEMO__WATL_LEX_V1

// Lex result: for V1, only the flat array of tokens.
struct WATL_LexInfo {
	// For now just the tokens
	WATL_SliceTok tokens;
};

struct Opts__watl_lex {
	/*
	For this operation we'll enforce that the arena must linearly allocate each token, forming a strictly adjacent set of elements in an array.
	This is not necessary and an equivalent process could be done where the tokens instead are semi-contiguously organized into linked list with a chained arena, or the tokens are sparsely cached.
	Where their position in their originating string is not preserved. In this case we're keeping it simple. Tokens are in the same block of memory and they don't use a string cache.
	*/
	// Backing memory the lexer's token arena allocates from.
	SliceMem pool_toks;
};
|
||||
|
||||
// We are assuming everything is utf8-ascii.
|
||||
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
|
||||
{
|
||||
assert(info != nullptr);
|
||||
slice_assert(source);
|
||||
assert(opts != nullptr);
|
||||
|
||||
FArena arena = farena_init(opts->pool_toks);
|
||||
|
||||
char const* end = source.ptr + source.len;
|
||||
char const* cursor = source.ptr;
|
||||
char const* prev = source.ptr;
|
||||
char code = * cursor;
|
||||
|
||||
B32 was_text = false;
|
||||
WATL_Tok* tok = nullptr;
|
||||
for (; cursor < end;)
|
||||
{
|
||||
switch (code)
|
||||
{
|
||||
case WATL_Tok_Space:
|
||||
case WATL_Tok_Tab:
|
||||
{
|
||||
if (* prev != * cursor) {
|
||||
tok = farena_push(arena, WATL_Tok).ptr;
|
||||
tok->code = cursor;
|
||||
was_text = false;
|
||||
}
|
||||
cursor += 1;
|
||||
code = * cursor;
|
||||
}
|
||||
continue;
|
||||
|
||||
case WATL_Tok_CarriageReturn: {
|
||||
// Assumes next is line feed.
|
||||
cursor += 1;
|
||||
}
|
||||
case WATL_Tok_LineFeed:
|
||||
{
|
||||
cursor += 1;
|
||||
code = * cursor;
|
||||
}
|
||||
continue;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (! was_text) {
|
||||
tok = farena_push(arena, WATL_Tok).ptr;
|
||||
tok->code = cursor;
|
||||
was_text = true;
|
||||
}
|
||||
prev = cursor;
|
||||
cursor += 1;
|
||||
code = * cursor;
|
||||
}
|
||||
info->tokens.ptr = arena.start;
|
||||
info->tokens.len = arena.used / size_of(WATL_Tok*);
|
||||
}
|
||||
|
||||
#endif DEMO__WATL_LEX_V1
|
||||
|
||||
inline
|
||||
WATL_LexInfo watl__lex(Str8 source, Opts__watl_lex* opts) {
|
||||
WATL_LexInfo result = {0};
|
||||
api_watl_lex(& result, source, opts);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
To allocate onto the heap we'll make a basic slicemem_alloc to allocate, and we'll make a corresponding slicemem_free as well.
|
||||
However we don't need to use it for the V1 example. The OS will cleanup the pages used by the process during its termination.
|
||||
*/
|
||||
|
||||
// Heap-allocates a SliceMem of `amount` bytes via malloc.
// Ownership: caller releases with slicemem_free (or lets the OS reclaim at exit).
SliceMem slicemem_alloc(SSIZE amount)
{
	// NOTE(review): `KILOBTYES` looks like a typo of `KILOBYTES` — confirm how
	// the macro is actually spelled where it's defined.
	// NOTE(review): this asserts a MINIMUM size (heap only for > 4kb blocks);
	// verify that direction is intended rather than a cap.
	assert(amount > KILOBTYES(4));
	void* result = malloc(amount);
	// Allocation failure is fatal for this demo.
	assert(result != nullptr);
	SliceMem mem = {
		.ptr = result,
		.len = amount
	};
	return mem;
}
|
||||
// Releases memory previously obtained from slicemem_alloc.
// (free(NULL) is a no-op, so a zeroed SliceMem is safe to pass.)
void slicemem_free(SliceMem mem) {
	free(mem.ptr);
}
|
||||
|
||||
#ifdef DEMO__WATL_LEX_V1
// V1 demo entry point: read this source file, then lex it into WATL tokens.
int main()
{
	// This will limit for our V1 read to 64kb at most.
	FMem_64KB file_mem = {0};
	FileOpResult file_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(file_mem) );

	// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
	SliceMem tok_pool = slicemem_alloc(MEGABYTES(8));
	WATL_LexInfo lexed = watl_lex(pcast(Str8, file_res.content), .pool_toks = tok_pool);
	// The OS reclaims this at process exit anyway; freeing to be explicit.
	slicemem_free(tok_pool);
	return 0;
}
#endif
|
||||
|
||||
/*
|
||||
Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
|
||||
* The tree will be top-level organized by lines consisting of linked slices of visible and non-visible tokens.
|
||||
|
||||
We'll preserve whether or not a non-visible chunk is a tab or series of spaces.
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user