V1 lex done

This commit is contained in:
Edward R. Gonzalez 2025-05-04 18:11:04 -04:00
parent 76fbeff084
commit 490cb76d41

View File

@ -1,8 +1,6 @@
/*
An introduction to C11 with a str cache demo.
Attempting to showcase better conventions and constructs in C; discovered by me as of 2025 from scouring the internet.
"C is old and flawed, but your use of it is most likely more flawed. You must have calluses to write with barbed syntax & semantics."
*/
/*
@ -25,7 +23,7 @@ int main()
StrCache cache = strcache_init(varena_ainfo(cache));
VArena file_arena; varena_init(file_arena);
Str path_text = lit("../demo.strcache.c");
Str8 path_text = lit("../demo.strcache.c");
FileContent text_file = file_read_contents(varena_ainfo(file_arena), path_text);
Arena ast_arena; arena_init(ast_arena);
@ -39,6 +37,11 @@ int main()
}
#endif
// Demo selection
// #define DEMO__STR_SLICE
// #define DEMO__FILE_READ_CONTENTS_V1
#define DEMO__WATL_LEX_V1
/*
The above makes use of the following core concepts to achieve its net result:
* Slices
@ -85,7 +88,6 @@ typedef signed __int64 S64;
typedef size_t USIZE;
typedef ptrdiff_t SSIZE;
enum {
false,
true,
@ -96,14 +98,31 @@ typedef S8 B8;
typedef S16 B16;
typedef S32 B32;
// Common macros we'll use throughout this.
// Bounds check: asserts start <= point <= end (all three compared as SSIZE).
#define assert_bounds(point, start, end) do { \
	SSIZE pos_point = cast(SSIZE, point); \
	SSIZE pos_start = cast(SSIZE, start); \
	SSIZE pos_end = cast(SSIZE, end); \
	assert(pos_start <= pos_point); \
	assert(pos_point <= pos_end); \
} while(0)
// Functional style cast
#define cast(type, data) ((type)(data))
// Type-punning cast: reinterprets data's storage through a type* pointer.
// NOTE(review): this is a strict-aliasing violation for incompatible types;
// fine under MSVC / -fno-strict-aliasing — confirm the build flags used.
#define pcast(type, data) * cast(type*, & data)
#define nullptr cast(void*, 0)
// Token pasting with one level of argument expansion.
#define glue_(A, B) A ## B
#define glue(A, B) glue_(A,B)
// Enforces size querying uses SSIZE type.
#define size_of(data) cast(SSIZE, sizeof(data))
// Stringification with one level of argument expansion.
#define stringify_(S) #S
#define stringify(S) stringify_(S)
/*
The first construct we'll utilize is a String Slice.
In modern programming with the memory sizes utilized, it is more ergonomic to track the length of strings with their pointer.
@ -120,7 +139,6 @@ struct Str8 {
#define lit(string_literal) (Str8){ string_literal, size_of(string_literal) - 1 }
// For now this string can visualized using a debugger.
// #define DEMO__STR_SLICE
#ifdef DEMO__STR_SLICE
int main()
{
@ -147,7 +165,6 @@ typedef struct FileOpResult FileOpResult;
typedef struct Opts__read_file_contents Opts__read_file_contents;
void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_contents* opts);
FileOpResult file__read_contents ( Str8 path, Opts__read_file_contents* opts);
#define file_read_contents(path, ...) file__read_contents(path, & (Opts__read_file_contents){__VA_ARGS__} )
/*
@ -285,8 +302,7 @@ void slice_zero(SliceMem mem) {
// Now for our "Version 1"
#define DEMO__FILE_READ_CONTENTS_V1
#ifdef DEMO__FILE_READ_CONTENTS_V1
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
struct FileOpResult
{
@ -310,6 +326,7 @@ void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_con
// Backing is required at this point
slice_assert(opts->backing);
// This will limit a path for V1 to be 16kb worth of codepoints.
FMem_16KB scratch = {0};
char const* path_cstr = str8_to_cstr_capped(path, fmem_slice(scratch) );
@ -385,8 +402,257 @@ FileOpResult file__read_contents(Str8 path, Opts__read_file_contents* opts) {
#ifdef DEMO__FILE_READ_CONTENTS_V1
int main()
{
	// This will limit our V1 read to 64kb at most.
	FMem_64KB read_mem = {0};
	FileOpResult res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
	(void) res; // Result is inspected via debugger for this demo; silences the unused-variable warning.
	return 0;
}
// Fix: ISO C forbids extra tokens after #endif — the macro name must be a comment.
#endif // DEMO__FILE_READ_CONTENTS_V1
/*
Now that we have file reading done we need to be able to process the content.
First we want to do lexical analysis. So we'll create a token listing delimiting aspects of the text file relevant to us.
For our data structure, we are going for a Whitespace-Aware Text Layout, where we'll track text and the formatting around it.
Just like with the read file contents operation, we'll define an interface to performing this analysis.
It will be called watl_lex and take the SliceMem from the file as a Str8 slice and some Opts__watl_lex;
returning a WATL_LexInfo for providing user info on how the operation went.
*/
typedef struct WATL_LexInfo WATL_LexInfo;
typedef struct Opts__watl_lex Opts__watl_lex;
// Out-parameter variant of the lexer; watl__lex wraps it to return by value.
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts);
WATL_LexInfo watl__lex ( Str8 source, Opts__watl_lex* opts);
// Optional-args front-end: trailing designated initializers populate Opts__watl_lex.
#define watl_lex(source, ...) watl__lex(source, &(Opts__watl_lex){__VA_ARGS__})
/*
Token identification will be done using a WATL_TokKind enumeration.
The token type itself will be the id along with a ptr to the start of its slice. We can resolve the width of the token by its delta to the next token.
If it's the last token, then its delta is determined by its offset to the end of the Str8 slice.
*/
// Token kind doubles as the token id: whitespace kinds reuse their ASCII codepoint,
// so any byte that is none of these is implicitly visible text.
typedef U32 WATL_TokKind;
enum WATL_TokKind {
WATL_Tok_Space = ' ',
WATL_Tok_Tab = '\t',
WATL_Tok_CarriageReturn = '\r',
WATL_Tok_LineFeed = '\n',
};
// A token only records where it starts; its width is recovered from the
// following token's start (see watl_tok_str8).
typedef struct WATL_Tok WATL_Tok;
struct WATL_Tok {
char const* code; // Start of this token's text within the source Str8.
};
typedef struct WATL_SliceTok WATL_SliceTok;
struct WATL_SliceTok {
WATL_Tok* ptr;
SSIZE len; // Number of tokens, not bytes.
};
// Resolves a token's text as a Str8 by measuring the distance to the next token.
// NOTE(review): several issues to confirm against intent before relying on this:
//   * `start` is an address in the token array while `curr` is an address in the
//     source text, so `offset` / `left` mix two unrelated address spaces.
//   * `last_tok` is algebraically always true:
//     start + len == curr + (len - (curr - start)) for any values.
//   * `(tok + 1) - curr` subtracts an integer from a WATL_Tok* (yielding a pointer);
//     the intended width is presumably (tok + 1)->code - tok->code.
//   A correct last-token length needs the source end, which this interface
//   does not carry — consider passing the source Str8 in.
Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) {
SSIZE start = cast(SSIZE, toks->ptr);
SSIZE curr = cast(SSIZE, tok->code);
SSIZE offset = curr - start;
SSIZE left = toks->len - offset;
B32 last_tok = (start + toks->len) == (curr + left);
Str8 text = {0};
text.ptr = tok->code;
text.len = last_tok ?
left
// Otherwise it's the next token's start minus curr.
: cast(SSIZE, (tok + 1) - curr);
return text;
}
/*
Tokens are allocated to a backing slice of memory defined by the user. This pool of memory will ideally not be constrained to a fixed size on the stack.
So for V1 we'll allocate 8 megs of heap memory to act as a pool for the tokens. We'll keep track of how much of the pool we used via a new memory tracking construct:
the fixed-size arena.
A basic fixed-size arena only has three components, which can vary depending on the convention the user prefers.
In our case we'll track its capacity, its starting address, and how much has been committed.
*/
// Used in conjunction with arenas to save a point that's safe to rewind to later.
typedef struct ArenaSP ArenaSP;
struct ArenaSP { void* ptr; }; // An address within an arena's committed range.
#pragma region FArena
typedef struct FArena FArena;
// Fixed-size arena: linear allocator over a caller-provided block of memory.
struct FArena {
void* start;    // Base address of the backing memory.
SSIZE capacity; // Total bytes available.
SSIZE used;     // Bytes handed out so far.
};
void api_farena_init(FArena* arena, SliceMem mem);
FArena farena_init (SliceMem mem);
SliceMem farena__push (FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename);
void farena_reset (FArena* arena);
void farena_rewind (FArena* arena, ArenaSP savepoint);
ArenaSP farena_save (FArena arena);
// Typed push helpers; the stringified type name is carried for debug visibility.
#define farena_push(arena, type) farena__push(& arena, size_of(type), 1, lit(stringify(type)))
#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
inline
void api_farena_init(FArena* arena, SliceMem mem) {
	// Adopt the caller's memory block as the backing store; nothing consumed yet.
	*arena = (FArena){ .start = mem.ptr, .capacity = mem.len, .used = 0 };
}
inline FArena farena_init(SliceMem mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
// Linearly allocates amount * type_size bytes from the arena.
// Asserts when the request exceeds the remaining capacity.
// dbg_typename is carried for debugger visibility only; it is not read here.
inline
SliceMem farena__push(FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename) {
	SSIZE request = type_size * amount;
	assert(request <= arena->capacity - arena->used);
	SliceMem result = {
		.ptr = cast(void*, cast(SSIZE, arena->start) + arena->used),
		.len = request,
	};
	arena->used += request;
	return result;
}
// Rewinds the arena so everything allocated at or after `savepoint` is discarded.
// The savepoint must lie within the arena's currently used range.
inline
void farena_rewind(FArena* arena, ArenaSP savepoint) {
	void* end = cast(void*, cast(SSIZE, arena->start) + arena->used);
	assert_bounds(savepoint.ptr, arena->start, end);
	// Fix: `used` becomes the savepoint's offset from the base. The previous code
	// subtracted that offset from `used`, rewinding by the wrong amount
	// (e.g. used=100, savepoint at offset 30 left used=70 instead of 30).
	arena->used = cast(SSIZE, savepoint.ptr) - cast(SSIZE, arena->start);
}
// Discards all allocations while keeping the backing memory.
inline void farena_reset(FArena* arena) { arena->used = 0; }
// Captures the current allocation point so the user can later rewind back to it.
// Fix: previously returned arena.start (the base), which made every savepoint
// equivalent to a full reset instead of the position at the time of the save.
inline ArenaSP farena_save(FArena arena) {
	ArenaSP savepoint;
	savepoint.ptr = cast(void*, cast(SSIZE, arena.start) + arena.used);
	return savepoint;
}
#pragma endregion FArena
#ifdef DEMO__WATL_LEX_V1
struct WATL_LexInfo {
// For now just the tokens
WATL_SliceTok tokens;
};
struct Opts__watl_lex {
/*
For this operation we'll enforce that the arena must linearly allocate each token, forming a strictly adjacent set of elements in an array.
This is not necessary and an equivalent process could be done where the tokens instead are semi-contiguously organized into a linked list with a chained arena, or the tokens are sparsely cached,
where their position in their originating string is not preserved. In this case we're keeping it simple. Tokens are in the same block of memory and they don't use a string cache.
*/
SliceMem pool_toks; // Backing memory the lexer's token array is allocated from.
};
// We are assuming everything is utf8-ascii.
// Lexes `source` into whitespace/text tokens allocated linearly from opts->pool_toks.
// Tokens only record their starting cursor; widths are resolved later (watl_tok_str8).
// NOTE(review): every `code = * cursor` below executes after `cursor` advances but
// before the loop re-checks `cursor < end`, so the final iteration reads one byte
// past the end of `source` — confirm and hoist the read behind the bound check.
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
{
assert(info != nullptr);
slice_assert(source);
assert(opts != nullptr);
FArena arena = farena_init(opts->pool_toks);
char const* end = source.ptr + source.len;
char const* cursor = source.ptr;
char const* prev = source.ptr;
char code = * cursor;
B32 was_text = false;
WATL_Tok* tok = nullptr;
for (; cursor < end;)
{
switch (code)
{
case WATL_Tok_Space:
case WATL_Tok_Tab:
{
// Start a new whitespace token only when the codepoint changes (runs are merged).
// NOTE(review): `prev` only advances in the text branch below, so a leading
// whitespace byte (where prev == cursor) produces no token — confirm intended.
if (* prev != * cursor) {
tok = farena_push(arena, WATL_Tok).ptr;
tok->code = cursor;
was_text = false;
}
cursor += 1;
code = * cursor;
}
continue;
case WATL_Tok_CarriageReturn: {
// Assumes next is line feed.
cursor += 1;
}
// fallthrough: the CR case consumes its LF here.
case WATL_Tok_LineFeed:
{
cursor += 1;
code = * cursor;
}
continue;
default:
break;
}
// Visible text: open a token on the first text byte after non-text.
// NOTE(review): the newline cases above never reset was_text, so text following
// a line break is folded into the preceding text token — confirm intended.
if (! was_text) {
tok = farena_push(arena, WATL_Tok).ptr;
tok->code = cursor;
was_text = true;
}
prev = cursor;
cursor += 1;
code = * cursor;
}
info->tokens.ptr = arena.start;
// NOTE(review): divides by size_of(WATL_Tok*) (a pointer's size); numerically equal
// to size_of(WATL_Tok) here, but the element size is what's meant.
info->tokens.len = arena.used / size_of(WATL_Tok*);
}
#endif DEMO__WATL_LEX_V1
// Return-by-value wrapper over the out-parameter lexing API.
inline
WATL_LexInfo watl__lex(Str8 source, Opts__watl_lex* opts) {
	WATL_LexInfo info = {0};
	api_watl_lex(& info, source, opts);
	return info;
}
/*
To allocate onto the heap we'll make a basic slicemem_alloc; we'll make a corresponding slicemem_free as well.
However we don't need to use it for the V1 example. The OS will clean up the pages used by the process during its termination.
*/
// Heap-allocates a SliceMem of `amount` bytes via malloc.
// Asserts the request is larger than 4kb — this helper is intended for large pools.
// NOTE(review): KILOBTYES looks misspelled (KILOBYTES) — presumably it matches the
// macro's definition elsewhere in the file; confirm before renaming.
SliceMem slicemem_alloc(SSIZE amount)
{
	assert(amount > KILOBTYES(4));
	void* ptr = malloc(amount);
	assert(ptr != nullptr);
	return (SliceMem){ .ptr = ptr, .len = amount };
}
// Releases memory previously obtained from slicemem_alloc.
void slicemem_free(SliceMem mem) {
free(mem.ptr);
}
#ifdef DEMO__WATL_LEX_V1
int main()
{
	// This will limit our V1 read to 64kb at most.
	FMem_64KB read_mem = {0};
	FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
	// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
	SliceMem mem_toks = slicemem_alloc(MEGABYTES(8));
	WATL_LexInfo lex_res = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
	(void) lex_res; // Inspected via debugger for this demo; silences the unused-variable warning.
	// Unnecessary in this case (the OS reclaims at exit), but to be explicit:
	slicemem_free(mem_toks);
	return 0;
}
#endif // DEMO__WATL_LEX_V1
/*
Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
* The tree will be top-level organized by lines consisting of linked slices of visble and non-visible tokens.
We'll preserve whether or not a non-visible chunk is a tab or series of spaces.
*/