V1 lex done

parent 76fbeff084
commit 490cb76d41

demo.str_cache.c (284 changed lines)
@@ -1,8 +1,6 @@
 /*
 An introduction to C11 with a str cache demo.
 Attempting to showcase better conventions and constructs in C, discovered by me as of 2025 from scouring the internet.
-
-"C is old and flawed, but your use of it is most likely more flawed. You must have calouses to write with barbed syntax & semantics."
 */
 
 /*
@@ -25,7 +23,7 @@ int main()
     StrCache cache = strcache_init(varena_ainfo(cache));
 
     VArena file_arena; varena_init(file_arena);
-    Str  path_text = lit("../demo.strcache.c");
+    Str8 path_text = lit("../demo.strcache.c");
     FileContent text_file = file_read_contents(varena_ainfo(file_arena), path_text);
 
     Arena ast_arena; arena_init(ast_arena);
@@ -39,6 +37,11 @@ int main()
 }
 #endif
 
+// Demo selection
+// #define DEMO__STR_SLICE
+// #define DEMO__FILE_READ_CONTENTS_V1
+#define DEMO__WATL_LEX_V1
+
 /*
 The above makes use of the following core concepts to achieve its net result:
 * Slices
@@ -85,7 +88,6 @@ typedef signed __int64 S64;
 typedef size_t    USIZE;
 typedef ptrdiff_t SSIZE;
 
-
 enum {
     false,
     true,
@@ -96,14 +98,31 @@ typedef S8 B8;
 typedef S16 B16;
 typedef S32 B32;
 
+// Common macros we'll use throughout this.
+
+#define assert_bounds(point, start, end) do { \
+    SSIZE pos_point = cast(SSIZE, point);     \
+    SSIZE pos_start = cast(SSIZE, start);     \
+    SSIZE pos_end   = cast(SSIZE, end);       \
+    assert(pos_start <= pos_point);           \
+    assert(pos_point <= pos_end);             \
+} while(0)
+
 // Functional style cast
 #define cast(type, data) ((type)(data))
+#define pcast(type, data) * cast(type*, & data)
 
 #define nullptr cast(void*, 0)
 
+#define glue_(A, B) A ## B
+#define glue(A, B)  glue_(A,B)
+
 // Enforces size querying uses SSIZE type.
 #define size_of(data) cast(SSIZE, sizeof(data))
 
+#define stringify_(S) #S
+#define stringify(S)  stringify_(S)
+
 /*
 The first construct we'll utilize is a String Slice.
 In modern programming with the memory sizes utilized, it is more ergonomic to track the length of strings with their pointer.
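Aside: a small, self-contained sketch of how the macros above compose (it is not part of the commit; it re-declares what it needs so it compiles on its own with any C11 compiler):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

typedef ptrdiff_t SSIZE;
#define cast(type, data)  ((type)(data))
#define pcast(type, data) * cast(type*, & data)
#define size_of(data)     cast(SSIZE, sizeof(data))
#define glue_(A, B)       A ## B
#define glue(A, B)        glue_(A, B)
#define stringify_(S)     #S
#define stringify(S)      stringify_(S)
#define assert_bounds(point, start, end) do { \
    SSIZE pos_point = cast(SSIZE, point);     \
    SSIZE pos_start = cast(SSIZE, start);     \
    SSIZE pos_end   = cast(SSIZE, end);       \
    assert(pos_start <= pos_point);           \
    assert(pos_point <= pos_end);             \
} while(0)

int main(void)
{
    unsigned int raw      = 0x3F800000;        // bit pattern of 1.0f
    float        as_float = pcast(float, raw); // pcast type-puns through a pointer cast
    int glue(demo_, 1)    = 42;                // token pasting: expands to `int demo_1 = 42;`
    char buffer[16];
    assert_bounds(buffer + 4, buffer, buffer + size_of(buffer)); // an in-range pointer passes
    printf("%s = %f, demo_1 = %d\n", stringify(as_float), as_float, demo_1);
    return 0;
}

Note that pcast reinterprets the bytes behind its operand through a pointer cast, so the usual strict-aliasing caveats apply to puns like the one above.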
@@ -120,7 +139,6 @@ struct Str8 {
 #define lit(string_literal) (Str8){ string_literal, size_of(string_literal) - 1 }
 
 // For now this string can be visualized using a debugger.
-// #define DEMO__STR_SLICE
 #ifdef DEMO__STR_SLICE
 int main()
 {
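Aside: the slice can also be printed without a debugger. A tiny sketch (not from the commit) that assumes the Str8 and cast definitions above; printf's %.*s conversion takes an explicit length, so the absent null terminator is not a problem:

#include <stdio.h>

void str8_print(Str8 str) {
    printf("%.*s\n", cast(int, str.len), str.ptr);
}
// Usage: str8_print(lit("hello, slice"));   // prints: hello, slice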
@@ -147,7 +165,6 @@ typedef struct FileOpResult FileOpResult;
 typedef struct Opts__read_file_contents Opts__read_file_contents;
 void         api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_contents* opts);
 FileOpResult file__read_contents   (                      Str8 path, Opts__read_file_contents* opts);
 
 #define file_read_contents(path, ...) file__read_contents(path, & (Opts__read_file_contents){__VA_ARGS__} )
-
 /*
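Aside: the __VA_ARGS__ + compound-literal wrapper above is what gives plain C11 optional, named, defaulted arguments. A toy sketch of the same pattern with hypothetical names (greet is not part of the commit):

#include <stdio.h>

typedef struct Opts__greet { char const* name; int shout; } Opts__greet;

void greet__impl(Opts__greet* opts) {
    char const* name = opts->name ? opts->name : "world"; // unset fields default to zero
    printf(opts->shout ? "HELLO, %s!\n" : "hello, %s\n", name);
}
// Trailing arguments become designated initializers of a stack-allocated options struct.
#define greet(...) greet__impl(& (Opts__greet){ __VA_ARGS__ })

int main(void)
{
    greet(.name = "arena");              // -> hello, arena
    greet(.name = "arena", .shout = 1);  // -> HELLO, arena!
    return 0;
}

A zero-argument call such as greet() additionally relies on an empty initializer list and empty __VA_ARGS__, which are compiler extensions prior to C23.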
@@ -285,8 +302,7 @@ void slice_zero(SliceMem mem) {
 
 // Now for our "Version 1"
 
-#define DEMO__FILE_READ_CONTENTS_V1
-#ifdef  DEMO__FILE_READ_CONTENTS_V1
+#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
 
 struct FileOpResult
 {
@@ -310,6 +326,7 @@ void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_con
     // Backing is required at this point
     slice_assert(opts->backing);
 
+    // This will limit a path for V1 to be 16kb worth of codepoints.
     FMem_16KB   scratch   = {0};
     char const* path_cstr = str8_to_cstr_capped(path, fmem_slice(scratch) );
 
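Aside: str8_to_cstr_capped's body sits outside this hunk. As a rough, hypothetical sketch only (not the file's actual implementation), a capped conversion of this shape copies at most capacity-minus-one bytes into the backing slice and always null-terminates:

#include <string.h>

char const* str8_to_cstr_capped_sketch(Str8 str, SliceMem backing) {
    SSIZE copy_len = str.len < (backing.len - 1) ? str.len : (backing.len - 1);
    memcpy(backing.ptr, str.ptr, cast(size_t, copy_len));
    cast(char*, backing.ptr)[copy_len] = '\0'; // truncate to the cap and terminate
    return cast(char const*, backing.ptr);
}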
@@ -385,8 +402,257 @@ FileOpResult file__read_contents(Str8 path, Opts__read_file_contents* opts) {
 #ifdef DEMO__FILE_READ_CONTENTS_V1
 int main()
 {
+    // This will limit our V1 read to 64kb at most.
     FMem_64KB    read_mem = {0};
     FileOpResult res      = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
     return 0;
 }
+#endif // DEMO__FILE_READ_CONTENTS_V1
+
+/*
+Now that we have file reading done, we need to be able to process the content.
+
+First we want to do lexical analysis. So we'll create a token listing that delimits the aspects of the text file relevant to us.
+For our data structure, we are going for a Whitespace-Aware Text Layout: we'll track text and the formatting around it.
+
+Just like with the read-file-contents operation, we'll define an interface for performing this analysis.
+It will be called watl_lex and take the SliceMem from the file as a Str8 slice and some Opts__watl_lex,
+returning a WATL_LexInfo that gives the user info on how the operation went.
+*/
+
+typedef struct WATL_LexInfo   WATL_LexInfo;
+typedef struct Opts__watl_lex Opts__watl_lex;
+
+void         api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts);
+WATL_LexInfo watl__lex   (                    Str8 source, Opts__watl_lex* opts);
+#define watl_lex(source, ...) watl__lex(source, &(Opts__watl_lex){__VA_ARGS__})
+
+/*
+Token identification will be done using a WATL_TokKind enumeration.
+The token itself will be the id along with a ptr to the start of its slice. We can resolve the width of a token by its delta to the next token.
+If it's the last token, then its width is determined by its offset to the end of the Str8 slice.
+*/
+
+typedef U32 WATL_TokKind;
+enum WATL_TokKind {
+    WATL_Tok_Space          = ' ',
+    WATL_Tok_Tab            = '\t',
+    WATL_Tok_CarriageReturn = '\r',
+    WATL_Tok_LineFeed       = '\n',
+};
+
+typedef struct WATL_Tok WATL_Tok;
+struct WATL_Tok {
+    char const* code;
+};
+
+typedef struct WATL_SliceTok WATL_SliceTok;
+struct WATL_SliceTok {
+    WATL_Tok* ptr;
+    SSIZE     len;
+};
+
+Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) {
+    SSIZE start    = cast(SSIZE, toks->ptr);
+    SSIZE curr     = cast(SSIZE, tok->code);
+    SSIZE offset   = curr - start;
+    SSIZE left     = toks->len - offset;
+    B32   last_tok = (start + toks->len) == (curr + left);
+    Str8  text     = {0};
+    text.ptr = tok->code;
+    text.len = last_tok ?
+        left
+        // Otherwise it's the next token minus the current.
+    :   cast(SSIZE, (tok + 1) - curr);
+    return text;
+}
+
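/*
Editor's sketch, not part of this commit's diff: one way a consumer could walk a WATL_SliceTok,
computing each token's width as the delta to the next token's start; the last token runs to the
end of the lexed source, as described above. Assumes <stdio.h> plus the definitions above.
*/
void watl_print_toks(WATL_SliceTok toks, Str8 source)
{
    char const* source_end = source.ptr + source.len;
    for (SSIZE idx = 0; idx < toks.len; ++idx) {
        WATL_Tok*   tok  = toks.ptr + idx;
        char const* next = (idx + 1 < toks.len) ? (tok + 1)->code : source_end;
        printf("[%.*s]", cast(int, next - tok->code), tok->code);
    }
}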
+/*
+Tokens are allocated to a backing slice of memory defined by the user. This pool of memory will ideally not be constrained to a fixed size on the stack.
+So for V1 we'll allocate 10 megs of heap memory to act as a pool for the tokens. We'll keep track of how much of the pool we used via a new memory-tracking construct:
+the fixed-size arena.
+
+A basic fixed-size arena has only three components, which can vary depending on the convention the user prefers.
+In our case we'll track its capacity, its starting address, and how much has been committed.
+*/
+
+// We use this in conjunction with arenas to save a point that's safe to rewind to by the user.
+typedef struct ArenaSP ArenaSP;
+struct ArenaSP { void* ptr; };
+
+#pragma region FArena
+typedef struct FArena FArena;
+struct FArena {
+    void* start;
+    SSIZE capacity;
+    SSIZE used;
+};
+void     api_farena_init(FArena* arena, SliceMem mem);
+FArena   farena_init    (SliceMem mem);
+SliceMem farena__push   (FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename);
+void     farena_reset   (FArena* arena);
+void     farena_rewind  (FArena* arena, ArenaSP savepoint);
+ArenaSP  farena_save    (FArena arena);
+
+#define farena_push(arena, type)               farena__push(& arena, size_of(type), 1,      lit(stringify(type)))
+#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
+
+inline
+void api_farena_init(FArena* arena, SliceMem mem) {
+    arena->start    = mem.ptr;
+    arena->capacity = mem.len;
+    arena->used     = 0;
+}
+inline FArena farena_init(SliceMem mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
+
+inline
+SliceMem farena__push(FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename) {
+    SSIZE to_commit = type_size * amount;
+    SSIZE unused    = arena->capacity - arena->used;
+    assert(to_commit <= unused);
+
+    SliceMem result = {0};
+    result.ptr   = cast(void*, cast(SSIZE, arena->start) + arena->used);
+    result.len   = to_commit;
+    arena->used += to_commit;
+    return result;
+}
+
+inline
+void farena_rewind(FArena* arena, ArenaSP savepoint) {
+    void* end = cast(void*, cast(SSIZE, arena->start) + arena->used);
+    assert_bounds(savepoint.ptr, arena->start, end);
+    arena->used = cast(SSIZE, savepoint.ptr) - cast(SSIZE, arena->start);
+}
+
+inline void    farena_reset(FArena* arena) { arena->used = 0; }
+inline ArenaSP farena_save (FArena arena)  { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; }
+#pragma endregion FArena
+
#ifdef DEMO__WATL_LEX_V1
|
||||||
|
|
||||||
|
struct WATL_LexInfo {
|
||||||
|
// For now just the tokens
|
||||||
|
WATL_SliceTok tokens;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Opts__watl_lex {
|
||||||
|
/*
|
||||||
|
For this operation we'll enforce that the arena must linearly allocate each token, forming a strictly adjacent sent of elements in an array.
|
||||||
|
This is not necessary and an equivalent process could be done where the tokens instead are semi-contigously organized into linked list with a chained arena, or the tokens are sparely cached.
|
||||||
|
Where their position in their originating string is not preserved. In this case we're keeping it simple. Tokens are in the same block of memory and they don't use a string cache.
|
||||||
|
*/
|
||||||
|
SliceMem pool_toks;
|
||||||
|
};
|
||||||
|
|
||||||
|
// We are assuming everything is utf8-ascii.
|
||||||
|
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
|
||||||
|
{
|
||||||
|
assert(info != nullptr);
|
||||||
|
slice_assert(source);
|
||||||
|
assert(opts != nullptr);
|
||||||
|
|
||||||
|
FArena arena = farena_init(opts->pool_toks);
|
||||||
|
|
||||||
|
char const* end = source.ptr + source.len;
|
||||||
|
char const* cursor = source.ptr;
|
||||||
|
char const* prev = source.ptr;
|
||||||
|
char code = * cursor;
|
||||||
|
|
||||||
|
B32 was_text = false;
|
||||||
|
WATL_Tok* tok = nullptr;
|
||||||
|
for (; cursor < end;)
|
||||||
|
{
|
||||||
|
switch (code)
|
||||||
|
{
|
||||||
|
case WATL_Tok_Space:
|
||||||
|
case WATL_Tok_Tab:
|
||||||
|
{
|
||||||
|
if (* prev != * cursor) {
|
||||||
|
tok = farena_push(arena, WATL_Tok).ptr;
|
||||||
|
tok->code = cursor;
|
||||||
|
was_text = false;
|
||||||
|
}
|
||||||
|
cursor += 1;
|
||||||
|
code = * cursor;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
|
||||||
|
case WATL_Tok_CarriageReturn: {
|
||||||
|
// Assumes next is line feed.
|
||||||
|
cursor += 1;
|
||||||
|
}
|
||||||
|
case WATL_Tok_LineFeed:
|
||||||
|
{
|
||||||
|
cursor += 1;
|
||||||
|
code = * cursor;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (! was_text) {
|
||||||
|
tok = farena_push(arena, WATL_Tok).ptr;
|
||||||
|
tok->code = cursor;
|
||||||
|
was_text = true;
|
||||||
|
}
|
||||||
|
prev = cursor;
|
||||||
|
cursor += 1;
|
||||||
|
code = * cursor;
|
||||||
|
}
|
||||||
|
info->tokens.ptr = arena.start;
|
||||||
|
info->tokens.len = arena.used / size_of(WATL_Tok*);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif DEMO__WATL_LEX_V1
|
||||||
|
|
||||||
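/*
Editor's sketch, not part of this commit's diff: the lexer above run over an in-memory literal with
a small stack pool, instead of the heap slice the V1 demo allocates further down.
*/
#ifdef DEMO__WATL_LEX_V1
void watl_lex_example(void)
{
    unsigned char pool[4096];
    Str8 source = lit("line one\n\tline two");
    WATL_LexInfo info = watl_lex(source,
        .pool_toks = (SliceMem){ .ptr = pool, .len = size_of(pool) });
    // info.tokens.ptr is the first WATL_Tok in the pool; info.tokens.len is how many were lexed.
    (void) info;
}
#endif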
+inline
+WATL_LexInfo watl__lex(Str8 source, Opts__watl_lex* opts) {
+    WATL_LexInfo result = {0};
+    api_watl_lex(& result, source, opts);
+    return result;
+}
+
+/*
+To allocate onto the heap we'll make a basic slicemem_alloc, with a corresponding slicemem_free as well.
+However, we don't need the free for the V1 example: the OS will clean up the pages used by the process during its termination.
+*/
+
+SliceMem slicemem_alloc(SSIZE amount)
+{
+    assert(amount > KILOBYTES(4));
+    void* result = malloc(amount);
+    assert(result != nullptr);
+    SliceMem mem = {
+        .ptr = result,
+        .len = amount
+    };
+    return mem;
+}
+void slicemem_free(SliceMem mem) {
+    free(mem.ptr);
+}
+
+#ifdef DEMO__WATL_LEX_V1
+int main()
+{
+    // This will limit our V1 read to 64kb at most.
+    FMem_64KB    read_mem = {0};
+    FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
+
+    // This will limit our V1 lex to only 8 megs worth of token tracking on a file.
+    SliceMem     mem_toks = slicemem_alloc(MEGABYTES(8));
+    WATL_LexInfo lex_res  = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
+    // Unnecessary in this case, but if you want to free explicitly:
+    slicemem_free(mem_toks);
+    return 0;
+}
 #endif
+
+/*
+Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
+* The tree will be top-level organized by lines consisting of linked slices of visible and non-visible tokens.
+
+We'll preserve whether or not a non-visible chunk is a tab or a series of spaces.
+*/