V1 lex done
This commit is contained in:
parent
76fbeff084
commit
490cb76d41
284
demo.str_cache.c
284
demo.str_cache.c
@ -1,8 +1,6 @@
|
||||
/*
|
||||
An introduction to C11 with a str cache demo.
|
||||
Attempting to showcase better conventions and constructs in C; discovered by me as of 2025 while scouring the internet.
|
||||
|
||||
"C is old and flawed, but your use of it is most likely more flawed. You must have calluses to write with barbed syntax & semantics."
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -25,7 +23,7 @@ int main()
|
||||
StrCache cache = strcache_init(varena_ainfo(cache));
|
||||
|
||||
VArena file_arena; varena_init(file_arena);
|
||||
Str path_text = lit("../demo.strcache.c");
|
||||
Str8 path_text = lit("../demo.strcache.c");
|
||||
FileContent text_file = file_read_contents(varena_ainfo(file_arena), path_text);
|
||||
|
||||
Arena ast_arena; arena_init(ast_arena);
|
||||
@ -39,6 +37,11 @@ int main()
|
||||
}
|
||||
#endif
|
||||
|
||||
// Demo selection
|
||||
// #define DEMO__STR_SLICE
|
||||
// #define DEMO__FILE_READ_CONTENTS_V1
|
||||
#define DEMO__WATL_LEX_V1
|
||||
|
||||
/*
|
||||
The above makes use of the following core concepts to achieve its net result:
|
||||
* Slices
|
||||
@ -85,7 +88,6 @@ typedef signed __int64 S64;
|
||||
typedef size_t USIZE;
|
||||
typedef ptrdiff_t SSIZE;
|
||||
|
||||
|
||||
enum {
|
||||
false,
|
||||
true,
|
||||
@ -96,14 +98,31 @@ typedef S8 B8;
|
||||
typedef S16 B16;
|
||||
typedef S32 B32;
|
||||
|
||||
// Common macros we'll use throughout this.

// Asserts that `point` lies within [start, end] (inclusive on both ends).
// All three are converted to SSIZE so any pointer/integer mix compares uniformly.
// Wrapped in do/while(0) so it behaves as a single statement at the callsite.
#define assert_bounds(point, start, end) do { \
	SSIZE pos_point = cast(SSIZE, point); \
	SSIZE pos_start = cast(SSIZE, start); \
	SSIZE pos_end = cast(SSIZE, end); \
	assert(pos_start <= pos_point); \
	assert(pos_point <= pos_end); \
} while(0)
|
||||
|
||||
// Functional style cast
|
||||
#define cast(type, data) ((type)(data))
|
||||
#define cast(type, data) ((type)(data))
|
||||
#define pcast(type, data) * cast(type*, & data)
|
||||
|
||||
#define nullptr cast(void*, 0)
|
||||
|
||||
#define glue_(A, B) A ## B
|
||||
#define glue(A, B) glue_(A,B)
|
||||
|
||||
// Enforces size querying uses SSIZE type.
|
||||
#define size_of(data) cast(SSIZE, sizeof(data))
|
||||
|
||||
#define stringify_(S) #S
|
||||
#define stringify(S) stringify_(S)
|
||||
|
||||
/*
|
||||
The first construct we'll utilize is a String Slice.
|
||||
In modern programming with the memory sizes utilized, it is more ergonomic to track the length of strings with their pointer.
|
||||
@ -120,7 +139,6 @@ struct Str8 {
|
||||
#define lit(string_literal) (Str8){ string_literal, size_of(string_literal) - 1 }
|
||||
|
||||
// For now this string can visualized using a debugger.
|
||||
// #define DEMO__STR_SLICE
|
||||
#ifdef DEMO__STR_SLICE
|
||||
int main()
|
||||
{
|
||||
@ -147,7 +165,6 @@ typedef struct FileOpResult FileOpResult;
|
||||
typedef struct Opts__read_file_contents Opts__read_file_contents;
|
||||
void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_contents* opts);
|
||||
FileOpResult file__read_contents ( Str8 path, Opts__read_file_contents* opts);
|
||||
|
||||
#define file_read_contents(path, ...) file__read_contents(path, & (Opts__read_file_contents){__VA_ARGS__} )
|
||||
|
||||
/*
|
||||
@ -285,8 +302,7 @@ void slice_zero(SliceMem mem) {
|
||||
|
||||
// Now for our "Version 1"
|
||||
|
||||
#define DEMO__FILE_READ_CONTENTS_V1
|
||||
#ifdef DEMO__FILE_READ_CONTENTS_V1
|
||||
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
|
||||
|
||||
struct FileOpResult
|
||||
{
|
||||
@ -310,6 +326,7 @@ void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_con
|
||||
// Backing is required at this point
|
||||
slice_assert(opts->backing);
|
||||
|
||||
// This will limit a path for V1 to be 16kb worth of codepoints.
|
||||
FMem_16KB scratch = {0};
|
||||
char const* path_cstr = str8_to_cstr_capped(path, fmem_slice(scratch) );
|
||||
|
||||
@ -385,8 +402,257 @@ FileOpResult file__read_contents(Str8 path, Opts__read_file_contents* opts) {
|
||||
#ifdef DEMO__FILE_READ_CONTENTS_V1
// V1 demo entry point: read this source file into a fixed stack buffer.
int main()
{
	// This will limit for our V1 read to 64kb at most.
	FMem_64KB read_mem = {0};
	FileOpResult res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
	return 0;
}
// Fix: C11 (6.10) forbids extra tokens after #endif — the macro name must be
// inside a comment.
#endif // DEMO__FILE_READ_CONTENTS_V1
|
||||
|
||||
/*
|
||||
Now that we have file reading done we need to be able to process the content.
|
||||
|
||||
First we want to do lexical analysis. So we'll create a token listing delimiting aspects of the text file relevant to us.
|
||||
For our data structure, we are going for a Whitespace-Aware Text Layout; where we'll track text and the formatting around them.
|
||||
|
||||
Just like with the read file contents operation, we'll define an interface to performing this analysis.
|
||||
It will be called watl_lex and take the SliceMem from the file as a Str8 slice and some Opts__watl_lex;
|
||||
returning a WATL_LexInfo for providing user info on how the operation went.
|
||||
*/
|
||||
|
||||
// Lexing interface: takes the file's contents as a Str8 slice and reports
// the result of the analysis through a WATL_LexInfo.
typedef struct WATL_LexInfo WATL_LexInfo;
typedef struct Opts__watl_lex Opts__watl_lex;

// Out-parameter variant; `info` and `opts` must be non-null (asserted).
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts);
// Value-returning wrapper over api_watl_lex.
WATL_LexInfo watl__lex ( Str8 source, Opts__watl_lex* opts);
// Callsite sugar: options given as designated initializers,
// e.g. watl_lex(src, .pool_toks = mem).
#define watl_lex(source, ...) watl__lex(source, &(Opts__watl_lex){__VA_ARGS__})
|
||||
|
||||
/*
|
||||
Token identification will be done using a WATL_TokKind enumeration.
|
||||
The token type itself will be the id along with a ptr to its start of the slice. We can resolve the width of the token by its delta to the next token.
|
||||
If its the last token, then its delta is determined by its offset to the end of the Str8 slice.
|
||||
*/
|
||||
|
||||
// Token ids are the delimiting codepoint itself; anything not listed below is
// treated as visible text by the lexer.
typedef U32 WATL_TokKind;
enum WATL_TokKind {
	WATL_Tok_Space = ' ',
	WATL_Tok_Tab = '\t',
	WATL_Tok_CarriageReturn = '\r',
	WATL_Tok_LineFeed = '\n',
};

// A token only records where it starts in the source; its width is resolved
// as the delta to the next token (or to the end of the source for the last).
typedef struct WATL_Tok WATL_Tok;
struct WATL_Tok {
	char const* code; // Start of this token's text within the source slice.
};

// Slice (pointer + length) over a linearly-allocated array of tokens.
typedef struct WATL_SliceTok WATL_SliceTok;
struct WATL_SliceTok {
	WATL_Tok* ptr;
	SSIZE len; // Number of tokens.
};
|
||||
|
||||
// Resolves the string slice a token covers: intended as "delta to the next
// token's code pointer, or remainder of the source for the last token".
// NOTE(review): this function mixes two unrelated address spaces — `toks->ptr`
// addresses the token ARRAY while `tok->code` addresses the SOURCE text — and
// `toks->len` is a token count, not a byte count. The arithmetic below looks
// broken; confirm against the design described above watl_lex.
Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) {
	SSIZE start = cast(SSIZE, toks->ptr);
	SSIZE curr = cast(SSIZE, tok->code);
	SSIZE offset = curr - start;
	SSIZE left = toks->len - offset;
	// NOTE(review): algebraically (curr + left) == (start + toks->len) always
	// holds given the definitions above, so `last_tok` is a tautology — the
	// ternary's false branch is unreachable. Verify intent.
	B32 last_tok = (start + toks->len) == (curr + left);
	Str8 text = {0};
	text.ptr = tok->code;
	text.len = last_tok ?
		left
	// Otherwise it's the next token's start minus the current.
	// NOTE(review): `(tok + 1) - curr` is WATL_Tok* minus integer (pointer
	// arithmetic), not the code-pointer delta `(tok + 1)->code - tok->code`
	// the comment above describes — confirm which was meant.
		: cast(SSIZE, (tok + 1) - curr);
	return text;
}
|
||||
|
||||
/*
|
||||
Tokens are allocated to a backing slice of memory defined by the user. This pool of memory will ideally not be constrained to a fixed size on the stack.
|
||||
So for V1 we'll allocate 8 megs of heap memory to act as a pool for the tokens. We'll keep track of how much of the pool we used via a new memory tracking construct:
|
||||
The fixed-sized arena.
|
||||
|
||||
A basic fixed size arena only has three components which can vary depending on the convention the user prefers.
|
||||
In our case we'll track its capacity, its starting address, and how much has been committed.
|
||||
*/
|
||||
|
||||
// We use this in conjunction with Arenas to save a point that's safe to rewind to by the user.
typedef struct ArenaSP ArenaSP;
// Raw address inside an arena's backing buffer, captured at save time.
struct ArenaSP { void* ptr; };
|
||||
|
||||
#pragma region FArena
// Fixed-size arena: a linear (bump) allocator over one user-provided block.
typedef struct FArena FArena;
struct FArena {
	void* start;    // Base address of the backing memory.
	SSIZE capacity; // Total bytes available.
	SSIZE used;     // Bytes handed out so far (the allocation watermark).
};
void api_farena_init(FArena* arena, SliceMem mem);
FArena farena_init (SliceMem mem);
// Bumps the watermark by type_size * amount; dbg_typename is debug metadata only.
SliceMem farena__push (FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename);
void farena_reset (FArena* arena);
void farena_rewind (FArena* arena, ArenaSP savepoint);
ArenaSP farena_save (FArena arena);

// Callsite sugar: stringifies the pushed type for the debug name.
#define farena_push(arena, type) farena__push(& arena, size_of(type), 1, lit(stringify(type)))
#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
|
||||
|
||||
inline
|
||||
void api_farena_init(FArena* arena, SliceMem mem) {
|
||||
arena->start = mem.ptr;
|
||||
arena->capacity = mem.len;
|
||||
arena->used = 0;
|
||||
}
|
||||
inline FArena farena_init(SliceMem mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
|
||||
|
||||
inline
|
||||
SliceMem farena__push(FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename) {
|
||||
SSIZE to_commit = type_size * amount;
|
||||
SSIZE unused = arena->capacity - arena->used;
|
||||
assert(to_commit <= unused);
|
||||
|
||||
SliceMem result = {0};
|
||||
result.ptr = cast(void*, cast(SSIZE, arena->start) + arena->used);
|
||||
result.len = to_commit;
|
||||
arena->used += to_commit;
|
||||
return result;
|
||||
}
|
||||
|
||||
inline
|
||||
void farena_rewind(FArena* arena, ArenaSP savepoint) {
|
||||
void* end = cast(void*, cast(SSIZE, arena->start) + arena->used);
|
||||
assert_bounds(savepoint.ptr, arena->start, end);
|
||||
arena->used -= cast(SSIZE, savepoint.ptr) - cast(SSIZE, arena->start);
|
||||
}
|
||||
|
||||
inline void farena_reset(FArena* arena) { arena->used = 0; }
|
||||
inline ArenaSP farena_save (FArena arena) { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; }
|
||||
#pragma endregion FArena
|
||||
|
||||
#ifdef DEMO__WATL_LEX_V1

// Lex result: for V1, only the flat array of tokens.
struct WATL_LexInfo {
	// For now just the tokens
	WATL_SliceTok tokens;
};

struct Opts__watl_lex {
	/*
	For this operation we'll enforce that the arena must linearly allocate each token, forming a strictly adjacent set of elements in an array.
	This is not necessary and an equivalent process could be done where the tokens instead are semi-contiguously organized into linked list with a chained arena, or the tokens are sparsely cached.
	Where their position in their originating string is not preserved. In this case we're keeping it simple. Tokens are in the same block of memory and they don't use a string cache.
	*/
	// Backing memory the lexer's token arena allocates from.
	SliceMem pool_toks;
};
|
||||
|
||||
// We are assuming everything is utf8-ascii.
|
||||
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
|
||||
{
|
||||
assert(info != nullptr);
|
||||
slice_assert(source);
|
||||
assert(opts != nullptr);
|
||||
|
||||
FArena arena = farena_init(opts->pool_toks);
|
||||
|
||||
char const* end = source.ptr + source.len;
|
||||
char const* cursor = source.ptr;
|
||||
char const* prev = source.ptr;
|
||||
char code = * cursor;
|
||||
|
||||
B32 was_text = false;
|
||||
WATL_Tok* tok = nullptr;
|
||||
for (; cursor < end;)
|
||||
{
|
||||
switch (code)
|
||||
{
|
||||
case WATL_Tok_Space:
|
||||
case WATL_Tok_Tab:
|
||||
{
|
||||
if (* prev != * cursor) {
|
||||
tok = farena_push(arena, WATL_Tok).ptr;
|
||||
tok->code = cursor;
|
||||
was_text = false;
|
||||
}
|
||||
cursor += 1;
|
||||
code = * cursor;
|
||||
}
|
||||
continue;
|
||||
|
||||
case WATL_Tok_CarriageReturn: {
|
||||
// Assumes next is line feed.
|
||||
cursor += 1;
|
||||
}
|
||||
case WATL_Tok_LineFeed:
|
||||
{
|
||||
cursor += 1;
|
||||
code = * cursor;
|
||||
}
|
||||
continue;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (! was_text) {
|
||||
tok = farena_push(arena, WATL_Tok).ptr;
|
||||
tok->code = cursor;
|
||||
was_text = true;
|
||||
}
|
||||
prev = cursor;
|
||||
cursor += 1;
|
||||
code = * cursor;
|
||||
}
|
||||
info->tokens.ptr = arena.start;
|
||||
info->tokens.len = arena.used / size_of(WATL_Tok*);
|
||||
}
|
||||
|
||||
#endif DEMO__WATL_LEX_V1
|
||||
|
||||
inline
|
||||
WATL_LexInfo watl__lex(Str8 source, Opts__watl_lex* opts) {
|
||||
WATL_LexInfo result = {0};
|
||||
api_watl_lex(& result, source, opts);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
To allocate onto the heap we'll make a basic slicemem_alloc to allocate, and we'll make a corresponding slicemem_free as well.
|
||||
However we don't need to use it for the V1 example. The OS will cleanup the pages used by the process during its termination.
|
||||
*/
|
||||
|
||||
// Heap-allocates a SliceMem of `amount` bytes via malloc.
// Ownership: caller releases with slicemem_free (or lets the OS reclaim at exit).
SliceMem slicemem_alloc(SSIZE amount)
{
	// NOTE(review): `KILOBTYES` looks like a typo of `KILOBYTES` — confirm how
	// the macro is actually spelled where it's defined.
	// NOTE(review): this asserts a MINIMUM size (heap only for > 4kb blocks);
	// verify that direction is intended rather than a cap.
	assert(amount > KILOBTYES(4));
	void* result = malloc(amount);
	// Allocation failure is fatal for this demo.
	assert(result != nullptr);
	SliceMem mem = {
		.ptr = result,
		.len = amount
	};
	return mem;
}
|
||||
// Releases memory previously obtained from slicemem_alloc.
// (free(NULL) is a no-op, so a zeroed SliceMem is safe to pass.)
void slicemem_free(SliceMem mem) {
	free(mem.ptr);
}
|
||||
|
||||
#ifdef DEMO__WATL_LEX_V1
// V1 demo entry point: read this source file, then lex it into WATL tokens.
int main()
{
	// This will limit for our V1 read to 64kb at most.
	FMem_64KB file_mem = {0};
	FileOpResult file_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(file_mem) );

	// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
	SliceMem tok_pool = slicemem_alloc(MEGABYTES(8));
	WATL_LexInfo lexed = watl_lex(pcast(Str8, file_res.content), .pool_toks = tok_pool);
	// The OS reclaims this at process exit anyway; freeing to be explicit.
	slicemem_free(tok_pool);
	return 0;
}
#endif
|
||||
|
||||
/*
|
||||
Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
|
||||
* The tree will be top-level organized by lines consisting of linked slices of visible and non-visible tokens.
|
||||
|
||||
We'll preserve whether or not a non-visible chunk is a tab or series of spaces.
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user