progress on making the basic parser.
This commit is contained in:
parent
490cb76d41
commit
567ec7d6fd
301
demo.str_cache.c
301
demo.str_cache.c
@ -40,7 +40,8 @@ int main()
|
|||||||
// Demo selection
|
// Demo selection
|
||||||
// #define DEMO__STR_SLICE
|
// #define DEMO__STR_SLICE
|
||||||
// #define DEMO__FILE_READ_CONTENTS_V1
|
// #define DEMO__FILE_READ_CONTENTS_V1
|
||||||
#define DEMO__WATL_LEX_V1
|
// #define DEMO__WATL_LEX_V1
|
||||||
|
#define DEMO__WATL_PARSE_V1
|
||||||
|
|
||||||
/*
|
/*
|
||||||
The above makes use of the following core concepts to achieve its net result:
|
The above makes use of the following core concepts to achieve its net result:
|
||||||
@ -302,7 +303,7 @@ void slice_zero(SliceMem mem) {
|
|||||||
|
|
||||||
// Now for our "Version 1"
|
// Now for our "Version 1"
|
||||||
|
|
||||||
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
|
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1)
|
||||||
|
|
||||||
struct FileOpResult
|
struct FileOpResult
|
||||||
{
|
{
|
||||||
@ -452,12 +453,12 @@ struct WATL_SliceTok {
|
|||||||
SSIZE len;
|
SSIZE len;
|
||||||
};
|
};
|
||||||
|
|
||||||
Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok) {
|
Str8 watl_tok_str8(WATL_SliceTok toks, WATL_Tok* tok) {
|
||||||
SSIZE start = cast(SSIZE, toks->ptr);
|
SSIZE start = cast(SSIZE, toks.ptr);
|
||||||
SSIZE curr = cast(SSIZE, tok->code);
|
SSIZE curr = cast(SSIZE, tok->code);
|
||||||
SSIZE offset = curr - start;
|
SSIZE offset = curr - start;
|
||||||
SSIZE left = toks->len - offset;
|
SSIZE left = toks.len - offset;
|
||||||
B32 last_tok = (start + toks->len) == (curr + left);
|
B32 last_tok = (start + toks.len) == (curr + left);
|
||||||
Str8 text = {0};
|
Str8 text = {0};
|
||||||
text.ptr = tok->code;
|
text.ptr = tok->code;
|
||||||
text.len = last_tok ?
|
text.len = last_tok ?
|
||||||
@ -494,8 +495,8 @@ void farena_reset (FArena* arena);
|
|||||||
void farena_rewind (FArena* arena, ArenaSP savepoint);
|
void farena_rewind (FArena* arena, ArenaSP savepoint);
|
||||||
ArenaSP farena_save (FArena arena);
|
ArenaSP farena_save (FArena arena);
|
||||||
|
|
||||||
#define farena_push(arena, type) farena__push(& arena, size_of(type), 1, lit(stringify(type)))
|
#define farena_push(arena, type) cast(type*, farena__push(& arena, size_of(type), 1, lit(stringify(type))).ptr)
|
||||||
#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
|
#define farena_push_array(arena, type, amount) pcast(Slice ## type, farena__push(& arena, size_of(type), amount, lit(stringify(type))) )
|
||||||
|
|
||||||
inline
|
inline
|
||||||
void api_farena_init(FArena* arena, SliceMem mem) {
|
void api_farena_init(FArena* arena, SliceMem mem) {
|
||||||
@ -529,7 +530,7 @@ inline void farena_reset(FArena* arena) { arena->used = 0; }
|
|||||||
inline ArenaSP farena_save (FArena arena) { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; }
|
inline ArenaSP farena_save (FArena arena) { ArenaSP savepoint; savepoint.ptr = arena.start; return savepoint; }
|
||||||
#pragma endregion FArena
|
#pragma endregion FArena
|
||||||
|
|
||||||
#ifdef DEMO__WATL_LEX_V1
|
#if defined(DEMO__WATL_LEX_V1) || defined(DEMO__WATL_PARSE_V1)
|
||||||
|
|
||||||
struct WATL_LexInfo {
|
struct WATL_LexInfo {
|
||||||
// For now just the tokens
|
// For now just the tokens
|
||||||
@ -569,7 +570,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
|
|||||||
case WATL_Tok_Tab:
|
case WATL_Tok_Tab:
|
||||||
{
|
{
|
||||||
if (* prev != * cursor) {
|
if (* prev != * cursor) {
|
||||||
tok = farena_push(arena, WATL_Tok).ptr;
|
tok = farena_push(arena, WATL_Tok);
|
||||||
tok->code = cursor;
|
tok->code = cursor;
|
||||||
was_text = false;
|
was_text = false;
|
||||||
}
|
}
|
||||||
@ -582,8 +583,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
|
|||||||
// Assumes next is line feed.
|
// Assumes next is line feed.
|
||||||
cursor += 1;
|
cursor += 1;
|
||||||
}
|
}
|
||||||
case WATL_Tok_LineFeed:
|
case WATL_Tok_LineFeed: {
|
||||||
{
|
|
||||||
cursor += 1;
|
cursor += 1;
|
||||||
code = * cursor;
|
code = * cursor;
|
||||||
}
|
}
|
||||||
@ -593,7 +593,7 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (! was_text) {
|
if (! was_text) {
|
||||||
tok = farena_push(arena, WATL_Tok).ptr;
|
tok = farena_push(arena, WATL_Tok);
|
||||||
tok->code = cursor;
|
tok->code = cursor;
|
||||||
was_text = true;
|
was_text = true;
|
||||||
}
|
}
|
||||||
@ -653,6 +653,277 @@ int main()
|
|||||||
/*
|
/*
|
||||||
Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
|
Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
|
||||||
* The tree will be top-level organized by lines consisting of linked slices of visible and non-visible tokens.
|
* The tree will be top-level organized by lines consisting of linked slices of visible and non-visible tokens.
|
||||||
|
|
||||||
We'll preserve whether or not a non-visible chunk is a tab or series of spaces.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
typedef struct WATL_ParseInfo WATL_ParseInfo;
|
||||||
|
typedef struct Opts__watl_parse Opts__watl_parse;
|
||||||
|
void api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts);
|
||||||
|
WATL_ParseInfo watl__parse ( WATL_SliceTok tokens, Opts__watl_parse* opts);
|
||||||
|
#define watl_parse(tokens, ...) watl__parse(tokens, & (Opts__watl_parse) {__VA_ARGS__})
|
||||||
|
|
||||||
|
typedef struct WATL_Node WATL_Node;
|
||||||
|
struct WATL_Node {
|
||||||
|
WATL_Node* next;
|
||||||
|
Str8 entry;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct WATL_Line WATL_Line;
|
||||||
|
struct WATL_Line {
|
||||||
|
WATL_Line* next;
|
||||||
|
WATL_Node* ptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct WATL_SliceLine WATL_SliceLine;
|
||||||
|
struct WATL_SliceLine {
|
||||||
|
WATL_Line* ptr;
|
||||||
|
SSIZE len;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
For the sake of the exercise, we'll be eliminating the association with the file's strings and we'll need to instead cache them.
|
||||||
|
*/
|
||||||
|
#pragma region Str8Cache
|
||||||
|
typedef struct Str8Cache Str8Cache;
|
||||||
|
void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
|
||||||
|
Str8Cache str8cache_init ( SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
|
||||||
|
|
||||||
|
// A cache like this relies on tabling string entries utilizing an index derived from a hashed ID.
|
||||||
|
// For these strings we'll be using a hash called djb8:
|
||||||
|
|
||||||
|
// Introducing a slice iterator:
|
||||||
|
// Expands to the three clauses of a for-loop header that walks `container` (any value with `ptr` and `len` members) from first to last element.
// The cursor is always named `iter`. `container` is expanded several times, so pass an expression without side effects.
// Every use of the argument is parenthesized so that expressions such as `* slice_ptr` expand correctly.
#define slice_iter(container) typeof((container).ptr) iter = (container).ptr; iter != ((container).ptr + (container).len); ++ iter
|
||||||
|
|
||||||
|
inline
|
||||||
|
void hash64_djb8(U64* hash, SliceByte bytes) {
|
||||||
|
for (slice_iter(bytes)) {
|
||||||
|
*hash = (((*hash) << 8) + (*hash)) + (*iter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For a library or codebase it's recommended to set up a metaprogram to generate hash-utilizing containers
|
||||||
|
// Or other containers that cannot be sufficiently lifted to general runtime paths without losing ergonomic debug type info or type-constraint enforcements.
|
||||||
|
// Unlike with the template markup C++ uses, you can strike a balance between how many definitions are redundantly made or optimized for collapsing to a general path
|
||||||
|
// based on target optimization and debugability.
|
||||||
|
|
||||||
|
// For this V1 example, we'll be hand-rolling a fixed sized table with excess slot chaining for colliding slots.
|
||||||
|
// It's a relatively simple implementation to hand-roll. These things tend to become unwieldy with more advanced variants.
|
||||||
|
|
||||||
|
typedef struct Str8Cache_Slot Str8Cache_Slot;
|
||||||
|
struct Str8Cache_Slot {
|
||||||
|
Str8Cache_Slot* prev;
|
||||||
|
Str8Cache_Slot* next;
|
||||||
|
Str8 value;
|
||||||
|
U64 key;
|
||||||
|
B32 occupied;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct Str8Cache_SliceSlot Str8Cache_SliceSlot;
|
||||||
|
struct Str8Cache_SliceSlot {
|
||||||
|
Str8Cache_Slot* ptr;
|
||||||
|
SSIZE len;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Str8Cache {
|
||||||
|
FArena a_str;
|
||||||
|
Str8Cache_SliceSlot pool;
|
||||||
|
Str8Cache_Slot* vacant;
|
||||||
|
Str8Cache_SliceSlot table;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Value-returning convenience wrapper around api_str8cache_init.
// The cache is zero-initialized first so that no member is handed back with an indeterminate value.
Str8Cache str8cache_init(SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) { Str8Cache cache = {0}; api_str8cache_init(& cache, mem_strs, mem_slots, mem_table); return cache; }
|
||||||
|
inline
|
||||||
|
void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) {
|
||||||
|
assert(cache != nullptr);
|
||||||
|
slice_assert(mem_strs);
|
||||||
|
slice_assert(mem_slots);
|
||||||
|
slice_assert(mem_table);
|
||||||
|
cache->a_str = farena_init(mem_strs);
|
||||||
|
cache->pool = pcast(Str8Cache_SliceSlot, mem_slots);
|
||||||
|
cache->table = pcast(Str8Cache_SliceSlot, mem_table);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Marks every slot reachable from the table as unoccupied: each surface slot and every slot chained behind it.
// Only the `occupied` flags are reset; the chains, the stored values and the string arena are left as they are.
void str8cache_clear(Str8Cache* cache)
{
	assert(cache != nullptr);
	for (slice_iter(cache->table))
	{
		// Start at the surface slot itself so the whole chain, not just its head, gets cleared.
		for (Str8Cache_Slot* probe_slot = iter; probe_slot != nullptr; probe_slot = probe_slot->next) {
			probe_slot->occupied = false;
		}
	}
}
|
||||||
|
|
||||||
|
// We don't introduce a remove option because we're not tracking fixed sized entities.
|
||||||
|
// Strings take up non-deterministic sizes of their backing arena. So the only thing that can be done with the cache is wiping it and recaching all strings.
|
||||||
|
|
||||||
|
/*
|
||||||
|
When storing a hash of a slot, we can almost never utilize the full width of a key,
|
||||||
|
so we must truncate the key via modulo to get a "good enough" unique ID to place in the table.
|
||||||
|
*/
|
||||||
|
inline
|
||||||
|
U64 str8cache_slot_id(Str8Cache cache, U64 key) {
|
||||||
|
U64 hash_index = key % cast(U64, cache.table.len);
|
||||||
|
return hash_index;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Looks up `key` and returns a pointer to the stored string, or nullptr when no occupied slot carries that key.
// The search starts at the table slot the key resolves to and then follows that slot's `next` chain.
Str8* str8cache_get(Str8Cache cache, U64 key)
{
	U64             slot_index = str8cache_slot_id(cache, key);
	Str8Cache_Slot* candidate  = cache.table.ptr + slot_index;
	while (candidate != nullptr)
	{
		B32 is_match = candidate->occupied && candidate->key == key;
		if (is_match) {
			return & candidate->value;
		}
		candidate = candidate->next;
	}
	return nullptr;
}
|
||||||
|
|
||||||
|
Str8* str8cache_set(Str8Cache* cache, U64 key, Str8 value)
|
||||||
|
{
|
||||||
|
U64 hash_index = str8cache_slot_id(*cache, key);
|
||||||
|
Str8Cache_Slot* surface_slot = & cache->table.ptr[hash_index];
|
||||||
|
if (! surface_slot->occupied || surface_slot->key == key)
|
||||||
|
{
|
||||||
|
if (value.ptr != surface_slot->value.ptr) {
|
||||||
|
SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8"));
|
||||||
|
slice_copy(mem, pcast(SliceMem, value));
|
||||||
|
surface_slot->value = pcast(Str8, mem);
|
||||||
|
}
|
||||||
|
surface_slot->key = key;
|
||||||
|
surface_slot->occupied = true;
|
||||||
|
return & surface_slot->value;
|
||||||
|
}
|
||||||
|
Str8Cache_Slot* slot = surface_slot;
|
||||||
|
for (;; slot = slot->next)
|
||||||
|
{
|
||||||
|
if (slot->next == nullptr)
|
||||||
|
{
|
||||||
|
// We had a collision, we need to grab a vacant slot from the pool and utilize it instead.
|
||||||
|
slot->next = cache->vacant;
|
||||||
|
* slot->next = (Str8Cache_Slot){0};
|
||||||
|
slot->next->prev = slot;
|
||||||
|
|
||||||
|
Str8Cache_Slot* next_vacant = cache->vacant + 1;
|
||||||
|
assert(next_vacant < cache->pool.ptr + cache->pool.len );
|
||||||
|
// If the above fails we ran out of extra slots.
|
||||||
|
cache->vacant = cache->vacant + 1;
|
||||||
|
}
|
||||||
|
if ( ! slot->next->occupied || slot->next->key == key)
|
||||||
|
{
|
||||||
|
if (value.ptr != slot->next->value.ptr) {
|
||||||
|
SliceMem mem = farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8"));
|
||||||
|
slice_copy(mem, pcast(SliceMem, value));
|
||||||
|
slot->next->value = pcast(Str8, mem);
|
||||||
|
}
|
||||||
|
slot->next->value = value;
|
||||||
|
slot->next->key = key;
|
||||||
|
slot->next->occupied = true;
|
||||||
|
return & slot->next->value;
|
||||||
|
}
|
||||||
|
// We keep traversing till we find a match or we find a vacancy for this list in the table.
|
||||||
|
// Make sure to tune the size of the table so it does this less!
|
||||||
|
// Note: Tables sized by prime values collide less aswell.
|
||||||
|
// You can use a closest prime number lookup table to derive what length to expose to the cache's table for hash ID resolution.
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
#pragma endregion Str8Cache
|
||||||
|
|
||||||
|
// Finally our abstracted cache interface:
|
||||||
|
// Returns the cache's copy of `str`, storing it first when it is not cached yet.
// The key is the djb8 hash of the string's bytes. An existing entry is reused only when its bytes match `str` exactly,
// so repeated strings no longer consume fresh arena space on every call and a key collision cannot hand back the wrong text.
Str8 cache_str8(Str8Cache* cache, Str8 str)
{
	assert(cache != nullptr);
	U64 key = 0; hash64_djb8(& key, pcast(SliceByte, str));

	Str8* cached = str8cache_get(* cache, key);
	if (cached != nullptr && cached->len == str.len)
	{
		B32 same = true;
		for (SSIZE idx = 0; idx < str.len; ++ idx) {
			if (cached->ptr[idx] != str.ptr[idx]) {
				same = false;
				break;
			}
		}
		if (same) {
			return * cached;
		}
	}

	Str8* result = str8cache_set(cache, key, str);
	assert(result != nullptr);
	return * result;
}
|
||||||
|
|
||||||
|
#ifdef DEMO__WATL_PARSE_V1
|
||||||
|
|
||||||
|
struct Opts__watl_parse {
|
||||||
|
SliceMem backing_nodes;
|
||||||
|
SliceMem backing_lines;
|
||||||
|
Str8Cache* str_cache;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct WATL_ParseInfo {
|
||||||
|
WATL_SliceLine lines;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parses `tokens` into lines of linked nodes and reports them through `info->lines`.
//   * Every carriage-return or line-feed token ends the current line and starts a new one.
//   * Every other token (spaces, tabs, visible text) becomes one node appended to the current line;
//     its text is routed through opts->str_cache, so nodes do not refer to the token source.
// Lines are allocated from opts->backing_lines and nodes from opts->backing_nodes.
// Lines are also chained through `next`, and each line's nodes through `ptr` / `next`.
// NOTE(review): info->lines as a ptr/len slice presumes consecutive pushes on the line arena are contiguous — confirm against farena__push.
void api_watl_parse(WATL_ParseInfo* info, WATL_SliceTok tokens, Opts__watl_parse* opts)
{
	assert(info != nullptr);
	slice_assert(tokens);
	assert(opts != nullptr);
	assert(opts->str_cache != nullptr);

	FArena a_lines = farena_init(opts->backing_lines);
	FArena a_nodes = farena_init(opts->backing_nodes);

	WATL_Line* line = farena_push(a_lines, WATL_Line);
	* line          = (WATL_Line){0};
	info->lines.ptr = line;
	info->lines.len = 1;

	WATL_Tok*  end  = tokens.ptr + tokens.len;
	WATL_Node* tail = nullptr; // Last node of the current line, nullptr while the line is still empty.
	for (slice_iter(tokens))
	{
		switch (* iter->code)
		{
			case WATL_Tok_CarriageReturn: {
				// Treat a carriage return directly followed by a line-feed token as a single line break,
				// checking the bounds so the cursor can never step past the end of the slice.
				WATL_Tok* following = iter + 1;
				if (following != end && * following->code == WATL_Tok_LineFeed) {
					iter = following;
				}
			}
			// fallthrough
			case WATL_Tok_LineFeed: {
				WATL_Line* new_line = farena_push(a_lines, WATL_Line);
				* new_line = (WATL_Line){0};
				line->next = new_line;
				line       = new_line;
				tail       = nullptr;
				info->lines.len += 1;
				continue;
			}

			default:
			break;
		}

		WATL_Node* node = farena_push(a_nodes, WATL_Node);
		node->next  = nullptr;
		node->entry = cache_str8(opts->str_cache, watl_tok_str8(tokens, iter));
		if (tail == nullptr) {
			line->ptr = node;
		}
		else {
			tail->next = node;
		}
		tail = node;
	}
}
|
||||||
|
|
||||||
|
#endif DEMO__WATL_PARSE_V1
|
||||||
|
|
||||||
|
// Value-returning convenience wrapper around api_watl_parse (reached through the watl_parse macro).
// The result is zero-initialized first so that it is never returned with indeterminate members.
WATL_ParseInfo watl__parse(WATL_SliceTok tokens, Opts__watl_parse* opts) { WATL_ParseInfo info = {0}; api_watl_parse(& info, tokens, opts); return info; }
|
||||||
|
|
||||||
|
#ifdef DEMO__WATL_PARSE_V1
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
// This will limit for our V1 read to 64kb at most.
|
||||||
|
FMem_64KB read_mem = {0};
|
||||||
|
FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
|
||||||
|
|
||||||
|
// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
|
||||||
|
SliceMem mem_toks = slicemem_alloc(MEGABYTES(8));
|
||||||
|
WATL_LexInfo lex_res = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
|
||||||
|
|
||||||
|
SliceMem mem_cache_strs = slicemem_alloc(MEGABYTES(16));
|
||||||
|
SliceMem mem_cache_slots = slicemem_alloc(KILOBTYES(512));
|
||||||
|
SliceMem mem_cache_table = slicemem_alloc(KILOBTYES(64));
|
||||||
|
Str8Cache str_cache = str8cache_init(mem_cache_strs, mem_cache_slots, mem_cache_table);
|
||||||
|
|
||||||
|
SliceMem mem_parse_nodes = slicemem_alloc(MEGABYTES(4));
|
||||||
|
SliceMem mem_parse_lines = slicemem_alloc(MEGABYTES(4));
|
||||||
|
WATL_ParseInfo parse_res = watl_parse(lex_res.tokens, .backing_nodes = mem_parse_nodes, .backing_lines = mem_parse_lines, .str_cache = & str_cache);
|
||||||
|
|
||||||
|
// unnecessary in this case but if you want to explicitly:
|
||||||
|
slicemem_free(mem_toks);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user