/*
An introduction to C11 with a string cache demo.
Attempts to showcase better conventions and constructs in C, as discovered from scouring the internet as of 2025.
*/
/*
Everything below is implemented within this single file.
Because of this, definitions are kept on a need-to-have basis and target only one vendor and toolchain:
we use almost no libraries and target only Windows 11 x64 with MSVC.
Even so, with enough time and pain, the constructs defined here and their dependencies can be abstracted into an ergonomic library for multiple targets.
The difference is mostly more preprocessor conditionals, and how wide a range of targets (and their age discrepancy) a library tries to support.
The more minimal, the less cruft.
Definitions appear linearly in the file, on demand as needed, since the file is meant to be read linearly.
This causes non-categorical organization, so it is harder to sift through if you want to see the definitions
related to a specific kind of data or operation (strings, memory, etc.).
*/
#if 0
int main()
{
VArena cache_arena; varena_init(cache_arena);
StrCache cache = strcache_init(varena_ainfo(cache_arena));
VArena file_arena; varena_init(file_arena);
Str8 path_text = lit("../demo.strcache.c");
FileContent text_file = file_read_contents(varena_ainfo(file_arena), path_text);
Arena ast_arena; arena_init(ast_arena);
WATL_ParseOps ops = { .str_cache = &cache, .node_backing = arena_ainfo(ast_arena) };
WATL_ParsedInfo parsed = watl_parse(text_file.content, ops);
watl_dbg_dump(parsed.root);
strcache_dbg_listing(cache);
return 0;
}
#endif
// Demo selection
// #define DEMO__STR_SLICE
// #define DEMO__FILE_READ_CONTENTS_V1
#define DEMO__WATL_LEX_V1
/*
The above makes use of the following core concepts to achieve its net result:
* Slices
* Arenas
* Generic Runtime Allocator Interface
* Hashing
Secondarily, to use the above sufficiently, the following are also utilized:
* Virtual Address Space
* Read/Write Files
* Lexing & Parsing
* Debug printing
TODO(Ed): Do we introduce gencpp in this?
*/
/*
The first thing we'll probably want is a way to deal with text effectively,
so we'll set up the minimum for that when dealing with immutable constructs.
*/
// We'll need some minimum set of dependencies to adequately define the constructs.
// ASSUMING MODERN MSVC TOOLCHAIN.
#include <stdarg.h>
#include <stddef.h>
#include <intrin.h>
#include <tmmintrin.h>
#include <wmmintrin.h>
#include <assert.h>
// #include <stdbool.h>
typedef unsigned __int8 U8;
typedef signed __int8 S8;
typedef unsigned __int16 U16;
typedef signed __int16 S16;
typedef unsigned __int32 U32;
typedef signed __int32 S32;
typedef unsigned __int64 U64;
typedef signed __int64 S64;
typedef size_t USIZE;
typedef ptrdiff_t SSIZE;
enum {
false,
true,
true_overflow,
};
typedef S8 B8;
typedef S16 B16;
typedef S32 B32;
// Common macros we'll use throughout this.
#define assert_bounds(point, start, end) do { \
SSIZE pos_point = cast(SSIZE, point); \
SSIZE pos_start = cast(SSIZE, start); \
SSIZE pos_end = cast(SSIZE, end); \
assert(pos_start <= pos_point); \
assert(pos_point <= pos_end); \
} while(0)
// Functional style cast
#define cast(type, data) ((type)(data))
#define pcast(type, data) * cast(type*, & data)
#define nullptr cast(void*, 0)
#define glue_(A, B) A ## B
#define glue(A, B) glue_(A,B)
// Enforces that size queries use the SSIZE type.
#define size_of(data) cast(SSIZE, sizeof(data))
#define stringify_(S) #S
#define stringify(S) stringify_(S)
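/*
A quick, non-compiled sketch of what these macros expand to. The declarations below are illustrative only
(FMem_16KB is typedef'd further down in the file):
*/
#if 0
glue(FMem_, 16KB) scratch; // token pasting: becomes FMem_16KB scratch;
char const* name = stringify(scratch); // stringification: becomes "scratch"
SSIZE size = size_of(scratch); // becomes cast(SSIZE, sizeof(scratch))
#endif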
/*
The first construct we'll utilize is a String Slice.
With the memory sizes available in modern programming, it is more ergonomic to track the length of a string alongside its pointer.
Most strings are not stored in some statically tracked immutable table, and the cost of carrying a length is negligible on modern hardware.
*/
typedef struct Str8 Str8;
struct Str8 {
char const* ptr;
SSIZE len;
};
// String literals in C include a null terminator; we aren't interested in preserving that in the length.
#define lit(string_literal) (Str8){ string_literal, size_of(string_literal) - 1 }
// For now this string can be visualized using a debugger.
#ifdef DEMO__STR_SLICE
int main()
{
Str8 first = lit("Our first string as a slice");
return 0;
}
#endif // DEMO__STR_SLICE
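/*
A non-compiled sketch of why slices are convenient: a sub-view of a string needs no copy and no null terminator,
just a pointer offset and a length. The values below are illustrative only.
*/
#if 0
Str8 full = lit("path/to/file.c");
Str8 fname = { full.ptr + 8, full.len - 8 }; // views "file.c" within the same memory
#endif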
/*
We now want to be able to read a file. This will be a heavy rabbit hole, as we'll need to set up a basic file interface
and related definitions for handling the memory.
For the initial definition we'll introduce fixed-size memory handling, statically allocated onto the stack.
*/
/*
First off, we need to find out how to acquire the contents of a file on Windows.
We'll wrap the operation in a procedure called file_read_contents. It will take a path and optional arguments (Opts__read_file_contents).
It will return a result in a composite struct, FileOpResult, which may be expanded as needed in the future.
*/
typedef struct FileOpResult FileOpResult;
typedef struct Opts__read_file_contents Opts__read_file_contents;
void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_contents* opts);
FileOpResult file__read_contents ( Str8 path, Opts__read_file_contents* opts);
#define file_read_contents(path, ...) file__read_contents(path, & (Opts__read_file_contents){__VA_ARGS__} )
/*
The file contents will be returned in bytes.
To view or manage any slice of bytes we'll be utilizing a byte slice.
*/
typedef struct SliceByte SliceByte;
struct SliceByte {
U8* ptr;
SSIZE len;
};
/*
To address memory we'll use a memory slice.
*/
typedef struct SliceMem SliceMem;
struct SliceMem {
void* ptr;
SSIZE len;
};
/*
The api_/wrapper pair above is a pattern that makes it entirely optional whether the result is written into caller-provided storage (api_file_read_contents)
or returned by value on the stack (file_read_contents).
It also allows default parameters to be defined conveniently via the options struct.
*/
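/*
A non-compiled sketch of how the file_read_contents wrapper macro is used and what it expands to.
Designated initializers fill the options struct and omitted fields are zero, which is how we get default arguments.
The path and the .zero_backing field below are only illustrative; V1's actual options are defined further down.
*/
#if 0
FileOpResult res = file_read_contents(lit("demo.txt"), .zero_backing = true);
// which expands to:
// file__read_contents(lit("demo.txt"), & (Opts__read_file_contents){ .zero_backing = true })
#endif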
// We'll utilize the ReadFile procedure within the WinAPI: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-readfile
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#define WIN32_MEAN_AND_LEAN
#define VC_EXTRALEAN
#include <windows.h>
#include <windowsx.h>
#include <timeapi.h>
#include <tlhelp32.h>
#include <Shlobj.h>
#include <processthreadsapi.h>
#pragma comment(lib, "user32")
#pragma comment(lib, "winmm")
#pragma comment(lib, "shell32")
#pragma comment(lib, "advapi32")
#pragma comment(lib, "rpcrt4")
#pragma comment(lib, "shlwapi")
#pragma comment(lib, "comctl32")
#pragma comment(linker,"\"/manifestdependency:type='win32' name='Microsoft.Windows.Common-Controls' version='6.0.0.0' processorArchitecture='*' publicKeyToken='6595b64144ccf1df' language='*'\"") // this is required for loading correct comctl32 dll file
#undef NOMINMAX
#undef WIN32_LEAN_AND_MEAN
#undef WIN32_MEAN_AND_LEAN
#undef VC_EXTRALEAN
#if 0
BOOL ReadFile(
[in] HANDLE hFile,
[out] LPVOID lpBuffer,
[in] DWORD nNumberOfBytesToRead,
[out, optional] LPDWORD lpNumberOfBytesRead,
[in, out, optional] LPOVERLAPPED lpOverlapped
);
// In order to read a file we need a handle to a valid filesystem entity to read from: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-createfilea
HANDLE CreateFileA(
[in] LPCSTR lpFileName,
[in] DWORD dwDesiredAccess,
[in] DWORD dwShareMode,
[in, optional] LPSECURITY_ATTRIBUTES lpSecurityAttributes,
[in] DWORD dwCreationDisposition,
[in] DWORD dwFlagsAndAttributes,
[in, optional] HANDLE hTemplateFile
);
#endif
// We need to convert our string slice to a C string for CreateFileA's path input.
#define KILOBTYES(n) (cast(SSIZE, n) << 10)
#define MEGABYTES(n) (cast(SSIZE, n) << 20)
#define GIGABYTES(n) (cast(SSIZE, n) << 30)
#define TERABYTES(n) (cast(SSIZE, n) << 40)
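/*
A quick worked example of the shift arithmetic above (purely illustrative values):
KILOBTYES(16) == 16 << 10 == 16384
MEGABYTES(8)  ==  8 << 20 == 8388608
*/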
/*
We'll define fixed-size memory blocks here using typedefs, on demand.
They will have the following format:
typedef U8 FMem_<size>KB [ <U8 amount> ];
*/
typedef U8 FMem_16KB [ KILOBTYES(16) ];
typedef U8 FMem_64KB [ KILOBTYES(64) ];
#define typeof __typeof__
#define fmem_slice(mem) (SliceMem) { mem, size_of(mem) }
// We'll be using an intrinsic for copying memory:
void* memory_copy(void* dest, void const* src, SSIZE length)
{
if (dest == nullptr || src == nullptr || length == 0) {
return nullptr;
}
// https://learn.microsoft.com/en-us/cpp/intrinsics/movsb?view=msvc-170
__movsb((unsigned char*)dest, (const unsigned char*)src, length);
return dest;
}
// Often we'll want to check validity of a slice:
#define slice_assert(slice) do { \
assert(slice.ptr != nullptr); \
assert(slice.len > 0); \
} while(0)
void slice_copy(SliceMem dest, SliceMem src) {
assert(dest.len >= src.len);
slice_assert(dest);
slice_assert(src);
memory_copy(dest.ptr, src.ptr, src.len);
}
// Assumes the backing memory is zeroed, so the copied string stays null-terminated.
char const* str8_to_cstr_capped(Str8 content, SliceMem mem) {
assert(mem.len >= content.len);
memory_copy(mem.ptr, content.ptr, content.len);
return mem.ptr;
}
// To support zeroing slices we'll utilize an intrinsic.
B32 memory_zero(void* dest, SSIZE length) {
if (dest == nullptr || length <= 0) {
return false;
}
// __stosb stores `length` bytes; __stosd would store `length` DWORDs and overrun the buffer.
__stosb((unsigned char*)dest, 0, length);
return true;
}
void slice_zero(SliceMem mem) {
slice_assert(mem);
memory_zero(mem.ptr, mem.len);
}
// Now for our "Version 1"
#if defined(DEMO__FILE_READ_CONTENTS_V1) || defined(DEMO__WATL_LEX_V1)
struct FileOpResult
{
// For now we'll just have the content
SliceByte content;
};
struct Opts__read_file_contents
{
// For now we'll just have the backing memory provided as a slice.
SliceMem backing;
// And whether we should zero the backing.
B32 zero_backing;
};
void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_contents* opts)
{
assert(result != nullptr);
assert(opts != nullptr);
slice_assert(path);
// Backing is required at this point
slice_assert(opts->backing);
// This will limit a V1 path to 16 KB worth of codepoints.
FMem_16KB scratch = {0};
char const* path_cstr = str8_to_cstr_capped(path, fmem_slice(scratch) );
HANDLE id_file = CreateFileA(
path_cstr,
GENERIC_READ,
FILE_SHARE_READ,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
NULL
);
B32 open_failed = id_file == INVALID_HANDLE_VALUE;
if (open_failed) {
DWORD error_code = GetLastError();
assert(error_code != 0);
return;
}
LARGE_INTEGER file_size = {0};
B32 get_size_failed = ! GetFileSizeEx(id_file, & file_size);
if (get_size_failed) {
CloseHandle(id_file);
return;
}
// Because we are currently using fixed-size memory, we need to confirm that we can hold this content.
B32 not_enough_backing = opts->backing.len < file_size.QuadPart;
if (not_enough_backing) {
// We don't provide a result if the backing can't hold the file.
result->content = (SliceByte){0};
CloseHandle(id_file);
return;
}
if (opts->zero_backing) {
slice_zero(opts->backing);
}
DWORD amount_read = 0;
BOOL read_result = ReadFile(
id_file,
opts->backing.ptr,
cast(DWORD, file_size.QuadPart),
& amount_read,
nullptr
);
CloseHandle(id_file);
B32 read_failed = ! read_result;
read_failed |= amount_read != file_size.QuadPart;
if (read_failed) {
return;
}
result->content.ptr = opts->backing.ptr;
result->content.len = file_size.QuadPart;
return;
}
#endif // DEMO__FILE_READ_CONTENTS_V1 || DEMO__WATL_LEX_V1
// Version agnostic code:
inline
FileOpResult file__read_contents(Str8 path, Opts__read_file_contents* opts) {
FileOpResult result = {0};
api_file_read_contents(& result, path, opts);
return result;
}
// And now to put it all together into a test run in the debugger. Content should be properly formatted if the code is correct.
#ifdef DEMO__FILE_READ_CONTENTS_V1
int main()
{
// This will limit our V1 read to 64 KB at most.
FMem_64KB read_mem = {0};
FileOpResult res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
return 0;
}
#endif // DEMO__FILE_READ_CONTENTS_V1
/*
Now that we have file reading done, we need to be able to process the content.
First we want to do lexical analysis, so we'll create a token listing delimiting the aspects of the text file relevant to us.
For our data structure we are going for a Whitespace-Aware Text Layout, where we'll track text and the formatting around it.
Just like with the read-file-contents operation, we'll define an interface for performing this analysis.
It will be called watl_lex and take the file's SliceMem as a Str8 slice along with some Opts__watl_lex,
returning a WATL_LexInfo that tells the user how the operation went.
*/
typedef struct WATL_LexInfo WATL_LexInfo;
typedef struct Opts__watl_lex Opts__watl_lex;
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts);
WATL_LexInfo watl__lex ( Str8 source, Opts__watl_lex* opts);
#define watl_lex(source, ...) watl__lex(source, &(Opts__watl_lex){__VA_ARGS__})
/*
Token identification will be done using a WATL_TokKind enumeration.
The token itself will just be a pointer to the start of its slice within the source; its kind is identified by the character it points at.
We can resolve the width of a token by its delta to the next token's start.
If it's the last token, its width is determined by its offset to the end of the source Str8 slice.
*/
typedef U32 WATL_TokKind;
enum WATL_TokKind {
WATL_Tok_Space = ' ',
WATL_Tok_Tab = '\t',
WATL_Tok_CarriageReturn = '\r',
WATL_Tok_LineFeed = '\n',
};
typedef struct WATL_Tok WATL_Tok;
struct WATL_Tok {
char const* code;
};
typedef struct WATL_SliceTok WATL_SliceTok;
struct WATL_SliceTok {
WATL_Tok* ptr;
SSIZE len;
};
Str8 watl_tok_str8(WATL_SliceTok* toks, WATL_Tok* tok, Str8 source) {
WATL_Tok* last = & toks->ptr[toks->len - 1];
Str8 text = {0};
text.ptr = tok->code;
text.len = (tok == last) ?
// The last token runs to the end of the source slice.
(source.ptr + source.len) - tok->code
// Otherwise its width is the delta to the next token's start.
: (tok + 1)->code - tok->code;
return text;
}
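/*
A non-compiled sketch of resolving each token's text after lexing.
The names lex_res and source are illustrative only; the tokens slice is filled in by the V1 lexer defined further down.
*/
#if 0
for (SSIZE idx = 0; idx < lex_res.tokens.len; ++idx) {
Str8 text = watl_tok_str8(& lex_res.tokens, & lex_res.tokens.ptr[idx], source);
// text now spans exactly this token's characters within the source.
}
#endif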
/*
Tokens are allocated to a backing slice of memory defined by the user. This pool of memory will ideally not be constrained to a fixed size on the stack.
So for V1 we'll allocate 8 MB of heap memory to act as a pool for the tokens. We'll keep track of how much of the pool we've used via a new memory-tracking construct:
the fixed-size arena.
A basic fixed-size arena only has three components, which can vary depending on the convention the user prefers.
In our case we'll track its starting address, its capacity, and how much has been committed.
*/
// We use this in conjunction with arenas to save a point that is safe for the user to rewind to.
typedef struct ArenaSP ArenaSP;
struct ArenaSP { void* ptr; };
#pragma region FArena
typedef struct FArena FArena;
struct FArena {
void* start;
SSIZE capacity;
SSIZE used;
};
void api_farena_init(FArena* arena, SliceMem mem);
FArena farena_init (SliceMem mem);
SliceMem farena__push (FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename);
void farena_reset (FArena* arena);
void farena_rewind (FArena* arena, ArenaSP savepoint);
ArenaSP farena_save (FArena arena);
#define farena_push(arena, type) farena__push(& arena, size_of(type), 1, lit(stringify(type)))
#define farena_push_array(arena, type, amount) farena__push(& arena, size_of(type), amount, lit(stringify(type)))
inline
void api_farena_init(FArena* arena, SliceMem mem) {
arena->start = mem.ptr;
arena->capacity = mem.len;
arena->used = 0;
}
inline FArena farena_init(SliceMem mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
inline
SliceMem farena__push(FArena* arena, SSIZE type_size, SSIZE amount, Str8 dbg_typename) {
SSIZE to_commit = type_size * amount;
SSIZE unused = arena->capacity - arena->used;
assert(to_commit <= unused);
SliceMem result = {0};
result.ptr = cast(void*, cast(SSIZE, arena->start) + arena->used);
result.len = to_commit;
arena->used += to_commit;
return result;
}
inline
void farena_rewind(FArena* arena, ArenaSP savepoint) {
void* end = cast(void*, cast(SSIZE, arena->start) + arena->used);
assert_bounds(savepoint.ptr, arena->start, end);
arena->used = cast(SSIZE, savepoint.ptr) - cast(SSIZE, arena->start);
}
inline void farena_reset(FArena* arena) { arena->used = 0; }
inline ArenaSP farena_save (FArena arena) { ArenaSP savepoint; savepoint.ptr = cast(void*, cast(SSIZE, arena.start) + arena.used); return savepoint; }
#pragma endregion FArena
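/*
A non-compiled sketch of how the FArena is meant to be driven. The names scratch and sp are illustrative only:
*/
#if 0
FMem_64KB scratch = {0};
FArena arena = farena_init(fmem_slice(scratch));
WATL_Tok* tok = farena_push(arena, WATL_Tok).ptr; // a single element
SliceMem toks = farena_push_array(arena, WATL_Tok, 64); // a contiguous array
ArenaSP sp = farena_save(arena); // remember the current offset
// ... temporary allocations ...
farena_rewind(& arena, sp); // discard everything pushed after the save point
farena_reset (& arena); // discard everything
#endif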
#ifdef DEMO__WATL_LEX_V1
struct WATL_LexInfo {
// For now just the tokens
WATL_SliceTok tokens;
};
struct Opts__watl_lex {
/*
For this operation we'll enforce that the arena must linearly allocate each token, forming a strictly adjacent set of elements in an array.
This is not necessary: an equivalent process could semi-contiguously organize the tokens into a linked list with a chained arena, or cache the tokens sparsely
so that their position within their originating string is not preserved. In this case we're keeping it simple: tokens live in the same block of memory and they don't use a string cache.
*/
SliceMem pool_toks;
};
// We are assuming the source is UTF-8; only the ASCII whitespace codepoints matter to the lexer.
void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
{
assert(info != nullptr);
slice_assert(source);
assert(opts != nullptr);
FArena arena = farena_init(opts->pool_toks);
char const* end = source.ptr + source.len;
char const* cursor = source.ptr;
char const* prev = source.ptr;
char code = * cursor;
B32 was_text = false;
WATL_Tok* tok = nullptr;
for (; cursor < end;)
{
code = * cursor;
switch (code)
{
case WATL_Tok_Space:
case WATL_Tok_Tab:
{
// A run of the same whitespace character collapses into a single token.
if (tok == nullptr || * prev != * cursor) {
tok = farena_push(arena, WATL_Tok).ptr;
tok->code = cursor;
was_text = false;
}
prev = cursor;
cursor += 1;
}
continue;
case WATL_Tok_CarriageReturn:
// Assumes a line feed immediately follows; the token starts at the carriage return.
// Intentional fallthrough.
case WATL_Tok_LineFeed:
{
// Each line ending gets its own token so the parser can rebuild lines later.
tok = farena_push(arena, WATL_Tok).ptr;
tok->code = cursor;
was_text = false;
prev = cursor;
cursor += (code == WATL_Tok_CarriageReturn) ? 2 : 1;
}
continue;
default:
break;
}
// Any other codepoint is visible text; a run of it collapses into a single token.
if (! was_text) {
tok = farena_push(arena, WATL_Tok).ptr;
tok->code = cursor;
was_text = true;
}
prev = cursor;
cursor += 1;
}
info->tokens.ptr = arena.start;
info->tokens.len = arena.used / size_of(WATL_Tok);
}
#endif // DEMO__WATL_LEX_V1
inline
WATL_LexInfo watl__lex(Str8 source, Opts__watl_lex* opts) {
WATL_LexInfo result = {0};
api_watl_lex(& result, source, opts);
return result;
}
/*
To allocate onto the heap we'll make a basic slicemem_alloc, along with a corresponding slicemem_free.
However, we don't strictly need the free for the V1 example: the OS will clean up the pages used by the process when it terminates.
*/
#include <stdlib.h> // for malloc/free
SliceMem slicemem_alloc(SSIZE amount)
{
assert(amount > KILOBTYES(4));
void* result = malloc(amount);
assert(result != nullptr);
SliceMem mem = {
.ptr = result,
.len = amount
};
return mem;
}
void slicemem_free(SliceMem mem) {
free(mem.ptr);
}
#ifdef DEMO__WATL_LEX_V1
int main()
{
// This will limit our V1 read to 64 KB at most.
FMem_64KB read_mem = {0};
FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
// This will limit our V1 lex to 8 MB worth of token tracking for the file.
SliceMem mem_toks = slicemem_alloc(MEGABYTES(8));
WATL_LexInfo lex_res = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
// Unnecessary in this case, but if you want to free explicitly:
slicemem_free(mem_toks);
return 0;
}
#endif
/*
Next we'll parse these tokens into a rudimentary WATL Abstract Syntax Tree.
* The tree will be organized at the top level by lines, each consisting of linked slices of visible and non-visible tokens.
We'll preserve whether a non-visible chunk is a tab or a series of spaces.
*/