not using char const*

This commit is contained in:
Edward R. Gonzalez 2025-05-12 00:44:31 -04:00
parent 62beed20a9
commit 3094a51872

View File

@ -76,7 +76,6 @@ So we'll setup the the minimum for that when dealing with immutable constructs.
#include <wmmintrin.h>
#include <assert.h>
// #include <stdbool.h>
typedef unsigned __int8 U8;
typedef signed __int8 S8;
@ -130,15 +129,24 @@ In modern programming with the memory sizes utilized, it is more ergonomic to tr
Most strings are not stored in some immutable table tracked statically; the performance loss in doing so is negligible under modern hardware constraints.
*/
// UTF-8 character encoding unit
typedef unsigned char UTF8;
typedef struct Str8 Str8;
struct Str8 {
char const* ptr;
SSIZE len;
UTF8* ptr;
SSIZE len;
};
// String literals in C include null-terminators; we aren't interested in preserving that.
#define lit(string_literal) (Str8){ string_literal, size_of(string_literal) - 1 }
/*
We'll want all of our textual process to operate with UTF-8 code pages:
*/
#include <locale.h>
// Sets the process-wide locale so CRT text routines use the UTF-8 code page.
// NOTE(review): ".UTF-8" is the MSVC UCRT locale spelling — confirm behavior on non-Windows CRTs.
inline void set_utf8_codepage() { setlocale(LC_ALL, ".UTF-8"); }
// For now this string can be visualized using a debugger.
#ifdef DEMO__STR_SLICE
int main()
@ -178,15 +186,6 @@ struct SliceByte {
SSIZE len;
};
/*
To address memory we'll use a memory slice.
*/
typedef struct SliceMem SliceMem;
struct SliceMem {
void* ptr;
SSIZE len;
};
/*
The above is a pattern that can be provided so that whether or not the result is formatted and provided to the user via the stack is entirely optional.
It also allows for default parameters to be defined conveniently.
@ -254,7 +253,7 @@ typedef U8 FMem_16KB [ KILOBTYES(16) ];
typedef U8 FMem_64KB [ KILOBTYES(64) ];
#define typeof __typeof__
#define fmem_slice(mem) (SliceMem) { mem, size_of(mem) }
#define fmem_slice(mem) (SliceByte) { mem, size_of(mem) }
// We'll be using an intrinsic for copying memory:
void* memory_copy(void* dest, void const* src, USIZE length)
@ -273,19 +272,19 @@ void* memory_copy(void* dest, void const* src, USIZE length)
assert(slice.len > 0); \
} while(0)
void slice__copy(SliceMem dest, SSIZE const dest_typewidth, SliceMem const src, SSIZE const src_typewidth) {
// Copies src's bytes into dest; asserts dest is at least src.len bytes and both slices are valid.
// The *_typewidth parameters are unused in this body — presumably kept for debug/API symmetry; TODO confirm.
void slice__copy(SliceByte dest, SSIZE dest_typewidth, SliceByte src, SSIZE src_typewidth) {
assert(dest.len >= src.len);
slice_assert(dest);
slice_assert(src);
memory_copy(dest.ptr, src.ptr, src.len);
}
#define slice_copy(dest,src) slice__copy( \
(SliceMem ){(dest).ptr, (dest).len * size_of(*(dest).ptr)}, size_of(*(dest).ptr) \
, (SliceMem const){(src ).ptr, (src ).len * size_of(*(src ).ptr)}, size_of(*(src ).ptr) \
#define slice_copy(dest,src) slice__copy( \
(SliceByte){(dest).ptr, (dest).len * size_of(*(dest).ptr)}, size_of(*(dest).ptr) \
, (SliceByte){(src ).ptr, (src ).len * size_of(*(src ).ptr)}, size_of(*(src ).ptr) \
)
// Assumes memory is zeroed.
char const* str8_to_cstr_capped(Str8 content, SliceMem mem) {
char* str8_to_cstr_capped(Str8 content, SliceByte mem) {
assert(mem.len >= content.len);
memory_copy(mem.ptr, content.ptr, content.len);
return mem.ptr;
@ -300,11 +299,11 @@ B32 memory_zero(void* dest, USIZE const length) {
return true;
}
void slice__zero(SliceMem mem, SSIZE typewidth) {
// Zeroes every byte of the slice; typewidth is unused in this body (kept for macro symmetry — TODO confirm).
void slice__zero(SliceByte mem, SSIZE typewidth) {
slice_assert(mem);
memory_zero(mem.ptr, mem.len);
}
// Wrapper macro: scales len from element count to bytes via size_of the pointee before delegating.
#define slice_zero(slice) slice__zero((SliceMem){(slice).ptr, (slice).len * size_of(*(slice).ptr)}, size_of(*(slice).ptr))
#define slice_zero(slice) slice__zero((SliceByte){ cast(void*, (slice).ptr), (slice).len * size_of(*(slice).ptr)}, size_of(*(slice).ptr))
// Now for our "Version 1"
@ -319,7 +318,7 @@ struct FileOpResult
struct Opts__read_file_contents
{
// For now we'll just have the backing memory provided as a slice.
SliceMem backing;
SliceByte backing;
// And whether we should zero the backing.
B32 zero_backing;
};
@ -372,8 +371,8 @@ void api_file_read_contents(FileOpResult* result, Str8 path, Opts__read_file_con
slice_zero(pcast(SliceByte, opts->backing));
}
DWORD amount_read = 0;
BOOL read_result = ReadFile(
DWORD amount_read = 0;
BOOL read_result = ReadFile(
id_file,
opts->backing.ptr,
file_size.QuadPart,
@ -408,6 +407,8 @@ FileOpResult file__read_contents(Str8 path, Opts__read_file_contents* opts) {
#ifdef DEMO__FILE_READ_CONTENTS_V1
int main()
{
set_utf8_codepage();
// This will limit for our V1 read to 64kb at most.
FMem_64KB read_mem = {0};
FileOpResult res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
@ -422,7 +423,7 @@ First we want to do lexical analysis. So we'll create a token listing delimiting
For our data structure, we are going for a Whitespace-Aware Text Layout; where we'll track text and the formatting around them.
Just like with the read file contents operation, we'll define an interface to performing this analysis.
It will be called watl_lex and take the SliceMem from the file as a Str8 slice and some Opts__watl_lex;
It will be called watl_lex and take the SliceByte from the file as a Str8 slice and some Opts__watl_lex;
returning a WATL_LexInfo for providing user info on how the operation went.
*/
@ -450,7 +451,7 @@ enum WATL_TokKind {
typedef struct WATL_Tok WATL_Tok;
struct WATL_Tok {
char const* code;
UTF8* code;
};
typedef struct WATL_SliceTok WATL_SliceTok;
@ -495,8 +496,8 @@ struct FArena {
USIZE capacity;
USIZE used;
};
void api_farena_init(FArena* arena, SliceMem mem);
FArena farena_init (SliceMem mem);
void api_farena_init(FArena* arena, SliceByte mem);
FArena farena_init (SliceByte mem);
void* farena__push (FArena* arena, USIZE type_size, USIZE amount, Str8 dbg_typename);
void farena_reset (FArena* arena);
void farena_rewind (FArena* arena, ArenaSP savepoint);
@ -506,12 +507,12 @@ ArenaSP farena_save (FArena arena);
#define farena_push_array(arena, type, amount) (Slice ## type){ farena__push(& arena, size_of(type), amount, lit(stringify(type))), amount }
// Initializes a fixed arena over the given backing slice; usage counter starts at zero.
inline
void api_farena_init(FArena* arena, SliceMem mem) {
void api_farena_init(FArena* arena, SliceByte mem) {
arena->start = mem.ptr;
arena->capacity = mem.len;
arena->used = 0;
}
// Value-returning convenience wrapper over api_farena_init.
inline FArena farena_init(SliceMem mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
inline FArena farena_init(SliceByte mem) { FArena arena; api_farena_init(& arena, mem); return arena; }
inline
void* farena__push(FArena* arena, USIZE type_size, USIZE amount, Str8 dbg_typename) {
@ -547,7 +548,7 @@ struct Opts__watl_lex {
This is not necessary and an equivalent process could be done where the tokens instead are semi-contiguously organized into a linked list with a chained arena, or the tokens are sparsely cached.
Where their position in their originating string is not preserved. In this case we're keeping it simple. Tokens are in the same block of memory and they don't use a string cache.
*/
SliceMem pool_toks;
SliceByte pool_toks;
};
// We are assuming everything is utf8-ascii.
@ -559,10 +560,10 @@ void api_watl_lex(WATL_LexInfo* info, Str8 source, Opts__watl_lex* opts)
FArena arena = farena_init(opts->pool_toks);
char const* end = source.ptr + source.len;
char const* cursor = source.ptr;
char const* prev = source.ptr;
char code = * cursor;
UTF8* end = source.ptr + source.len;
UTF8* cursor = source.ptr;
UTF8* prev = source.ptr;
UTF8 code = * cursor;
B32 was_formatting = true;
WATL_Tok* tok = nullptr;
@ -631,30 +632,32 @@ To allocate onto the heap we'll make a basic slicemem_malloc to allocate, we'll
However we don't need to use it for the V1 example. The OS will cleanup the pages used by the process during its termination.
*/
// Heap-allocates a byte slice of `amount` bytes via malloc; allocation failure trips an assert.
// NOTE(review): the guard requires amount strictly greater than 4 KB — confirm that minimum is intended.
SliceMem slicemem_alloc(USIZE amount)
SliceByte slicemem_alloc(USIZE amount)
{
assert(amount > KILOBTYES(4));
void* result = malloc(amount);
assert(result != nullptr);
SliceMem mem = {
SliceByte mem = {
.ptr = result,
.len = amount
};
return mem;
}
// Releases a slice previously obtained from slicemem_alloc.
void slicemem_free(SliceMem mem) {
void slicemem_free(SliceByte mem) {
free(mem.ptr);
}
#ifdef DEMO__WATL_LEX_V1
int main()
{
set_utf8_codepage();
// This will limit for our V1 read to 64kb at most.
FMem_64KB read_mem = {0};
FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
SliceMem mem_toks = slicemem_alloc(MEGABYTES(8));
SliceByte mem_toks = slicemem_alloc(MEGABYTES(8));
WATL_LexInfo lex_res = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
// unnecessary in this case but if you want to explicitly:
slicemem_free(mem_toks);
@ -679,8 +682,8 @@ For the sake of the exercise, we'll be eliminating the association with the file
*/
#pragma region Str8Cache
typedef struct Str8Cache Str8Cache;
void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
Str8Cache str8cache_init ( SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table);
void api_str8cache_init(Str8Cache* cache, SliceByte mem_strs, SliceByte mem_slots, SliceByte mem_table);
Str8Cache str8cache_init ( SliceByte mem_strs, SliceByte mem_slots, SliceByte mem_table);
// A cache like this relies on tabling string entries utilizing an index derived from a hashed ID.
// For these strings we'll be using a hash called djb8:
@ -689,7 +692,7 @@ Str8Cache str8cache_init ( SliceMem mem_strs, SliceMem mem_s
#define slice_iter(container, iter) typeof((container).ptr) iter = (container).ptr; iter != ((container).ptr + (container).len); ++ iter
inline
void hash64_djb8(U64* hash, SliceByte const bytes) {
void hash64_djb8(U64* hash, SliceByte bytes) {
for (U8 const* elem = bytes.ptr; elem != (bytes.ptr + bytes.len); ++ elem) {
*hash = (((*hash) << 8) + (*hash)) + (*elem);
}
@ -725,16 +728,16 @@ struct Str8Cache {
Str8Cache_SliceSlot table;
};
// Value-returning convenience wrapper over api_str8cache_init.
Str8Cache str8cache_init(SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) { Str8Cache cache; api_str8cache_init(& cache, mem_strs, mem_slots, mem_table); return cache; }
Str8Cache str8cache_init(SliceByte mem_strs, SliceByte mem_slots, SliceByte mem_table) { Str8Cache cache; api_str8cache_init(& cache, mem_strs, mem_slots, mem_table); return cache; }
inline
void api_str8cache_init(Str8Cache* cache, SliceMem mem_strs, SliceMem mem_slots, SliceMem mem_table) {
// Initializes the cache: a string-storage arena plus a slot pool and lookup table, both zeroed.
void api_str8cache_init(Str8Cache* cache, SliceByte mem_strs, SliceByte mem_slots, SliceByte mem_table) {
assert(cache != nullptr);
slice_assert(mem_strs);
slice_assert(mem_slots);
slice_assert(mem_table);
cache->a_str = farena_init(mem_strs);
// Slice lengths are converted from byte counts to slot counts by dividing by the slot size.
cache->pool = (Str8Cache_SliceSlot){mem_slots.ptr, mem_slots.len / size_of(Str8Cache_Slot)};
cache->table = (Str8Cache_SliceSlot){mem_table.ptr, mem_table.len / size_of(Str8Cache_Slot)};
cache->pool = (Str8Cache_SliceSlot){ cast(void*, mem_slots.ptr), mem_slots.len / size_of(Str8Cache_Slot)};
cache->table = (Str8Cache_SliceSlot){ cast(void*, mem_table.ptr), mem_table.len / size_of(Str8Cache_Slot)};
slice_zero(cache->pool);
slice_zero(cache->table);
}
@ -792,8 +795,8 @@ Str8* str8cache_set(Str8Cache* cache, U64 key, Str8 value)
if (! surface_slot->occupied || surface_slot->key == key)
{
if (value.ptr != surface_slot->value.ptr) {
SliceMem mem = { farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8")), value.len };
slice_copy(pcast(SliceByte, mem), value);
SliceByte mem = { farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8")), value.len };
slice_copy(mem, value);
surface_slot->value = pcast(Str8, mem);
}
surface_slot->key = key;
@ -818,8 +821,8 @@ Str8* str8cache_set(Str8Cache* cache, U64 key, Str8 value)
if ( ! slot->next->occupied || slot->next->key == key)
{
if (value.ptr != slot->next->value.ptr) {
SliceMem mem = { farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8")), value.len };
slice_copy(pcast(SliceByte, mem), value);
SliceByte mem = { farena__push(& cache->a_str, size_of(U8), value.len, lit("Str8")), value.len };
slice_copy(mem, value);
slot->next->value = (Str8){mem.ptr, mem.len / size_of(char)};
}
slot->next->value = value;
@ -869,8 +872,8 @@ struct WATL_SliceLine {
#if defined(DEMO__WATL_PARSE_V1) || defined(DEMO__WATL_DUMP_V1)
struct Opts__watl_parse {
SliceMem backing_nodes;
SliceMem backing_lines;
SliceByte backing_nodes;
SliceByte backing_lines;
Str8Cache* str_cache;
};
@ -928,21 +931,23 @@ WATL_ParseInfo watl__parse(WATL_SliceTok tokens, Opts__watl_parse* opts) { WATL_
#ifdef DEMO__WATL_PARSE_V1
int main()
{
set_utf8_codepage();
// This will limit for our V1 read to 64kb at most.
FMem_64KB read_mem = {0};
FileOpResult read_res = file_read_contents(lit("demo.str_cache.c"), .backing = fmem_slice(read_mem) );
// This will limit our V1 lex to only 8 megs worth of token tracking on a file.
SliceMem mem_toks = slicemem_alloc(MEGABYTES(8));
SliceByte mem_toks = slicemem_alloc(MEGABYTES(8));
WATL_LexInfo lex_res = watl_lex(pcast(Str8, read_res.content), .pool_toks = mem_toks);
SliceMem mem_cache_strs = slicemem_alloc(MEGABYTES(64));
SliceMem mem_cache_slots = slicemem_alloc(1024 * 1024 * 16 * size_of(Str8Cache_SliceSlot));
SliceMem mem_cache_table = slicemem_alloc(1024 * 16 * size_of(Str8Cache_SliceSlot));
SliceByte mem_cache_strs = slicemem_alloc(MEGABYTES(64));
SliceByte mem_cache_slots = slicemem_alloc(1024 * 1024 * 16 * size_of(Str8Cache_SliceSlot));
SliceByte mem_cache_table = slicemem_alloc(1024 * 16 * size_of(Str8Cache_SliceSlot));
Str8Cache str_cache = str8cache_init(mem_cache_strs, mem_cache_slots, mem_cache_table);
SliceMem mem_parse_nodes = slicemem_alloc(MEGABYTES(4));
SliceMem mem_parse_lines = slicemem_alloc(MEGABYTES(4));
SliceByte mem_parse_nodes = slicemem_alloc(MEGABYTES(4));
SliceByte mem_parse_lines = slicemem_alloc(MEGABYTES(4));
WATL_ParseInfo parse_res = watl_parse(lex_res.tokens, .backing_nodes = mem_parse_nodes, .backing_lines = mem_parse_lines, .str_cache = & str_cache);
// unnecessary in this case but if you want to explicitly:
@ -960,24 +965,24 @@ We'll be utilizing a new construct called a string generator which be tied to al
typedef struct Str8Gen Str8Gen;
struct Str8Gen {
SliceMem backing; // For V1 the backing buffer is fixed size.
SliceByte backing; // For V1 the backing buffer is fixed size.
char* ptr;
SSIZE len;
};
void str8gen_init(Str8Gen* gen, SliceMem backing);
Str8Gen str8gen_make( SliceMem backing);
void str8gen_init(Str8Gen* gen, SliceByte backing);
Str8Gen str8gen_make( SliceByte backing);
void str8gen_append_str8(Str8Gen* gen, Str8 str);
// void str8gen_append_fmt (Str8Gen* gen, Str8 fmt, ...);
// Binds a generator to its fixed backing buffer; the write cursor starts at offset 0.
void str8gen_init(Str8Gen* gen, SliceMem backing) {
void str8gen_init(Str8Gen* gen, SliceByte backing) {
assert(gen != nullptr);
gen->backing = backing;
gen->ptr = backing.ptr;
gen->len = 0;
}
// Value-returning convenience wrapper over str8gen_init.
Str8Gen str8gen_make(SliceMem backing) { Str8Gen gen; str8gen_init(& gen, backing); return gen; }
Str8Gen str8gen_make(SliceByte backing) { Str8Gen gen; str8gen_init(& gen, backing); return gen; }
void str8gen_append_str8(Str8Gen* gen, Str8 str) {
SSIZE left = gen->backing.len - gen->len;
@ -1010,7 +1015,7 @@ struct SliceFmtTokEntry {
/*
This is a token-substituting formatter using an array-table lookup for tokens to substitute.
*/
Str8 fmt_vtoken_slice(SliceMem buffer, SliceFmtTokEntry tokens, Str8 fmt_template)
Str8 fmt_vtoken_slice(SliceByte buffer, SliceFmtTokEntry tokens, Str8 fmt_template)
{
slice_assert(buffer);
slice_assert(tokens);
@ -1047,7 +1052,7 @@ Str8 fmt_vtoken_slice(SliceMem buffer, SliceFmtTokEntry tokens, Str8 fmt_templat
}
// Hashing the potential token and cross checking it with our token table
U64 key = 0; hash64_djb8(& key, (SliceByte){cursor_fmt + 1, potential_token_length});
U64 key = 0; hash64_djb8(& key, (SliceByte){ cast(void*, cursor_fmt + 1), potential_token_length});
Str8* value = nullptr;
for (slice_iter(tokens, token))
{
@ -1097,7 +1102,7 @@ struct SliceStr8 {
SSIZE len;
};
Str8 fmt__vtoken(SliceMem backing, Str8 fmt_template, SliceStr8* tokens)
Str8 fmt__vtoken(SliceByte backing, Str8 fmt_template, SliceStr8* tokens)
{
FArena a_backing = farena_init(backing);
SliceFmtTokEntry table = {a_backing.start, 0};
@ -1106,12 +1111,12 @@ Str8 fmt__vtoken(SliceMem backing, Str8 fmt_template, SliceStr8* tokens)
for (slice_iter(*tokens, token)) {
FmtTokEntry* entry = farena_push(a_backing, FmtTokEntry);
* entry = (FmtTokEntry){0};
hash64_djb8(& entry->key, (SliceByte){token->ptr, token->len});
hash64_djb8(& entry->key, (SliceByte){cast(void*, token->ptr), token->len});
++ token;
entry->value = * token;
++ table.len;
}
SliceMem buffer = { .ptr = cast(U8*, a_backing.start) + a_backing.used, .len = a_backing.capacity - a_backing.used };
SliceByte buffer = { .ptr = cast(U8*, a_backing.start) + a_backing.used, .len = a_backing.capacity - a_backing.used };
Str8 result = fmt_vtoken_slice(buffer, table, fmt_template);
return result;
}
@ -1132,7 +1137,9 @@ Str8 mappings [][2] = {
#ifdef DEMO__WATL_DUMP_V1
int main()
{
SliceMem scratch = slicemem_alloc(MEGABYTES(64));
set_utf8_codepage();
SliceByte scratch = slicemem_alloc(MEGABYTES(64));
Str8 subst_table [][2] = {
fmt_vtoken_entry("maybe_sub", "IT SUBST!!!"),
};