Iteration on whitespace parser

2024-03-05 10:17:27 -05:00
parent 218af644d6
commit fb6d66140e
1 changed files with 82 additions and 84 deletions
--- a/code/parser_whitespace.odin
+++ b/code/parser_whitespace.odin
@@ -2,7 +2,7 @@
 This is a prototype parser meant to only parse whitespace from visible blocks of code.
 It's meant to be the most minimal useful AST for bootstrapping an AST Editor.

-All symbols related directly to the parser are prefixed with the WS_ namespace.
+All symbols related directly to the parser are prefixed with the PWS_ namespace.

 The AST is composed of the following node types:
 * Visible
@@ -44,7 +44,7 @@ Rune_Carriage_Return :: 'r'
 Rune_New_Line        :: '\n'
 // Rune_Tab_Vertical :: '\v'

-WS_TokenType :: enum u32 {
+PWS_TokenType :: enum u32 {
 	Invalid,
 	Visible,
 	Space,
@@ -53,85 +53,85 @@ WS_TokenType :: enum u32 {
 	Count,
 }

-// TODO(Ed) : The runes and token arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The runes and token arrays should be handled by a slab allocator
 // This can grow in non-deterministic ways; persistent will get very polluted otherwise.
-WS_LexResult :: struct {
+PWS_LexResult :: struct {
 	allocator : Allocator,
 	content   : string,
 	runes     : []rune,
-	tokens    : Array(WS_Token),
+	tokens    : Array(PWS_Token),
 }

-WS_Token :: struct {
-	type         : WS_TokenType,
+PWS_Token :: struct {
+	type         : PWS_TokenType,
 	line, column : u32,
 	ptr          : ^rune,
 }

-WS_AST_Content :: union #no_nil {
-	[] WS_Token,
+PWS_AST_Content :: union #no_nil {
+	^PWS_Token,
 	[] rune,
 }

-WS_AST_Spaces :: struct {
-	content : WS_AST_Content,
+PWS_AST_Spaces :: struct {
+	content : PWS_AST_Content,

-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }

-WS_AST_Tabs :: struct {
-	content : WS_AST_Content,
+PWS_AST_Tabs :: struct {
+	content : PWS_AST_Content,

-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }

-WS_AST_Visible :: struct {
-	content : WS_AST_Content,
+PWS_AST_Visible :: struct {
+	content : PWS_AST_Content,

-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }

-WS_AST_Line :: struct {
-	using content : DLL_NodeFL(WS_AST),
-	end_token     : ^ WS_Token,
+PWS_AST_Line :: struct {
+	using content : DLL_NodeFL(PWS_AST),
+	end_token     : ^ PWS_Token,

-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }

-WS_AST :: union #no_nil {
-	WS_AST_Visible,
-	WS_AST_Spaces,
-	WS_AST_Tabs,
-	WS_AST_Line,
+PWS_AST :: union #no_nil {
+	PWS_AST_Visible,
+	PWS_AST_Spaces,
+	PWS_AST_Tabs,
+	PWS_AST_Line,
 }

-WS_ParseError :: struct {
-	token : ^WS_Token,
+PWS_ParseError :: struct {
+	token : ^PWS_Token,
 	msg   : string,
 }

-WS_ParseError_Max        :: 32
-WS_NodeArray_ReserveSize :: Kilobyte * 4
-WS_LineArray_RserveSize  :: Kilobyte
+PWS_ParseError_Max        :: 32
+PWS_NodeArray_ReserveSize :: Kilobyte * 4
+PWS_LineArray_RserveSize  :: Kilobyte

-// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs
 // This can grow in non-deterministic ways; persistent will get very polluted otherwise.
-WS_ParseResult :: struct {
+PWS_ParseResult :: struct {
 	content   : string,
 	runes     : []rune,
-	tokens    : Array(WS_Token),
-	nodes     : Array(WS_AST),
-	lines     : Array( ^WS_AST_Line),
-	errors    : [WS_ParseError_Max] WS_ParseError,
+	tokens    : Array(PWS_Token),
+	nodes     : Array(PWS_AST),
+	lines     : Array( ^PWS_AST_Line),
+	errors    : [PWS_ParseError_Max] PWS_ParseError,
 }

 // @(private="file")
-// AST :: WS_AST
+// AST :: PWS_AST

-ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexResult, AllocatorError )
+pws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError )
 {
 	LexerData :: struct {
-		using result : WS_LexResult,
+		using result : PWS_LexResult,

 		head   : [^] rune,
 		left   : i32,
@@ -141,57 +141,60 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
 	using lexer : LexerData
 	context.user_ptr = & lexer

-	rune_type :: proc() -> WS_TokenType
+	rune_type :: proc() -> PWS_TokenType
 	{
 		using self := context_ext( LexerData)

 		switch (head[0])
 		{
 			case Rune_Space:
-				return WS_TokenType.Space
+				return PWS_TokenType.Space

 			case Rune_Tab:
-				return WS_TokenType.Tab
+				return PWS_TokenType.Tab

 			case Rune_New_Line:
-				return WS_TokenType.New_Line
+				return PWS_TokenType.New_Line

 			// Support for CRLF format
 			case Rune_Carriage_Return:
 			{
-				previous := cast( ^ rune) (uintptr(head) - 1)
-				if (previous ^) == Rune_New_Line {
-					return WS_TokenType.New_Line
+				if left - 1 ==  0 {
+					return PWS_TokenType.Invalid
+				}
+				if head[1] == Rune_New_Line {
+					return PWS_TokenType.New_Line
 				}
 			}
 		}

 		// Everything that isn't one of the supported whitespace code points is considered 'visible'
 		// Eventually we should support other types of whitespace
-		return WS_TokenType.Visible
+		return PWS_TokenType.Visible
 	}

-	advance :: proc() -> WS_TokenType {
+	advance :: proc() -> PWS_TokenType {
 		using self := context_ext( LexerData)

 		head    = head[1:]
 		left   -= 1
 		column += 1
 		type   := rune_type()
-		line   += u32(type == WS_TokenType.New_Line)
+		line   += u32(type == PWS_TokenType.New_Line)
 		return type
 	}

 	alloc_error : AllocatorError
 	runes, alloc_error = to_runes( content, allocator )
 	if alloc_error != AllocatorError.None {
+		ensure(false, "Failed to allocate runes from content")
 		return result, alloc_error
 	}

 	left = cast(i32) len(runes)
 	head = & runes[0]

-	tokens, alloc_error = array_init_reserve( WS_Token, allocator, u64(left / 2) )
+	tokens, alloc_error = array_init_reserve( PWS_Token, allocator, u64(left / 2) )
 	if alloc_error != AllocatorError.None {
 		ensure(false, "Failed to allocate token's array")
 		return result, alloc_error
@@ -202,7 +205,7 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes

 	for ; left > 0;
 	{
-		current       : WS_Token
+		current       : PWS_Token
 		current.type   = rune_type()
 		current.line   = line
 		current.column = column
@@ -220,21 +223,21 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
 	return result, alloc_error
 }

-ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_ParseResult, AllocatorError )
+pws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError )
 {
 	ParseData :: struct {
-		using result :  WS_ParseResult,
+		using result :  PWS_ParseResult,

 		left  : u32,
-		head  : [^]WS_Token,
-		line  : WS_AST_Line,
+		head  : [^]PWS_Token,
+		line  : PWS_AST_Line,
 	}

 	using parser : ParseData
 	context.user_ptr = & result

 	//region Helper procs
-	peek_next :: proc() -> ( ^WS_Token)
+	peek_next :: proc() -> ( ^PWS_Token)
 	{
 		using self := context_ext( ParseData)
 		if left - 1 ==  0 {
@@ -244,14 +247,14 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 		return head[ 1: ]
 	}

-	check_next :: proc(  expected : WS_TokenType ) -> b32 {
+	check_next :: proc(  expected : PWS_TokenType ) -> b32 {
 		using self := context_ext( ParseData)

 		next := peek_next()
 		return next != nil && next.type == expected
 	}

-	advance :: proc( expected : WS_TokenType ) -> (^WS_Token)
+	advance :: proc( expected : PWS_TokenType ) -> (^PWS_Token)
 	{
 		using self := context_ext( ParseData)
 		next := peek_next()
@@ -267,7 +270,7 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 	}
 	//endregion Helper procs

-	lex, alloc_error := ws_parser_lex( content, allocator )
+	lex, alloc_error := pws_parser_lex( content, allocator )
 	if alloc_error != AllocatorError.None {

 	}
@@ -275,12 +278,12 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 	runes  = lex.runes
 	tokens = lex.tokens

-	nodes, alloc_error = array_init_reserve( WS_AST, allocator, WS_NodeArray_ReserveSize )
+	nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize )
 	if alloc_error != AllocatorError.None {

 	}

-	lines, alloc_error = array_init_reserve( ^WS_AST_Line, allocator, WS_LineArray_RserveSize )
+	lines, alloc_error = array_init_reserve( ^PWS_AST_Line, allocator, PWS_LineArray_RserveSize )
 	if alloc_error != AllocatorError.None {

 	}
@@ -290,22 +293,17 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 	// Parse Line
 	for ; left > 0;
 	{
-		parse_content :: proc( $ Type : typeid, tok_type : WS_TokenType ) -> Type
+		parse_content :: proc( $ Type : typeid, tok_type : PWS_TokenType ) -> Type
 		{
 			using self := context_ext( ParseData)

-			ast   : Type
-			start := head
-			end   : [^]WS_Token
-
-			for ; check_next( WS_TokenType.Visible ); {
-				end = advance( tok_type )
-			}
-			ast.content = slice_ptr( start, ptr_sub( end, start ))
+			ast : Type
+			ast.content = cast( ^PWS_Token) head
+			advance( tok_type )
 			return ast
 		}

-		add_node :: proc( ast : WS_AST ) //-> ( should_return : b32 )
+		add_node :: proc( ast : PWS_AST ) //-> ( should_return : b32 )
 		{
 			using self := context_ext( ParseData)

@@ -313,42 +311,42 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 			array_append( & nodes, ast )

 			if line.first == nil {
-				line.first = array_back( & nodes )
+				line.first = array_back( nodes )
 			}
 			else
 			{
-				line.last = array_back( & nodes)
+				line.last = array_back( nodes)
 			}
 		}

 		// TODO(Ed) : Harden this
 		#partial switch head[0].type
 		{
-			case WS_TokenType.Visible:
+			case PWS_TokenType.Visible:
 			{
-				ast := parse_content( WS_AST_Visible, WS_TokenType.Visible )
+				ast := parse_content( PWS_AST_Visible, PWS_TokenType.Visible )
 				add_node( ast )
 			}
-			case WS_TokenType.Space:
+			case PWS_TokenType.Space:
 			{
-				ast := parse_content( WS_AST_Visible, WS_TokenType.Space )
+				ast := parse_content( PWS_AST_Visible, PWS_TokenType.Space )
 				add_node( ast )
 			}
-			case WS_TokenType.Tab:
+			case PWS_TokenType.Tab:
 			{
-				ast := parse_content( WS_AST_Tabs, WS_TokenType.Tab )
+				ast := parse_content( PWS_AST_Tabs, PWS_TokenType.Tab )
 				add_node( ast )
 			}
-			case WS_TokenType.New_Line:
+			case PWS_TokenType.New_Line:
 			{
 				line.end_token = head

-				ast : WS_AST
+				ast : PWS_AST
 				ast = line

 				// TODO(Ed) : Harden This
 				array_append( & nodes, ast )
-				array_append( & lines, & array_back( & nodes).(WS_AST_Line) )
+				array_append( & lines, & array_back(nodes).(PWS_AST_Line) )
 				line = {}
 			}
 		}