Iteration on whitespace parser
This commit is contained in:
		| @@ -2,7 +2,7 @@ | ||||
| This is a prototype parser meant to only parse whitespace from visible blocks of code. | ||||
| It's meant to be the most minimal useful AST for bootstrapping an AST Editor. | ||||
|  | ||||
| All symbols related directly to the parser are prefixed with the WS_ namespace. | ||||
| All symbols related directly to the parser are prefixed with the PWS_ namespace. | ||||
|  | ||||
| The AST is composed of the following node types: | ||||
| * Visible | ||||
| @@ -44,7 +44,7 @@ Rune_Carriage_Return :: 'r' | ||||
| Rune_New_Line        :: '\n' | ||||
| // Rune_Tab_Vertical :: '\v' | ||||
|  | ||||
| WS_TokenType :: enum u32 { | ||||
| PWS_TokenType :: enum u32 { | ||||
| 	Invalid, | ||||
| 	Visible, | ||||
| 	Space, | ||||
| @@ -53,85 +53,85 @@ WS_TokenType :: enum u32 { | ||||
| 	Count, | ||||
| } | ||||
|  | ||||
| // TODO(Ed) : The runes and token arrays should be handled by a slab allocator dedicated to ASTs | ||||
| // TODO(Ed) : The runes and token arrays should be handled by a slab allocator | ||||
| // This can grow in nondeterministic ways, persistent will get very polluted otherwise. | ||||
| WS_LexResult :: struct { | ||||
| PWS_LexResult :: struct { | ||||
| 	allocator : Allocator, | ||||
| 	content   : string, | ||||
| 	runes     : []rune, | ||||
| 	tokens    : Array(WS_Token), | ||||
| 	tokens    : Array(PWS_Token), | ||||
| } | ||||
|  | ||||
| WS_Token :: struct { | ||||
| 	type         : WS_TokenType, | ||||
| PWS_Token :: struct { | ||||
| 	type         : PWS_TokenType, | ||||
| 	line, column : u32, | ||||
| 	ptr          : ^rune, | ||||
| } | ||||
|  | ||||
| WS_AST_Content :: union #no_nil { | ||||
| 	[] WS_Token, | ||||
| PWS_AST_Content :: union #no_nil { | ||||
| 	^PWS_Token, | ||||
| 	[] rune, | ||||
| } | ||||
|  | ||||
| WS_AST_Spaces :: struct { | ||||
| 	content : WS_AST_Content, | ||||
| PWS_AST_Spaces :: struct { | ||||
| 	content : PWS_AST_Content, | ||||
|  | ||||
| 	using links : DLL_NodePN(WS_AST), | ||||
| 	using links : DLL_NodePN(PWS_AST), | ||||
| } | ||||
|  | ||||
| WS_AST_Tabs :: struct { | ||||
| 	content : WS_AST_Content, | ||||
| PWS_AST_Tabs :: struct { | ||||
| 	content : PWS_AST_Content, | ||||
|  | ||||
| 	using links : DLL_NodePN(WS_AST), | ||||
| 	using links : DLL_NodePN(PWS_AST), | ||||
| } | ||||
|  | ||||
| WS_AST_Visible :: struct { | ||||
| 	content : WS_AST_Content, | ||||
| PWS_AST_Visible :: struct { | ||||
| 	content : PWS_AST_Content, | ||||
|  | ||||
| 	using links : DLL_NodePN(WS_AST), | ||||
| 	using links : DLL_NodePN(PWS_AST), | ||||
| } | ||||
|  | ||||
| WS_AST_Line :: struct { | ||||
| 	using content : DLL_NodeFL(WS_AST), | ||||
| 	end_token     : ^ WS_Token, | ||||
| PWS_AST_Line :: struct { | ||||
| 	using content : DLL_NodeFL(PWS_AST), | ||||
| 	end_token     : ^ PWS_Token, | ||||
|  | ||||
| 	using links : DLL_NodePN(WS_AST), | ||||
| 	using links : DLL_NodePN(PWS_AST), | ||||
| } | ||||
|  | ||||
| WS_AST :: union #no_nil { | ||||
| 	WS_AST_Visible, | ||||
| 	WS_AST_Spaces, | ||||
| 	WS_AST_Tabs, | ||||
| 	WS_AST_Line, | ||||
| PWS_AST :: union #no_nil { | ||||
| 	PWS_AST_Visible, | ||||
| 	PWS_AST_Spaces, | ||||
| 	PWS_AST_Tabs, | ||||
| 	PWS_AST_Line, | ||||
| } | ||||
|  | ||||
| WS_ParseError :: struct { | ||||
| 	token : ^WS_Token, | ||||
| PWS_ParseError :: struct { | ||||
| 	token : ^PWS_Token, | ||||
| 	msg   : string, | ||||
| } | ||||
|  | ||||
| WS_ParseError_Max        :: 32 | ||||
| WS_NodeArray_ReserveSize :: Kilobyte * 4 | ||||
| WS_LineArray_RserveSize  :: Kilobyte | ||||
| PWS_ParseError_Max        :: 32 | ||||
| PWS_NodeArray_ReserveSize :: Kilobyte * 4 | ||||
| PWS_LineArray_RserveSize  :: Kilobyte | ||||
|  | ||||
| // TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to ASTs | ||||
| // TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs | ||||
| // This can grow in nondeterministic ways, persistent will get very polluted otherwise. | ||||
| WS_ParseResult :: struct { | ||||
| PWS_ParseResult :: struct { | ||||
| 	content   : string, | ||||
| 	runes     : []rune, | ||||
| 	tokens    : Array(WS_Token), | ||||
| 	nodes     : Array(WS_AST), | ||||
| 	lines     : Array( ^WS_AST_Line), | ||||
| 	errors    : [WS_ParseError_Max] WS_ParseError, | ||||
| 	tokens    : Array(PWS_Token), | ||||
| 	nodes     : Array(PWS_AST), | ||||
| 	lines     : Array( ^PWS_AST_Line), | ||||
| 	errors    : [PWS_ParseError_Max] PWS_ParseError, | ||||
| } | ||||
|  | ||||
| // @(private="file") | ||||
| // AST :: WS_AST | ||||
| // AST :: PWS_AST | ||||
|  | ||||
| ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexResult, AllocatorError ) | ||||
| pws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError ) | ||||
| { | ||||
| 	LexerData :: struct { | ||||
| 		using result : WS_LexResult, | ||||
| 		using result : PWS_LexResult, | ||||
|  | ||||
| 		head   : [^] rune, | ||||
| 		left   : i32, | ||||
| @@ -141,57 +141,60 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes | ||||
| 	using lexer : LexerData | ||||
| 	context.user_ptr = & lexer | ||||
|  | ||||
| 	rune_type :: proc() -> WS_TokenType | ||||
| 	rune_type :: proc() -> PWS_TokenType | ||||
| 	{ | ||||
| 		using self := context_ext( LexerData) | ||||
|  | ||||
| 		switch (head[0]) | ||||
| 		{ | ||||
| 			case Rune_Space: | ||||
| 				return WS_TokenType.Space | ||||
| 				return PWS_TokenType.Space | ||||
|  | ||||
| 			case Rune_Tab: | ||||
| 				return WS_TokenType.Tab | ||||
| 				return PWS_TokenType.Tab | ||||
|  | ||||
| 			case Rune_New_Line: | ||||
| 				return WS_TokenType.New_Line | ||||
| 				return PWS_TokenType.New_Line | ||||
|  | ||||
| 			// Support for CRLF format | ||||
| 			case Rune_Carriage_Return: | ||||
| 			{ | ||||
| 				previous := cast( ^ rune) (uintptr(head) - 1) | ||||
| 				if (previous ^) == Rune_New_Line { | ||||
| 					return WS_TokenType.New_Line | ||||
| 				if left - 1 ==  0 { | ||||
| 					return PWS_TokenType.Invalid | ||||
| 				} | ||||
| 				if head[1] == Rune_New_Line { | ||||
| 					return PWS_TokenType.New_Line | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
|  | ||||
| 		// Everything that isn't one of the supported whitespace code points is considered 'visible' | ||||
| 		// Eventually we should support other types of whitespace | ||||
| 		return WS_TokenType.Visible | ||||
| 		return PWS_TokenType.Visible | ||||
| 	} | ||||
|  | ||||
| 	advance :: proc() -> WS_TokenType { | ||||
| 	advance :: proc() -> PWS_TokenType { | ||||
| 		using self := context_ext( LexerData) | ||||
|  | ||||
| 		head    = head[1:] | ||||
| 		left   -= 1 | ||||
| 		column += 1 | ||||
| 		type   := rune_type() | ||||
| 		line   += u32(type == WS_TokenType.New_Line) | ||||
| 		line   += u32(type == PWS_TokenType.New_Line) | ||||
| 		return type | ||||
| 	} | ||||
|  | ||||
| 	alloc_error : AllocatorError | ||||
| 	runes, alloc_error = to_runes( content, allocator ) | ||||
| 	if alloc_error != AllocatorError.None { | ||||
| 		ensure(false, "Failed to allocate runes from content") | ||||
| 		return result, alloc_error | ||||
| 	} | ||||
|  | ||||
| 	left = cast(i32) len(runes) | ||||
| 	head = & runes[0] | ||||
|  | ||||
| 	tokens, alloc_error = array_init_reserve( WS_Token, allocator, u64(left / 2) ) | ||||
| 	tokens, alloc_error = array_init_reserve( PWS_Token, allocator, u64(left / 2) ) | ||||
| 	if alloc_error != AllocatorError.None { | ||||
| 		ensure(false, "Failed to allocate token's array") | ||||
| 		return result, alloc_error | ||||
| @@ -202,7 +205,7 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes | ||||
|  | ||||
| 	for ; left > 0; | ||||
| 	{ | ||||
| 		current       : WS_Token | ||||
| 		current       : PWS_Token | ||||
| 		current.type   = rune_type() | ||||
| 		current.line   = line | ||||
| 		current.column = column | ||||
| @@ -220,21 +223,21 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes | ||||
| 	return result, alloc_error | ||||
| } | ||||
|  | ||||
| ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_ParseResult, AllocatorError ) | ||||
| pws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError ) | ||||
| { | ||||
| 	ParseData :: struct { | ||||
| 		using result :  WS_ParseResult, | ||||
| 		using result :  PWS_ParseResult, | ||||
|  | ||||
| 		left  : u32, | ||||
| 		head  : [^]WS_Token, | ||||
| 		line  : WS_AST_Line, | ||||
| 		head  : [^]PWS_Token, | ||||
| 		line  : PWS_AST_Line, | ||||
| 	} | ||||
|  | ||||
| 	using parser : ParseData | ||||
| 	context.user_ptr = & result | ||||
|  | ||||
| 	//region Helper procs | ||||
| 	peek_next :: proc() -> ( ^WS_Token) | ||||
| 	peek_next :: proc() -> ( ^PWS_Token) | ||||
| 	{ | ||||
| 		using self := context_ext( ParseData) | ||||
| 		if left - 1 ==  0 { | ||||
| @@ -244,14 +247,14 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse | ||||
| 		return head[ 1: ] | ||||
| 	} | ||||
|  | ||||
| 	check_next :: proc(  expected : WS_TokenType ) -> b32 { | ||||
| 	check_next :: proc(  expected : PWS_TokenType ) -> b32 { | ||||
| 		using self := context_ext( ParseData) | ||||
|  | ||||
| 		next := peek_next() | ||||
| 		return next != nil && next.type == expected | ||||
| 	} | ||||
|  | ||||
| 	advance :: proc( expected : WS_TokenType ) -> (^WS_Token) | ||||
| 	advance :: proc( expected : PWS_TokenType ) -> (^PWS_Token) | ||||
| 	{ | ||||
| 		using self := context_ext( ParseData) | ||||
| 		next := peek_next() | ||||
| @@ -267,7 +270,7 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse | ||||
| 	} | ||||
| 	//endregion Helper procs | ||||
|  | ||||
| 	lex, alloc_error := ws_parser_lex( content, allocator ) | ||||
| 	lex, alloc_error := pws_parser_lex( content, allocator ) | ||||
| 	if alloc_error != AllocatorError.None { | ||||
|  | ||||
| 	} | ||||
| @@ -275,12 +278,12 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse | ||||
| 	runes  = lex.runes | ||||
| 	tokens = lex.tokens | ||||
|  | ||||
| 	nodes, alloc_error = array_init_reserve( WS_AST, allocator, WS_NodeArray_ReserveSize ) | ||||
| 	nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize ) | ||||
| 	if alloc_error != AllocatorError.None { | ||||
|  | ||||
| 	} | ||||
|  | ||||
| 	lines, alloc_error = array_init_reserve( ^WS_AST_Line, allocator, WS_LineArray_RserveSize ) | ||||
| 	lines, alloc_error = array_init_reserve( ^PWS_AST_Line, allocator, PWS_LineArray_RserveSize ) | ||||
| 	if alloc_error != AllocatorError.None { | ||||
|  | ||||
| 	} | ||||
| @@ -290,22 +293,17 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse | ||||
| 	// Parse Line | ||||
| 	for ; left > 0; | ||||
| 	{ | ||||
| 		parse_content :: proc( $ Type : typeid, tok_type : WS_TokenType ) -> Type | ||||
| 		parse_content :: proc( $ Type : typeid, tok_type : PWS_TokenType ) -> Type | ||||
| 		{ | ||||
| 			using self := context_ext( ParseData) | ||||
|  | ||||
| 			ast   : Type | ||||
| 			start := head | ||||
| 			end   : [^]WS_Token | ||||
|  | ||||
| 			for ; check_next( WS_TokenType.Visible ); { | ||||
| 				end = advance( tok_type ) | ||||
| 			} | ||||
| 			ast.content = slice_ptr( start, ptr_sub( end, start )) | ||||
| 			ast : Type | ||||
| 			ast.content = cast( ^PWS_Token) head | ||||
| 			advance( tok_type ) | ||||
| 			return ast | ||||
| 		} | ||||
|  | ||||
| 		add_node :: proc( ast : WS_AST ) //-> ( should_return : b32 ) | ||||
| 		add_node :: proc( ast : PWS_AST ) //-> ( should_return : b32 ) | ||||
| 		{ | ||||
| 			using self := context_ext( ParseData) | ||||
|  | ||||
| @@ -313,42 +311,42 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse | ||||
| 			array_append( & nodes, ast ) | ||||
|  | ||||
| 			if line.first == nil { | ||||
| 				line.first = array_back( & nodes ) | ||||
| 				line.first = array_back( nodes ) | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				line.last = array_back( & nodes) | ||||
| 				line.last = array_back( nodes) | ||||
| 			} | ||||
| 		} | ||||
|  | ||||
| 		// TODO(Ed) : Harden this | ||||
| 		#partial switch head[0].type | ||||
| 		{ | ||||
| 			case WS_TokenType.Visible: | ||||
| 			case PWS_TokenType.Visible: | ||||
| 			{ | ||||
| 				ast := parse_content( WS_AST_Visible, WS_TokenType.Visible ) | ||||
| 				ast := parse_content( PWS_AST_Visible, PWS_TokenType.Visible ) | ||||
| 				add_node( ast ) | ||||
| 			} | ||||
| 			case WS_TokenType.Space: | ||||
| 			case PWS_TokenType.Space: | ||||
| 			{ | ||||
| 				ast := parse_content( WS_AST_Visible, WS_TokenType.Space ) | ||||
| 				ast := parse_content( PWS_AST_Visible, PWS_TokenType.Space ) | ||||
| 				add_node( ast ) | ||||
| 			} | ||||
| 			case WS_TokenType.Tab: | ||||
| 			case PWS_TokenType.Tab: | ||||
| 			{ | ||||
| 				ast := parse_content( WS_AST_Tabs, WS_TokenType.Tab ) | ||||
| 				ast := parse_content( PWS_AST_Tabs, PWS_TokenType.Tab ) | ||||
| 				add_node( ast ) | ||||
| 			} | ||||
| 			case WS_TokenType.New_Line: | ||||
| 			case PWS_TokenType.New_Line: | ||||
| 			{ | ||||
| 				line.end_token = head | ||||
|  | ||||
| 				ast : WS_AST | ||||
| 				ast : PWS_AST | ||||
| 				ast = line | ||||
|  | ||||
| 				// TODO(Ed) : Harden This | ||||
| 				array_append( & nodes, ast ) | ||||
| 				array_append( & lines, & array_back( & nodes).(WS_AST_Line) ) | ||||
| 				array_append( & lines, & array_back(nodes).(PWS_AST_Line) ) | ||||
| 				line = {} | ||||
| 			} | ||||
| 		} | ||||
|   | ||||
		Reference in New Issue
	
	Block a user