From fb6d66140e4c8f1192cdd87a6663dc868638345a Mon Sep 17 00:00:00 2001
From: Ed_
Date: Tue, 5 Mar 2024 10:17:27 -0500
Subject: [PATCH] Iteration on whitespace parser

---
Reviewer note: the Space case in pws_parser_parse now builds PWS_AST_Spaces.
It previously built PWS_AST_Visible, so space tokens were stored under the
Visible variant. The change is confined to one added line; hunk headers and
the diffstat are unchanged.

 code/parser_whitespace.odin | 166 ++++++++++++++++++------------------
 1 file changed, 82 insertions(+), 84 deletions(-)

diff --git a/code/parser_whitespace.odin b/code/parser_whitespace.odin
index 3829204..a5d567c 100644
--- a/code/parser_whitespace.odin
+++ b/code/parser_whitespace.odin
@@ -2,7 +2,7 @@
 This is a prototype parser meant to only parse whitespace from visible blocks of code.
 Its meant to be the most minimal useful AST for boostrapping an AST Editor.
 
-All symbols related directly to the parser are prefixed with the WS_ namespace.
+All symbols related directly to the parser are prefixed with the PWS_ namespace.
 
 The AST is composed of the following node types:
 * Visible
@@ -44,7 +44,7 @@ Rune_Carriage_Return :: 'r'
 Rune_New_Line :: '\n'
 // Rune_Tab_Vertical :: '\v'
 
-WS_TokenType :: enum u32 {
+PWS_TokenType :: enum u32 {
 	Invalid,
 	Visible,
 	Space,
@@ -53,85 +53,85 @@ WS_TokenType :: enum u32 {
 	Count,
 }
 
-// TODO(Ed) : The runes and token arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The runes and token arrays should be handled by a slab allocator
 // This can grow in undeterministic ways, persistent will get very polluted otherwise.
-WS_LexResult :: struct {
+PWS_LexResult :: struct {
 	allocator : Allocator,
 	content : string,
 	runes : []rune,
-	tokens : Array(WS_Token),
+	tokens : Array(PWS_Token),
 }
 
-WS_Token :: struct {
-	type : WS_TokenType,
+PWS_Token :: struct {
+	type : PWS_TokenType,
 	line, column : u32,
 	ptr : ^rune,
 }
 
-WS_AST_Content :: union #no_nil {
-	[] WS_Token,
+PWS_AST_Content :: union #no_nil {
+	^PWS_Token,
 	[] rune,
 }
 
-WS_AST_Spaces :: struct {
-	content : WS_AST_Content,
+PWS_AST_Spaces :: struct {
+	content : PWS_AST_Content,
 
-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST_Tabs :: struct {
-	content : WS_AST_Content,
+PWS_AST_Tabs :: struct {
+	content : PWS_AST_Content,
 
-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST_Visible :: struct {
-	content : WS_AST_Content,
+PWS_AST_Visible :: struct {
+	content : PWS_AST_Content,
 
-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST_Line :: struct {
-	using content : DLL_NodeFL(WS_AST),
-	end_token : ^ WS_Token,
+PWS_AST_Line :: struct {
+	using content : DLL_NodeFL(PWS_AST),
+	end_token : ^ PWS_Token,
 
-	using links : DLL_NodePN(WS_AST),
+	using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST :: union #no_nil {
-	WS_AST_Visible,
-	WS_AST_Spaces,
-	WS_AST_Tabs,
-	WS_AST_Line,
+PWS_AST :: union #no_nil {
+	PWS_AST_Visible,
+	PWS_AST_Spaces,
+	PWS_AST_Tabs,
+	PWS_AST_Line,
 }
 
-WS_ParseError :: struct {
-	token : ^WS_Token,
+PWS_ParseError :: struct {
+	token : ^PWS_Token,
 	msg : string,
 }
 
-WS_ParseError_Max :: 32
-WS_NodeArray_ReserveSize :: Kilobyte * 4
-WS_LineArray_RserveSize :: Kilobyte
+PWS_ParseError_Max :: 32
+PWS_NodeArray_ReserveSize :: Kilobyte * 4
+PWS_LineArray_RserveSize :: Kilobyte
 
-// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs
 // This can grow in undeterministic ways, persistent will get very polluted otherwise.
-WS_ParseResult :: struct {
+PWS_ParseResult :: struct {
 	content : string,
 	runes : []rune,
-	tokens : Array(WS_Token),
-	nodes : Array(WS_AST),
-	lines : Array( ^WS_AST_Line),
-	errors : [WS_ParseError_Max] WS_ParseError,
+	tokens : Array(PWS_Token),
+	nodes : Array(PWS_AST),
+	lines : Array( ^PWS_AST_Line),
+	errors : [PWS_ParseError_Max] PWS_ParseError,
 }
 
 // @(private="file")
-// AST :: WS_AST
+// AST :: PWS_AST
 
-ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexResult, AllocatorError )
+pws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError )
 {
 	LexerData :: struct {
-		using result : WS_LexResult,
+		using result : PWS_LexResult,
 
 		head : [^] rune,
 		left : i32,
@@ -141,57 +141,60 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
 	using lexer : LexerData
 	context.user_ptr = & lexer
 
-	rune_type :: proc() -> WS_TokenType
+	rune_type :: proc() -> PWS_TokenType
 	{
 		using self := context_ext( LexerData)
 
 		switch (head[0])
 		{
 			case Rune_Space:
-				return WS_TokenType.Space
+				return PWS_TokenType.Space
 
 			case Rune_Tab:
-				return WS_TokenType.Tab
+				return PWS_TokenType.Tab
 
 			case Rune_New_Line:
-				return WS_TokenType.New_Line
+				return PWS_TokenType.New_Line
 
 			// Support for CRLF format
 			case Rune_Carriage_Return:
 			{
-				previous := cast( ^ rune) (uintptr(head) - 1)
-				if (previous ^) == Rune_New_Line {
-					return WS_TokenType.New_Line
+				if left - 1 == 0 {
+					return PWS_TokenType.Invalid
+				}
+				if head[1] == Rune_New_Line {
+					return PWS_TokenType.New_Line
 				}
 			}
 		}
 
 		// Everything that isn't the supported whitespace code points is considered 'visible'
 		// Eventually we should support other types of whitespace
-		return WS_TokenType.Visible
+		return PWS_TokenType.Visible
 	}
 
-	advance :: proc() -> WS_TokenType {
+	advance :: proc() -> PWS_TokenType {
 		using self := context_ext( LexerData)
 
 		head = head[1:]
 		left -= 1
 		column += 1
 		type := rune_type()
-		line += u32(type == WS_TokenType.New_Line)
+		line += u32(type == PWS_TokenType.New_Line)
 		return type
 	}
 
 	alloc_error : AllocatorError
 	runes, alloc_error = to_runes( content, allocator )
 	if alloc_error != AllocatorError.None {
+		ensure(false, "Failed to allocate runes from content")
 		return result, alloc_error
 	}
 
 	left = cast(i32) len(runes)
 	head = & runes[0]
 
-	tokens, alloc_error = array_init_reserve( WS_Token, allocator, u64(left / 2) )
+	tokens, alloc_error = array_init_reserve( PWS_Token, allocator, u64(left / 2) )
 	if alloc_error != AllocatorError.None {
 		ensure(false, "Failed to allocate token's array")
 		return result, alloc_error
@@ -202,7 +205,7 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
 
 	for ; left > 0;
 	{
-		current : WS_Token
+		current : PWS_Token
 		current.type = rune_type()
 		current.line = line
 		current.column = column
@@ -220,21 +223,21 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
 	return result, alloc_error
 }
 
-ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_ParseResult, AllocatorError )
+pws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError )
 {
 	ParseData :: struct {
-		using result : WS_ParseResult,
+		using result : PWS_ParseResult,
 
 		left : u32,
-		head : [^]WS_Token,
-		line : WS_AST_Line,
+		head : [^]PWS_Token,
+		line : PWS_AST_Line,
 	}
 
 	using parser : ParseData
 	context.user_ptr = & result
 
 	//region Helper procs
-	peek_next :: proc() -> ( ^WS_Token)
+	peek_next :: proc() -> ( ^PWS_Token)
 	{
 		using self := context_ext( ParseData)
 		if left - 1 == 0 {
@@ -244,14 +247,14 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 		return head[ 1: ]
 	}
 
-	check_next :: proc( expected : WS_TokenType ) -> b32 {
+	check_next :: proc( expected : PWS_TokenType ) -> b32 {
 		using self := context_ext( ParseData)
 
 		next := peek_next()
 		return next != nil && next.type == expected
 	}
 
-	advance :: proc( expected : WS_TokenType ) -> (^WS_Token)
+	advance :: proc( expected : PWS_TokenType ) -> (^PWS_Token)
 	{
 		using self := context_ext( ParseData)
 		next := peek_next()
@@ -267,7 +270,7 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 	}
 	//endregion Helper procs
 
-	lex, alloc_error := ws_parser_lex( content, allocator )
+	lex, alloc_error := pws_parser_lex( content, allocator )
 	if alloc_error != AllocatorError.None {
 
 	}
@@ -275,12 +278,12 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 	runes = lex.runes
 	tokens = lex.tokens
 
-	nodes, alloc_error = array_init_reserve( WS_AST, allocator, WS_NodeArray_ReserveSize )
+	nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize )
 	if alloc_error != AllocatorError.None {
 
 	}
 
-	lines, alloc_error = array_init_reserve( ^WS_AST_Line, allocator, WS_LineArray_RserveSize )
+	lines, alloc_error = array_init_reserve( ^PWS_AST_Line, allocator, PWS_LineArray_RserveSize )
 	if alloc_error != AllocatorError.None {
 
 	}
@@ -290,22 +293,17 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 	// Parse Line
 	for ; left > 0;
 	{
-		parse_content :: proc( $ Type : typeid, tok_type : WS_TokenType ) -> Type
+		parse_content :: proc( $ Type : typeid, tok_type : PWS_TokenType ) -> Type
 		{
 			using self := context_ext( ParseData)
 
-			ast : Type
-			start := head
-			end : [^]WS_Token
-
-			for ; check_next( WS_TokenType.Visible ); {
-				end = advance( tok_type )
-			}
-			ast.content = slice_ptr( start, ptr_sub( end, start ))
+			ast : Type
+			ast.content = cast( ^PWS_Token) head
+			advance( tok_type )
 			return ast
 		}
 
-		add_node :: proc( ast : WS_AST ) //-> ( should_return : b32 )
+		add_node :: proc( ast : PWS_AST ) //-> ( should_return : b32 )
 		{
 			using self := context_ext( ParseData)
 
@@ -313,42 +311,42 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
 			array_append( & nodes, ast )
 
 			if line.first == nil {
-				line.first = array_back( & nodes )
+				line.first = array_back( nodes )
 			}
 			else {
-				line.last = array_back( & nodes)
+				line.last = array_back( nodes)
 			}
 		}
 
 		// TODO(Ed) : Harden this
 		#partial switch head[0].type
 		{
-			case WS_TokenType.Visible:
+			case PWS_TokenType.Visible:
 			{
-				ast := parse_content( WS_AST_Visible, WS_TokenType.Visible )
+				ast := parse_content( PWS_AST_Visible, PWS_TokenType.Visible )
 				add_node( ast )
 			}
 
-			case WS_TokenType.Space:
+			case PWS_TokenType.Space:
 			{
-				ast := parse_content( WS_AST_Visible, WS_TokenType.Space )
+				ast := parse_content( PWS_AST_Spaces, PWS_TokenType.Space )
 				add_node( ast )
 			}
 
-			case WS_TokenType.Tab:
+			case PWS_TokenType.Tab:
 			{
-				ast := parse_content( WS_AST_Tabs, WS_TokenType.Tab )
+				ast := parse_content( PWS_AST_Tabs, PWS_TokenType.Tab )
 				add_node( ast )
 			}
 
-			case WS_TokenType.New_Line:
+			case PWS_TokenType.New_Line:
 			{
 				line.end_token = head
-				ast : WS_AST
+				ast : PWS_AST
 				ast = line
 				// TODO(Ed) : Harden This
 				array_append( & nodes, ast )
-				array_append( & lines, & array_back( & nodes).(WS_AST_Line) )
+				array_append( & lines, & array_back(nodes).(PWS_AST_Line) )
 				line = {}
 			}
 		}