Iteration on whitespace parser

Edward R. Gonzalez 2024-03-05 10:17:27 -05:00
parent 218af644d6
commit fb6d66140e

@@ -2,7 +2,7 @@
 This is a prototype parser meant to only parse whitespace from visible blocks of code.
 It's meant to be the most minimal useful AST for bootstrapping an AST Editor.
-All symbols related directly to the parser are prefixed with the WS_ namespace.
+All symbols related directly to the parser are prefixed with the PWS_ namespace.
 The AST is composed of the following node types:
 * Visible
@@ -44,7 +44,7 @@ Rune_Carriage_Return :: '\r'
 Rune_New_Line :: '\n'
 // Rune_Tab_Vertical :: '\v'
-WS_TokenType :: enum u32 {
+PWS_TokenType :: enum u32 {
     Invalid,
     Visible,
     Space,
@@ -53,85 +53,85 @@ WS_TokenType :: enum u32 {
     Count,
 }
-// TODO(Ed) : The runes and token arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The runes and token arrays should be handled by a slab allocator
 // This can grow in nondeterministic ways, persistent will get very polluted otherwise.
-WS_LexResult :: struct {
+PWS_LexResult :: struct {
     allocator : Allocator,
     content : string,
     runes : []rune,
-    tokens : Array(WS_Token),
+    tokens : Array(PWS_Token),
 }
-WS_Token :: struct {
-    type : WS_TokenType,
+PWS_Token :: struct {
+    type : PWS_TokenType,
     line, column : u32,
     ptr : ^rune,
 }
-WS_AST_Content :: union #no_nil {
-    [] WS_Token,
+PWS_AST_Content :: union #no_nil {
+    ^PWS_Token,
     [] rune,
 }
-WS_AST_Spaces :: struct {
-    content : WS_AST_Content,
+PWS_AST_Spaces :: struct {
+    content : PWS_AST_Content,
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
-WS_AST_Tabs :: struct {
-    content : WS_AST_Content,
+PWS_AST_Tabs :: struct {
+    content : PWS_AST_Content,
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
-WS_AST_Visible :: struct {
-    content : WS_AST_Content,
+PWS_AST_Visible :: struct {
+    content : PWS_AST_Content,
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
-WS_AST_Line :: struct {
-    using content : DLL_NodeFL(WS_AST),
-    end_token : ^ WS_Token,
+PWS_AST_Line :: struct {
+    using content : DLL_NodeFL(PWS_AST),
+    end_token : ^ PWS_Token,
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
-WS_AST :: union #no_nil {
-    WS_AST_Visible,
-    WS_AST_Spaces,
-    WS_AST_Tabs,
-    WS_AST_Line,
+PWS_AST :: union #no_nil {
+    PWS_AST_Visible,
+    PWS_AST_Spaces,
+    PWS_AST_Tabs,
+    PWS_AST_Line,
 }
-WS_ParseError :: struct {
-    token : ^WS_Token,
+PWS_ParseError :: struct {
+    token : ^PWS_Token,
     msg : string,
 }
-WS_ParseError_Max :: 32
-WS_NodeArray_ReserveSize :: Kilobyte * 4
-WS_LineArray_RserveSize :: Kilobyte
+PWS_ParseError_Max :: 32
+PWS_NodeArray_ReserveSize :: Kilobyte * 4
+PWS_LineArray_RserveSize :: Kilobyte
-// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs
 // This can grow in nondeterministic ways, persistent will get very polluted otherwise.
-WS_ParseResult :: struct {
+PWS_ParseResult :: struct {
     content : string,
     runes : []rune,
-    tokens : Array(WS_Token),
-    nodes : Array(WS_AST),
-    lines : Array( ^WS_AST_Line),
-    errors : [WS_ParseError_Max] WS_ParseError,
+    tokens : Array(PWS_Token),
+    nodes : Array(PWS_AST),
+    lines : Array( ^PWS_AST_Line),
+    errors : [PWS_ParseError_Max] PWS_ParseError,
 }
 // @(private="file")
-// AST :: WS_AST
+// AST :: PWS_AST
-ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexResult, AllocatorError )
+pws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError )
 {
     LexerData :: struct {
-        using result : WS_LexResult,
+        using result : PWS_LexResult,
         head : [^] rune,
         left : i32,
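
An aside on the content union above: after this change a node references either a single token or a slice of raw runes. A minimal sketch of inspecting it, using the names from the diff (the proc itself is illustrative only, not part of the commit):

    content_column :: proc( content : PWS_AST_Content ) -> u32 {
        switch value in content {
            case ^PWS_Token:
                return value.column // node is backed by exactly one token
            case [] rune:
                return 0            // raw runes carry no position info
        }
        return 0
    }
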
@@ -141,57 +141,60 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
     using lexer : LexerData
     context.user_ptr = & lexer
-    rune_type :: proc() -> WS_TokenType
+    rune_type :: proc() -> PWS_TokenType
     {
         using self := context_ext( LexerData)
         switch (head[0])
         {
             case Rune_Space:
-                return WS_TokenType.Space
+                return PWS_TokenType.Space
             case Rune_Tab:
-                return WS_TokenType.Tab
+                return PWS_TokenType.Tab
             case Rune_New_Line:
-                return WS_TokenType.New_Line
+                return PWS_TokenType.New_Line
             // Support for CRLF format
             case Rune_Carriage_Return:
             {
-                previous := cast( ^ rune) (uintptr(head) - 1)
-                if (previous ^) == Rune_New_Line {
-                    return WS_TokenType.New_Line
+                if left - 1 == 0 {
+                    return PWS_TokenType.Invalid
+                }
+                if head[1] == Rune_New_Line {
+                    return PWS_TokenType.New_Line
                 }
             }
         }
         // Everything that isn't the supported whitespace code points is considered 'visible'
         // Eventually we should support other types of whitespace
-        return WS_TokenType.Visible
+        return PWS_TokenType.Visible
     }
-    advance :: proc() -> WS_TokenType {
+    advance :: proc() -> PWS_TokenType {
         using self := context_ext( LexerData)
         head = head[1:]
         left -= 1
         column += 1
         type := rune_type()
-        line += u32(type == WS_TokenType.New_Line)
+        line += u32(type == PWS_TokenType.New_Line)
         return type
     }
     alloc_error : AllocatorError
     runes, alloc_error = to_runes( content, allocator )
     if alloc_error != AllocatorError.None {
         ensure(false, "Failed to allocate runes from content")
         return result, alloc_error
     }
     left = cast(i32) len(runes)
     head = & runes[0]
-    tokens, alloc_error = array_init_reserve( WS_Token, allocator, u64(left / 2) )
+    tokens, alloc_error = array_init_reserve( PWS_Token, allocator, u64(left / 2) )
     if alloc_error != AllocatorError.None {
         ensure(false, "Failed to allocate token's array")
         return result, alloc_error
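
The CRLF change above swaps a backward read of memory before the buffer (the old `previous` pointer) for a bounds-checked forward peek. The same pattern in isolation, assuming head and left as in LexerData (an illustrative helper, not in the diff):

    is_crlf :: proc( head : [^]rune, left : i32 ) -> b32 {
        if left <= 1 {
            return false // '\r' is the last rune; nothing left to peek at
        }
        return head[0] == '\r' && head[1] == '\n'
    }
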
@@ -202,7 +205,7 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
     for ; left > 0;
     {
-        current : WS_Token
+        current : PWS_Token
         current.type = rune_type()
         current.line = line
         current.column = column
@@ -220,21 +223,21 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
     return result, alloc_error
 }
-ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_ParseResult, AllocatorError )
+pws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError )
 {
     ParseData :: struct {
-        using result : WS_ParseResult,
+        using result : PWS_ParseResult,
         left : u32,
-        head : [^]WS_Token,
-        line : WS_AST_Line,
+        head : [^]PWS_Token,
+        line : PWS_AST_Line,
     }
     using parser : ParseData
     context.user_ptr = & result
     //region Helper procs
-    peek_next :: proc() -> ( ^WS_Token)
+    peek_next :: proc() -> ( ^PWS_Token)
     {
         using self := context_ext( ParseData)
         if left - 1 == 0 {
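
Both pws_parser_lex and pws_parser_parse pass their state to the nested helper procs through Odin's implicit context. context_ext is a project helper not shown in this diff; presumably it is little more than a typed cast of context.user_ptr, along these lines:

    context_ext :: proc( $ Type : typeid ) -> (^Type) {
        return cast(^Type) context.user_ptr
    }
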
@@ -244,14 +247,14 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
         return head[ 1: ]
     }
-    check_next :: proc( expected : WS_TokenType ) -> b32 {
+    check_next :: proc( expected : PWS_TokenType ) -> b32 {
         using self := context_ext( ParseData)
         next := peek_next()
         return next != nil && next.type == expected
     }
-    advance :: proc( expected : WS_TokenType ) -> (^WS_Token)
+    advance :: proc( expected : PWS_TokenType ) -> (^PWS_Token)
     {
         using self := context_ext( ParseData)
         next := peek_next()
@@ -267,7 +270,7 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
     }
     //endregion Helper procs
-    lex, alloc_error := ws_parser_lex( content, allocator )
+    lex, alloc_error := pws_parser_lex( content, allocator )
     if alloc_error != AllocatorError.None {
     }
@@ -275,12 +278,12 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
     runes = lex.runes
     tokens = lex.tokens
-    nodes, alloc_error = array_init_reserve( WS_AST, allocator, WS_NodeArray_ReserveSize )
+    nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize )
     if alloc_error != AllocatorError.None {
     }
-    lines, alloc_error = array_init_reserve( ^WS_AST_Line, allocator, WS_LineArray_RserveSize )
+    lines, alloc_error = array_init_reserve( ^PWS_AST_Line, allocator, PWS_LineArray_RserveSize )
     if alloc_error != AllocatorError.None {
     }
@@ -290,22 +293,17 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
     // Parse Line
     for ; left > 0;
     {
-        parse_content :: proc( $ Type : typeid, tok_type : WS_TokenType ) -> Type
+        parse_content :: proc( $ Type : typeid, tok_type : PWS_TokenType ) -> Type
         {
             using self := context_ext( ParseData)
-            ast : Type
-            start := head
-            end : [^]WS_Token
-            for ; check_next( WS_TokenType.Visible ); {
-                end = advance( tok_type )
-            }
-            ast.content = slice_ptr( start, ptr_sub( end, start ))
+            ast : Type
+            ast.content = cast( ^PWS_Token) head
+            advance( tok_type )
             return ast
         }
-        add_node :: proc( ast : WS_AST ) //-> ( should_return : b32 )
+        add_node :: proc( ast : PWS_AST ) //-> ( should_return : b32 )
        {
             using self := context_ext( ParseData)
@@ -313,42 +311,42 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
             array_append( & nodes, ast )
             if line.first == nil {
-                line.first = array_back( & nodes )
+                line.first = array_back( nodes )
             }
             else
             {
-                line.last = array_back( & nodes)
+                line.last = array_back( nodes)
             }
         }
         // TODO(Ed) : Harden this
         #partial switch head[0].type
         {
-            case WS_TokenType.Visible:
+            case PWS_TokenType.Visible:
             {
-                ast := parse_content( WS_AST_Visible, WS_TokenType.Visible )
+                ast := parse_content( PWS_AST_Visible, PWS_TokenType.Visible )
                 add_node( ast )
             }
-            case WS_TokenType.Space:
+            case PWS_TokenType.Space:
             {
-                ast := parse_content( WS_AST_Visible, WS_TokenType.Space )
+                ast := parse_content( PWS_AST_Visible, PWS_TokenType.Space )
                 add_node( ast )
             }
-            case WS_TokenType.Tab:
+            case PWS_TokenType.Tab:
             {
-                ast := parse_content( WS_AST_Tabs, WS_TokenType.Tab )
+                ast := parse_content( PWS_AST_Tabs, PWS_TokenType.Tab )
                 add_node( ast )
             }
-            case WS_TokenType.New_Line:
+            case PWS_TokenType.New_Line:
             {
                 line.end_token = head
-                ast : WS_AST
+                ast : PWS_AST
                 ast = line
                 // TODO(Ed) : Harden This
                 array_append( & nodes, ast )
-                array_append( & lines, & array_back( & nodes).(WS_AST_Line) )
+                array_append( & lines, & array_back(nodes).(PWS_AST_Line) )
                 line = {}
             }
         }
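
For completeness, invoking the renamed entry point; a usage sketch only, with error handling in the same style as the diff:

    sample := "\tif x {\r\n\t\treturn\r\n\t}\n"
    result, error := pws_parser_parse( sample, context.allocator )
    if error != AllocatorError.None {
        ensure(false, "Failed to parse the sample")
    }
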