Iteration on whitespace parser
parent 218af644d6
commit fb6d66140e
@@ -2,7 +2,7 @@
 This is a prototype parser meant to only parse whitespace from visible blocks of code.
 It's meant to be the most minimal useful AST for bootstrapping an AST Editor.
 
-All symbols related directly to the parser are prefixed with the WS_ namespace.
+All symbols related directly to the parser are prefixed with the PWS_ namespace.
 
 The AST is composed of the following node types:
 * Visible
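For orientation, a hypothetical decomposition of one source line under these node types (illustrative only, not taken from the diff; it assumes the lexer coalesces consecutive runes of the same class into a single token):

// Assumed decomposition of the line "\tx := 5\n":
//
//   "\t"  -> PWS_AST_Tabs     (run of tab runes)
//   "x"   -> PWS_AST_Visible  (run of non-whitespace runes)
//   " "   -> PWS_AST_Spaces
//   ":="  -> PWS_AST_Visible
//   " "   -> PWS_AST_Spaces
//   "5"   -> PWS_AST_Visible
//   '\n'  -> New_Line token; closes the enclosing PWS_AST_Line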
@@ -44,7 +44,7 @@ Rune_Carriage_Return :: '\r'
 Rune_New_Line :: '\n'
 // Rune_Tab_Vertical :: '\v'
 
-WS_TokenType :: enum u32 {
+PWS_TokenType :: enum u32 {
     Invalid,
     Visible,
     Space,
@@ -53,85 +53,85 @@ WS_TokenType :: enum u32 {
     Count,
 }
 
-// TODO(Ed) : The runes and token arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The runes and token arrays should be handled by a slab allocator
 // This can grow in undeterministic ways, persistent will get very polluted otherwise.
-WS_LexResult :: struct {
+PWS_LexResult :: struct {
     allocator : Allocator,
     content : string,
     runes : []rune,
-    tokens : Array(WS_Token),
+    tokens : Array(PWS_Token),
 }
 
-WS_Token :: struct {
-    type : WS_TokenType,
+PWS_Token :: struct {
+    type : PWS_TokenType,
     line, column : u32,
     ptr : ^rune,
 }
 
-WS_AST_Content :: union #no_nil {
-    [] WS_Token,
+PWS_AST_Content :: union #no_nil {
+    ^PWS_Token,
     [] rune,
 }
 
-WS_AST_Spaces :: struct {
-    content : WS_AST_Content,
+PWS_AST_Spaces :: struct {
+    content : PWS_AST_Content,
 
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST_Tabs :: struct {
-    content : WS_AST_Content,
+PWS_AST_Tabs :: struct {
+    content : PWS_AST_Content,
 
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST_Visible :: struct {
-    content : WS_AST_Content,
+PWS_AST_Visible :: struct {
+    content : PWS_AST_Content,
 
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST_Line :: struct {
-    using content : DLL_NodeFL(WS_AST),
-    end_token : ^ WS_Token,
+PWS_AST_Line :: struct {
+    using content : DLL_NodeFL(PWS_AST),
+    end_token : ^ PWS_Token,
 
-    using links : DLL_NodePN(WS_AST),
+    using links : DLL_NodePN(PWS_AST),
 }
 
-WS_AST :: union #no_nil {
-    WS_AST_Visible,
-    WS_AST_Spaces,
-    WS_AST_Tabs,
-    WS_AST_Line,
+PWS_AST :: union #no_nil {
+    PWS_AST_Visible,
+    PWS_AST_Spaces,
+    PWS_AST_Tabs,
+    PWS_AST_Line,
 }
 
-WS_ParseError :: struct {
-    token : ^WS_Token,
+PWS_ParseError :: struct {
+    token : ^PWS_Token,
     msg : string,
 }
 
-WS_ParseError_Max :: 32
-WS_NodeArray_ReserveSize :: Kilobyte * 4
-WS_LineArray_RserveSize :: Kilobyte
+PWS_ParseError_Max :: 32
+PWS_NodeArray_ReserveSize :: Kilobyte * 4
+PWS_LineArray_RserveSize :: Kilobyte
 
-// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to ASTs
+// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs
 // This can grow in undeterministic ways, persistent will get very polluted otherwise.
-WS_ParseResult :: struct {
+PWS_ParseResult :: struct {
     content : string,
     runes : []rune,
-    tokens : Array(WS_Token),
-    nodes : Array(WS_AST),
-    lines : Array( ^WS_AST_Line),
-    errors : [WS_ParseError_Max] WS_ParseError,
+    tokens : Array(PWS_Token),
+    nodes : Array(PWS_AST),
+    lines : Array( ^PWS_AST_Line),
+    errors : [PWS_ParseError_Max] PWS_ParseError,
 }
 
 // @(private="file")
-// AST :: WS_AST
+// AST :: PWS_AST
 
-ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexResult, AllocatorError )
+pws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError )
 {
     LexerData :: struct {
-        using result : WS_LexResult,
+        using result : PWS_LexResult,
 
         head : [^] rune,
         left : i32,
@@ -141,57 +141,60 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
     using lexer : LexerData
     context.user_ptr = & lexer
 
-    rune_type :: proc() -> WS_TokenType
+    rune_type :: proc() -> PWS_TokenType
     {
         using self := context_ext( LexerData)
 
         switch (head[0])
         {
             case Rune_Space:
-                return WS_TokenType.Space
+                return PWS_TokenType.Space
 
             case Rune_Tab:
-                return WS_TokenType.Tab
+                return PWS_TokenType.Tab
 
             case Rune_New_Line:
-                return WS_TokenType.New_Line
+                return PWS_TokenType.New_Line
 
             // Support for CRLF format
             case Rune_Carriage_Return:
             {
-                previous := cast( ^ rune) (uintptr(head) - 1)
-                if (previous ^) == Rune_New_Line {
-                    return WS_TokenType.New_Line
+                if left - 1 == 0 {
+                    return PWS_TokenType.Invalid
                 }
+                if head[1] == Rune_New_Line {
+                    return PWS_TokenType.New_Line
+                }
             }
         }
 
         // Everything that isn't the supported whitespace code points is considered 'visible'
         // Eventually we should support other types of whitespace
-        return WS_TokenType.Visible
+        return PWS_TokenType.Visible
     }
 
-    advance :: proc() -> WS_TokenType {
+    advance :: proc() -> PWS_TokenType {
         using self := context_ext( LexerData)
 
         head = head[1:]
         left -= 1
         column += 1
         type := rune_type()
-        line += u32(type == WS_TokenType.New_Line)
+        line += u32(type == PWS_TokenType.New_Line)
         return type
     }
 
     alloc_error : AllocatorError
     runes, alloc_error = to_runes( content, allocator )
     if alloc_error != AllocatorError.None {
         ensure(false, "Failed to allocate runes from content")
         return result, alloc_error
     }
 
     left = cast(i32) len(runes)
     head = & runes[0]
 
-    tokens, alloc_error = array_init_reserve( WS_Token, allocator, u64(left / 2) )
+    tokens, alloc_error = array_init_reserve( PWS_Token, allocator, u64(left / 2) )
     if alloc_error != AllocatorError.None {
         ensure(false, "Failed to allocate token's array")
         return result, alloc_error
@@ -202,7 +205,7 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
 
     for ; left > 0;
     {
-        current : WS_Token
+        current : PWS_Token
         current.type = rune_type()
         current.line = line
         current.column = column
@@ -220,21 +223,21 @@ ws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( WS_LexRes
     return result, alloc_error
 }
 
-ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_ParseResult, AllocatorError )
+pws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError )
 {
     ParseData :: struct {
-        using result : WS_ParseResult,
+        using result : PWS_ParseResult,
 
         left : u32,
-        head : [^]WS_Token,
-        line : WS_AST_Line,
+        head : [^]PWS_Token,
+        line : PWS_AST_Line,
     }
 
     using parser : ParseData
     context.user_ptr = & result
 
     //region Helper procs
-    peek_next :: proc() -> ( ^WS_Token)
+    peek_next :: proc() -> ( ^PWS_Token)
     {
         using self := context_ext( ParseData)
         if left - 1 == 0 {
@@ -244,14 +247,14 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
         return head[ 1: ]
     }
 
-    check_next :: proc( expected : WS_TokenType ) -> b32 {
+    check_next :: proc( expected : PWS_TokenType ) -> b32 {
         using self := context_ext( ParseData)
 
         next := peek_next()
         return next != nil && next.type == expected
     }
 
-    advance :: proc( expected : WS_TokenType ) -> (^WS_Token)
+    advance :: proc( expected : PWS_TokenType ) -> (^PWS_Token)
     {
         using self := context_ext( ParseData)
         next := peek_next()
@@ -267,7 +270,7 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
     }
     //endregion Helper procs
 
-    lex, alloc_error := ws_parser_lex( content, allocator )
+    lex, alloc_error := pws_parser_lex( content, allocator )
     if alloc_error != AllocatorError.None {
 
     }
@@ -275,12 +278,12 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
     runes = lex.runes
     tokens = lex.tokens
 
-    nodes, alloc_error = array_init_reserve( WS_AST, allocator, WS_NodeArray_ReserveSize )
+    nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize )
     if alloc_error != AllocatorError.None {
 
     }
 
-    lines, alloc_error = array_init_reserve( ^WS_AST_Line, allocator, WS_LineArray_RserveSize )
+    lines, alloc_error = array_init_reserve( ^PWS_AST_Line, allocator, PWS_LineArray_RserveSize )
     if alloc_error != AllocatorError.None {
 
     }
@@ -290,22 +293,17 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
     // Parse Line
     for ; left > 0;
     {
-        parse_content :: proc( $ Type : typeid, tok_type : WS_TokenType ) -> Type
+        parse_content :: proc( $ Type : typeid, tok_type : PWS_TokenType ) -> Type
         {
             using self := context_ext( ParseData)
 
-            ast : Type
-            start := head
-            end : [^]WS_Token
-
-            for ; check_next( WS_TokenType.Visible ); {
-                end = advance( tok_type )
-            }
-            ast.content = slice_ptr( start, ptr_sub( end, start ))
+            ast : Type
+            ast.content = cast( ^PWS_Token) head
+            advance( tok_type )
             return ast
         }
 
-        add_node :: proc( ast : WS_AST ) //-> ( should_return : b32 )
+        add_node :: proc( ast : PWS_AST ) //-> ( should_return : b32 )
         {
             using self := context_ext( ParseData)
 
@@ -313,42 +311,42 @@ ws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( WS_Parse
             array_append( & nodes, ast )
 
             if line.first == nil {
-                line.first = array_back( & nodes )
+                line.first = array_back( nodes )
             }
             else
             {
-                line.last = array_back( & nodes)
+                line.last = array_back( nodes)
             }
         }
 
         // TODO(Ed) : Harden this
         #partial switch head[0].type
         {
-            case WS_TokenType.Visible:
+            case PWS_TokenType.Visible:
             {
-                ast := parse_content( WS_AST_Visible, WS_TokenType.Visible )
+                ast := parse_content( PWS_AST_Visible, PWS_TokenType.Visible )
                 add_node( ast )
             }
-            case WS_TokenType.Space:
+            case PWS_TokenType.Space:
             {
-                ast := parse_content( WS_AST_Visible, WS_TokenType.Space )
+                ast := parse_content( PWS_AST_Visible, PWS_TokenType.Space )
                 add_node( ast )
             }
-            case WS_TokenType.Tab:
+            case PWS_TokenType.Tab:
             {
-                ast := parse_content( WS_AST_Tabs, WS_TokenType.Tab )
+                ast := parse_content( PWS_AST_Tabs, PWS_TokenType.Tab )
                 add_node( ast )
             }
-            case WS_TokenType.New_Line:
+            case PWS_TokenType.New_Line:
             {
                 line.end_token = head
 
-                ast : WS_AST
+                ast : PWS_AST
                 ast = line
 
                 // TODO(Ed) : Harden This
                 array_append( & nodes, ast )
-                array_append( & lines, & array_back( & nodes).(WS_AST_Line) )
+                array_append( & lines, & array_back(nodes).(PWS_AST_Line) )
                 line = {}
             }
         }
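For context, a minimal usage sketch of the renamed entry point. The wrapper proc and sample string are hypothetical, and it assumes the package's Allocator and AllocatorError aliases are compatible with context.allocator, as the lexer's own calls suggest:

pws_parse_example :: proc() -> ( PWS_ParseResult, AllocatorError )
{
    // Hypothetical sample: tab indentation, visible text, trailing spaces, CRLF ending.
    sample := "\tx := 5  \r\n"

    // Lex + parse in one call; on success parse.lines holds one ^PWS_AST_Line per
    // New_Line-terminated line, each linking its Visible/Spaces/Tabs nodes.
    parse, error := pws_parser_parse( sample, context.allocator )
    if error != AllocatorError.None {
        ensure( false, "pws_parser_parse failed to allocate its arrays" )
    }
    return parse, error
}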