SectrPrototype/code/parser_whitespace.odin

/* Parser: Whitespace

This is a prototype parser meant to only parse whitespace from visible blocks of code.
It's meant to be the most minimal useful AST for bootstrapping an AST editor.

All symbols related directly to the parser are prefixed with the PWS_ namespace.

The AST is composed of the following node types:
* Visible
* Spaces
* Tabs
* Line

AST_Visible tracks a slice of visible codepoints.
Its neighboring ASTs (left or right) should always be Spaces or Tabs.

AST_Spaces tracks a slice of singular or consecutive spaces.
Neighboring ASTs should be either Visible or Tabs.

AST_Tabs tracks a slice of singular or consecutive tabs.
Neighboring ASTs should be either Visible or Spaces.

AST_Line tracks a slice of AST nodes (Visible, Spaces, or Tabs) that terminates with a New-Line token.
Neighboring ASTs are only Lines.

The parse result (PWS_ParseResult) contains an Array of AST_Line. This represents the entire AST, where the root is the first entry.

ASTs keep track of neighboring ASTs in a doubly-linked list pattern for ease of use.
This may be removed in the future for performance reasons;
since this is a prototype, it will only be removed if there is a performance issue.

Because this parser is so primitive, it can only be
manually constructed via an AST editor or from parsed text.
So there is only a parser directly dealing with text.

If it's constructed from an AST editor, there will not be a content string to reference
or runes derived from that content string.
Instead the AST's content will directly contain the associated runes.
*/
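
/* Worked example (illustrative sketch, not derived from a real run):

The text "\tx := 5\n" lexes into the token sequence

	Tab, Visible ("x"), Space, Visible (":="), Space, Visible ("5"), New_Line

since consecutive runes of the same kind collapse into one token.
Per the node types above, the corresponding AST is a single AST_Line whose
children, left to right, are

	Tabs, Visible, Spaces, Visible, Spaces, Visible

with the line's end_token pointing at the New_Line token.
*/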
package sectr
import "core:os"
Rune_Space :: ' '
Rune_Tab :: '\t'
Rune_Carriage_Return :: '\r'
Rune_New_Line :: '\n'
// Rune_Tab_Vertical :: '\v'

PWS_TokenType :: enum u32 {
	Invalid,
	Visible,
	Space,
	Tab,
	New_Line,
	Count,
}

// TODO(Ed) : The runes and token arrays should be handled by a slab allocator
// This can grow in non-deterministic ways; persistent will get very polluted otherwise.
PWS_LexResult :: struct {
	allocator : Allocator,
	content : string,
	runes : []rune,
	tokens : Array(PWS_Token),
}

PWS_Token :: struct {
	type : PWS_TokenType,
	line, column : u32,
	ptr : ^rune,
}

PWS_AST_Content :: union #no_nil {
	^PWS_Token,
	[] rune,
}

PWS_AST_Spaces :: struct {
	content : PWS_AST_Content,

	using links : DLL_NodePN(PWS_AST),
}

PWS_AST_Tabs :: struct {
	content : PWS_AST_Content,

	using links : DLL_NodePN(PWS_AST),
}

PWS_AST_Visible :: struct {
	content : PWS_AST_Content,

	using links : DLL_NodePN(PWS_AST),
}

PWS_AST_Line :: struct {
	using content : DLL_NodeFL(PWS_AST),
	end_token : ^PWS_Token,

	using links : DLL_NodePN(PWS_AST),
}

PWS_AST :: union #no_nil {
	PWS_AST_Visible,
	PWS_AST_Spaces,
	PWS_AST_Tabs,
	PWS_AST_Line,
}

PWS_ParseError :: struct {
	token : ^PWS_Token,
	msg : string,
}

PWS_ParseError_Max :: 32
PWS_NodeArray_ReserveSize :: Kilobyte * 4
PWS_LineArray_RserveSize :: Kilobyte

// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs
// This can grow in non-deterministic ways; persistent will get very polluted otherwise.
PWS_ParseResult :: struct {
	content : string,
	runes : []rune,
	tokens : Array(PWS_Token),
	nodes : Array(PWS_AST),
	lines : Array(^PWS_AST_Line),
	errors : [PWS_ParseError_Max] PWS_ParseError,
}

// @(private="file")
// AST :: PWS_AST

pws_parser_lex :: proc ( content : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError )
{
	LexerData :: struct {
		using result : PWS_LexResult,

		head : [^] rune,
		left : i32,
		line : u32,
		column : u32,
	}
	using lexer : LexerData
	context.user_ptr = & lexer

	rune_type :: proc() -> PWS_TokenType
	{
		using self := context_ext( LexerData)

		switch (head[0])
		{
			case Rune_Space:
				return PWS_TokenType.Space

			case Rune_Tab:
				return PWS_TokenType.Tab

			case Rune_New_Line:
				return PWS_TokenType.New_Line

			// Support for CRLF format
			case Rune_Carriage_Return:
			{
				if left - 1 == 0 {
					return PWS_TokenType.Invalid
				}
				if head[1] == Rune_New_Line {
					return PWS_TokenType.New_Line
				}
			}
		}

		// Everything that isn't the supported whitespace code points is considered 'visible'
		// Eventually we should support other types of whitespace
		return PWS_TokenType.Visible
	}

	advance :: proc() -> PWS_TokenType {
		using self := context_ext( LexerData)

		head = head[1:]
		left -= 1
		column += 1
		if left <= 0 {
			// Nothing left to classify; avoid reading past the end of the rune slice.
			return PWS_TokenType.Invalid
		}
		type := rune_type()
		line += u32(type == PWS_TokenType.New_Line)
		return type
	}

	alloc_error : AllocatorError
	runes, alloc_error = to_runes( content, allocator )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to allocate runes from content")
		return result, alloc_error
	}

	left = cast(i32) len(runes)
	head = & runes[0]

	tokens, alloc_error = array_init_reserve( PWS_Token, allocator, u64(left / 2) )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to allocate token's array")
		return result, alloc_error
	}

	line = 0
	column = 0
	for ; left > 0;
	{
		current : PWS_Token
		current.type = rune_type()
		current.line = line
		current.column = column

		// Consume consecutive runes of the same type into a single token
		for ; advance() == current.type; {
		}

		alloc_error = array_append( & tokens, current )
		if alloc_error != AllocatorError.None {
			ensure(false, "Failed to append token to token array")
			return result, alloc_error
		}
	}

	return result, alloc_error
}
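
/* Usage sketch (illustrative only; assumes the caller's allocator is compatible
   with the Allocator type this package uses):

	lexed, lex_err := pws_parser_lex( "\tx := 5\n", context.allocator )
	if lex_err != AllocatorError.None {
		// handle allocation failure
	}
	// lexed.tokens now holds one PWS_Token per run of same-typed runes.
*/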

pws_parser_parse :: proc( content : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError )
{
	ParseData :: struct {
		using result : PWS_ParseResult,

		left : u32,
		head : [^]PWS_Token,
		line : PWS_AST_Line,
	}
	using parser : ParseData
	context.user_ptr = & parser

	//region Helper procs
	peek_next :: proc() -> ( ^PWS_Token)
	{
		using self := context_ext( ParseData)

		if left - 1 == 0 {
			return nil
		}
		return head[ 1: ]
	}

	check_next :: proc( expected : PWS_TokenType ) -> b32 {
		using self := context_ext( ParseData)

		next := peek_next()
		return next != nil && next.type == expected
	}

	advance :: proc( expected : PWS_TokenType ) -> (^PWS_Token)
	{
		using self := context_ext( ParseData)

		next := peek_next()
		if next == nil {
			return nil
		}
		if next.type != expected {
			ensure( false, "Didn't get expected token type from next in lexed" )
			return nil
		}
		head = next
		return head
	}
	//endregion Helper procs

	lex, alloc_error := pws_parser_lex( content, allocator )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to lex the content")
		return result, alloc_error
	}

	runes = lex.runes
	tokens = lex.tokens

	nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to allocate the nodes array")
		return result, alloc_error
	}

	lines, alloc_error = array_init_reserve( ^PWS_AST_Line, allocator, PWS_LineArray_RserveSize )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to allocate the lines array")
		return result, alloc_error
	}

	head = & tokens.data[0]

	// Parse Line
	for ; left > 0;
	{
		// Wraps the current token in an AST node of the given type and advances past it
		parse_content :: proc( $ Type : typeid, tok_type : PWS_TokenType ) -> Type
		{
			using self := context_ext( ParseData)

			ast : Type
			ast.content = cast( ^PWS_Token) head
			advance( tok_type )
			return ast
		}

		// Appends the node to the nodes array and tracks it as the current line's first/last child
		add_node :: proc( ast : PWS_AST ) //-> ( should_return : b32 )
		{
			using self := context_ext( ParseData)

			// TODO(Ed) : Harden this
			array_append( & nodes, ast )

			if line.first == nil {
				line.first = array_back( nodes )
			}
			else
			{
				line.last = array_back( nodes )
			}
		}

		// TODO(Ed) : Harden this
		#partial switch head[0].type
		{
			case PWS_TokenType.Visible:
			{
				ast := parse_content( PWS_AST_Visible, PWS_TokenType.Visible )
				add_node( ast )
			}
			case PWS_TokenType.Space:
			{
				ast := parse_content( PWS_AST_Spaces, PWS_TokenType.Space )
				add_node( ast )
			}
			case PWS_TokenType.Tab:
			{
				ast := parse_content( PWS_AST_Tabs, PWS_TokenType.Tab )
				add_node( ast )
			}
			case PWS_TokenType.New_Line:
			{
				// Close out the current line and start a fresh one
				line.end_token = head

				ast : PWS_AST
				ast = line

				// TODO(Ed) : Harden This
				array_append( & nodes, ast )
				array_append( & lines, & array_back(nodes).(PWS_AST_Line) )
				line = {}
			}
		}
	}

	return result, alloc_error
}
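
/* Usage sketch (illustrative only; the input string and allocator here are placeholders):

	parsed, parse_err := pws_parser_parse( "left\t right\n", context.allocator )
	if parse_err != AllocatorError.None {
		// handle allocation failure
	}
	// parsed.lines is intended to hold one ^PWS_AST_Line per parsed line, in document
	// order, with each line linking its Visible/Spaces/Tabs children via first/last.
*/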