// SectrPrototype/code/parser_whitespace.odin
/* Parser: Whitespace

This is a prototype parser meant to only parse whitespace from visible blocks of code.
It is meant to be the most minimal useful AST with coupling to traditional text file formatting.

All symbols related directly to the parser are prefixed with the PWS_ namespace.

The AST is composed of the following node types:
* Visible
* Spaces
* Tabs
* Line

AST_Visible tracks a slice of visible codepoints.
It tracks neighboring ASTs (left or right) which should always be Spaces or Tabs.

AST_Spaces tracks a slice of singular or consecutive Spaces.
Neighboring ASTs should be either Visible or Tabs.

AST_Tabs tracks a slice of singular or consecutive Tabs.
Neighboring ASTs should be either Visible or Spaces.

AST_Line tracks a slice of AST nodes of Visible, Spaces, or Tabs that terminate with a New-Line token.
Neighboring ASTs are only Lines.

The ParseData struct will contain an Array of AST_Line. This represents the entire AST where the root is the first entry.

ASTs keep track of neighboring ASTs in a double-linked-list pattern for ease of use.
This may be removed in the future for performance reasons;
since this is a prototype, it will only be removed if there is a performance issue.

Because this parser is so primitive, it can only be
manually constructed via an AST editor or from parsed text.
So there is only a parser directly dealing with text.

If it is constructed from an AST editor, there will not be a content string referencable or runes derived from that content string.
Instead the AST's content will directly contain the runes associated.
*/
package sectr

import "core:os"

// Whitespace code points recognized by the lexer.
Rune_Space           :: ' '
Rune_Tab             :: '\t'
Rune_Carriage_Return :: '\r'
Rune_Line_Feed       :: '\n'

// Rune_Tab_Vertical :: '\v'
// Token classification produced by pws_parser_lex.
PWS_TokenType :: enum u32 {
	Invalid,
	Visible,     // Run of non-whitespace code points
	Spaces,      // Run of ' '
	Tabs,        // Run of '\t'

	New_Line,    // '\n', or "\r\n" (CRLF collapses to one New_Line token)
	End_Of_File,

	Count,
}
// TODO(Ed) : The runes and token arrays should be handled by a slab allocator
// This can grow in undeterministic ways, persistent will get very polluted otherwise.

// Result of lexing: the token stream produced from the text.
PWS_LexResult :: struct {
	tokens : Array(PWS_Token),
}
// A single lexed token: classification, 0-based source position, and
// interned content.
PWS_Token :: struct {
	type         : PWS_TokenType,
	line, column : u32,
	content      : StringCached,
}
// AST node classification. Mirrors PWS_TokenType, with Line grouping a
// run of nodes terminated by a new-line.
PWS_AST_Type :: enum u32 {
	Invalid,
	Visible,
	Spaces,
	Tabs,
	Line,
	Count,
}
// AST node. Neighbor links form a doubly-linked list (see file header);
// for Line nodes, links.first/links.last span the line's child nodes.
PWS_AST :: struct {
	using links : DLL_NodeFull(PWS_AST),

	type         : PWS_AST_Type,
	line, column : u32,
	content      : StringCached,
}
// A parse diagnostic: the offending token and a human-readable message.
PWS_ParseError :: struct {
	token : ^PWS_Token,
	msg   : string,
}
PWS_ParseError_Max        :: 32
PWS_NodeArray_ReserveSize :: Kilobyte * 4
// NOTE(review): "Rserve" is a typo of "Reserve"; name kept as-is for
// compatibility with existing references.
PWS_LineArray_RserveSize  :: Kilobyte
// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs
// This can grow in undeterministic ways, persistent will get very polluted otherwise.

// Result of parsing: the source text, its token stream, the AST node pool,
// and one entry in `lines` per parsed line.
PWS_ParseResult :: struct {
	content : string,
	tokens  : Array(PWS_Token),
	nodes   : Array(PWS_AST), // Nodes should be dumped in a pool.
	lines   : Array( ^PWS_AST),
	errors  : [PWS_ParseError_Max] PWS_ParseError,
}
// Working state for pws_parser_lex; extends the lex result via `using`.
PWS_LexerData :: struct {
using result : PWS_LexResult,
content : string, // Full text being lexed; token content slices point into this
previous_rune : rune, // Last code point consumed (zero before the first rune)
previous : PWS_TokenType, // Classification assigned on the previous iteration
line : u32, // Current 0-based line
column : u32, // Current 0-based column
start : int, // Byte offset where the in-progress token begins
length : int, // In-progress token length, incremented once per rune — NOTE(review): used as a byte length when slicing; confirm behavior for multibyte input
current : PWS_Token, // Token being accumulated
}
/* Lexes text into whitespace-delimited tokens.

Walks the text rune by rune, grouping consecutive runes of the same
classification (Visible, Spaces, Tabs) into a single token. A new-line
('\n', or a "\r\n" pair) always terminates the current token.

Returns the token stream and any allocator error hit while appending.
*/
pws_parser_lex :: proc ( text : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError )
{
	using lexer : PWS_LexerData
	context.user_ptr = & lexer
	content = text

	if len(text) == 0 {
		ensure( false, "Attempted to lex nothing")
		return result, .None
	}

	// Classifies a single code point.
	rune_type :: proc( codepoint : rune ) -> PWS_TokenType
	{
		using self := context_ext( PWS_LexerData)

		switch codepoint
		{
			case Rune_Space:
				return PWS_TokenType.Spaces

			case Rune_Tab:
				return PWS_TokenType.Tabs

			case Rune_Line_Feed:
				return PWS_TokenType.New_Line

			// Support for CRLF format
			case Rune_Carriage_Return:
			{
				// A CR as the very first rune has no preceding context; flag it.
				if previous_rune == 0 {
					return PWS_TokenType.Invalid
				}
				// Assume for now its a new line
				return PWS_TokenType.New_Line
			}
		}

		// Everything that isn't the supported whitespace code points is considered 'visible'
		// Eventually we should support other types of whitespace
		return PWS_TokenType.Visible
	}

	alloc_error : AllocatorError
	tokens, alloc_error = array_init_reserve( PWS_Token, allocator, u64( len(text)) )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to allocate token's array")
		return result, alloc_error
	}

	line   = 0
	column = 0

	// Finalizes the in-progress token: interns its content slice and appends
	// it to the token array, then resets accumulation state at byte_offset.
	make_token :: proc ( codepoint : rune, byte_offset : int ) -> AllocatorError
	{
		self := context_ext( PWS_LexerData); using self

		if previous_rune == Rune_Carriage_Return && codepoint != Rune_Line_Feed {
			ensure(false, "Rouge Carriage Return")
		}

		start_ptr   := uintptr( raw_data(content)) + uintptr(start)
		token_slice := transmute(string) byte_slice( rawptr(start_ptr), length )

		current.content = str_intern( token_slice )

		start   = byte_offset
		length  = 0
		line   += cast(u32) (current.type == .New_Line)
		column  = 0

		return array_append( & tokens, current )
	}

	last_byte_offset : int
	for codepoint, byte_offset in text
	{
		type := rune_type( codepoint )

		// Emit the accumulated token when the classification changes, or
		// unconditionally after a new-line.
		if (current.type != type && previous != .Invalid) || current.type == .New_Line
		{
			alloc_error = make_token( previous_rune, byte_offset )
			if alloc_error != AllocatorError.None {
				ensure(false, "Failed to append token to token array")
				// Fix: return the result subtype (the original returned `lexer`,
				// whose type does not match the declared return type).
				return result, alloc_error
			}
		}

		current.type   = type
		current.line   = line
		current.column = column

		column          += 1
		length          += 1
		previous         = current.type
		previous_rune    = codepoint
		last_byte_offset = byte_offset
	}

	// Flush the final in-progress token.
	// Fix: capture the allocator error instead of discarding it.
	alloc_error = make_token( previous_rune, last_byte_offset )

	return result, alloc_error
}
// Working state for pws_parser_parse; extends the parse result via `using`.
PWS_ParseData :: struct {
	using result : PWS_ParseResult,

	left      : u32,          // Tokens remaining to consume
	head      : [^]PWS_Token, // Cursor into the token stream
	line      : PWS_AST,      // Line node currently being accumulated
	prev_line : ^PWS_AST,     // Last completed Line node (for sibling linking)
}
/* Parses text into the whitespace AST described in the file header.

Lexes the text, then walks the token stream building one leaf node per
token (Visible/Spaces/Tabs) and grouping them under Line nodes. Each
completed line is appended to result.lines; a trailing line with no
terminating new-line is flushed after the loop.
*/
pws_parser_parse :: proc( text : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError )
{
	using parser : PWS_ParseData
	context.user_ptr = & result

	if len(text) == 0 {
		ensure( false, "Attempted to lex nothing")
		return result, .None
	}

	lex, alloc_error := pws_parser_lex( text, allocator = allocator )
	verify( alloc_error == nil, "Allocation faiure in lex")

	tokens = lex.tokens

	nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize )
	verify( alloc_error == nil, "Allocation failure creating nodes array")

	lines, alloc_error = array_init_reserve( ^PWS_AST, allocator, PWS_LineArray_RserveSize )
	verify( alloc_error == nil, "Allocation failure creating line array")

	//region Helper procs

	// Finalizes the accumulated line: stamps its type and position, appends
	// it to the node pool, links it after the previous line, and resets the
	// accumulator.
	eat_line :: proc()
	{
		self := context_ext( PWS_ParseData); using self

		// Fix: stamp the accumulated `line` node itself. The original built a
		// separate `ast` local from the token and then discarded it, appending
		// `line` with an Invalid type and no position.
		line.type = .Line
		if line.first != nil {
			// Position the line at its first child. This also avoids reading
			// `head` when flushing the final line, where `head` points one
			// past the end of the token array.
			line.line   = line.first.line
			line.column = line.first.column
		}
		else {
			// Empty line: only the terminating new-line token exists.
			tok := cast( ^PWS_Token) head
			line.line    = tok.line
			line.column  = tok.column
			line.content = tok.content
		}

		alloc_error := array_append( & nodes, line )
		verify( alloc_error == nil, "Allocation failure appending node")

		node := & nodes.data[ nodes.num - 1 ]

		// TODO(Ed): Review this with multiple line test
		dll_push_back( & prev_line, node )
		prev_line = node

		// Debug build compile error
		// alloc_error = array_append( & lines, prev_line )
		// verify( alloc_error == nil, "Allocation failure appending node")

		line = {}
	}
	//endregion

	head = & tokens.data[0]
	left = u32(tokens.num)

	// Parse Line
	for ; left > 0;
	{
		type : PWS_AST_Type
		#partial switch head[0].type
		{
			case .Tabs:
				type = .Tabs

			case .Spaces:
				type = .Spaces

			case .Visible:
				type = .Visible

			case .New_Line:
			{
				// Fix: mark the token as consumed by the line. The original
				// left `type` as .Invalid here, so the leaf-node block below
				// also emitted a spurious Invalid node for every new-line.
				type = .Line

				eat_line()
				alloc_error = array_append( & lines, prev_line )
				verify( alloc_error == nil, "Allocation failure appending node")
			}

			case PWS_TokenType.End_Of_File:
				// No node emitted; NOTE(review): the lexer currently never
				// produces this token type.
		}

		// Non-line tokens become leaf nodes appended to the current line.
		if type != .Line
		{
			tok := cast( ^PWS_Token) head

			ast : PWS_AST
			ast.type    = type
			ast.line    = tok.line
			ast.column  = tok.column
			ast.content = tok.content

			// Compiler Error (-Debug)
			// prev_node = array_back( nodes )
			prev_node : ^PWS_AST = nil
			if nodes.num > 0 {
				prev_node = & nodes.data[ nodes.num - 1 ]
			}

			alloc_error := array_append( & nodes, ast )
			verify( alloc_error == nil, "Allocation failure appending node")

			node := & nodes.data[ nodes.num - 1 ]

			// dll_push_back( & prev_node, last_node )
			if prev_node != nil
			{
				node.prev      = prev_node
				prev_node.next = node
			}

			// dll_fl_append( & line, last_node )
			if line.first == nil {
				line.first = node
				line.last  = node
			}
			else {
				line.last = node
			}
		}

		head  = head[ 1:]
		left -= 1
	}

	// Flush a trailing line that had content but no terminating new-line.
	if line.first != nil {
		eat_line()
		alloc_error = array_append( & lines, prev_line )
		verify( alloc_error == nil, "Allocation failure appending node")
	}

	return result, alloc_error
}