2024-03-02 18:19:27 -05:00
|
|
|
/* Parser: Whitespace
|
|
|
|
This is a prototype parser meant to only parse whitespace from visible blocks of code.
|
2024-03-10 10:31:21 -04:00
|
|
|
It's meant to be the most minimal useful AST with coupling to traditional text file formatting.
|
2024-03-02 18:19:27 -05:00
|
|
|
|
2024-03-05 10:17:27 -05:00
|
|
|
All symbols related directly to the parser are prefixed with the PWS_ namespace.
|
2024-03-02 18:19:27 -05:00
|
|
|
|
|
|
|
The AST is composed of the following node types:
|
|
|
|
* Visible
|
|
|
|
* Spaces
|
|
|
|
* Tabs
|
|
|
|
* Line
|
|
|
|
|
|
|
|
AST_Visible tracks a slice of visible codepoints.
|
|
|
|
It tracks its neighboring ASTs (left or right), which should always be Spaces or Tabs.
|
|
|
|
|
|
|
|
AST_Spaces tracks a slice of singular or consecutive Spaces.
|
|
|
|
Neighboring ASTs should be either Visible or Tabs.
|
|
|
|
|
|
|
|
AST_Tabs tracks a slice of singular or consecutive Tabs.
|
|
|
|
Neighboring ASTS should be either Visible or Spaces.
|
|
|
|
|
|
|
|
AST_Line tracks a slice of AST nodes of Visible, Spaces, or Tabs that terminate with a New-Line token.
|
|
|
|
Neighboring ASTS are only Lines.
|
|
|
|
|
|
|
|
The ParseData struct will contain an Array of AST_Line. This represents the entire AST where the root is the first entry.
|
|
|
|
ASTs keep track of neighboring ASTs in a doubly-linked list pattern for ease of use.
|
|
|
|
This may be removed in the future for performance reasons,
|
|
|
|
since this is a prototype it will only be removed if there is a performance issue.
|
|
|
|
|
|
|
|
Because this parser is so primitive, it can only be
|
|
|
|
manually constructed via an AST editor or from parsed text.
|
|
|
|
So there is only a parser directly dealing with text.
|
|
|
|
|
|
|
|
If it's constructed from an AST editor, there will not be a referenceable content string or runes derived from that content string.
|
|
|
|
Instead the AST's content will directly contain the runes associated.
|
|
|
|
*/
|
|
|
|
package sectr
|
|
|
|
|
|
|
|
import "core:os"
|
|
|
|
|
|
|
|
// Code points the lexer treats as whitespace; every other rune is lexed as 'visible'.
Rune_Space           :: ' '
Rune_Tab             :: '\t'
Rune_Carriage_Return :: '\r' // Only expected as the first half of a CRLF pair.
Rune_Line_Feed       :: '\n'

// Not supported yet.
// Rune_Tab_Vertical :: '\v'
|
|
|
|
|
2024-03-05 10:17:27 -05:00
|
|
|
// Classification assigned to each lexed token (see rune_type inside pws_parser_lex).
PWS_TokenType :: enum u32 {
	Invalid,     // Zero value; also returned for a carriage return seen before any other rune.
	Visible,     // Any rune that is not a space, tab, carriage return or line feed.
	Spaces,      // ' '
	Tabs,        // '\t'
	New_Line,    // '\n', or '\r' (treated as the start of a CRLF pair).
	End_Of_File, // Handled by the parser's switch; not produced by the lexer visible in this file.
	Count,       // Number of enumerators above; not a real token type.
}
|
|
|
|
|
2024-03-05 10:17:27 -05:00
|
|
|
// Output of pws_parser_lex: the flat token stream, in source order.
//
// TODO(Ed) : The runes and token arrays should be handled by a slab allocator
// This can grow in undeterministic ways, persistent will get very polluted otherwise.
PWS_LexResult :: struct {
	tokens : Array(PWS_Token),
}
|
|
|
|
|
2024-03-05 10:17:27 -05:00
|
|
|
// A single lexed token.
PWS_Token :: struct {
	type    : PWS_TokenType,
	// Position bookkeeping written by the lexer while scanning.
	line    : u32,
	column  : u32,
	// Interned text of the token (set via str_intern in the lexer).
	content : StrRunesPair,
}
|
|
|
|
|
2024-03-10 10:31:21 -04:00
|
|
|
// Kind of an AST node. Mirrors the node types listed in the file header.
PWS_AST_Type :: enum u32 {
	Invalid, // Zero value.
	Visible, // Slice of visible code points.
	Spaces,  // One or more consecutive spaces.
	Tabs,    // One or more consecutive tabs.
	Line,    // Container for the Visible/Spaces/Tabs nodes of one line.
	Count,   // Number of enumerators above; not a real node type.
}
|
|
|
|
|
2024-03-10 10:31:21 -04:00
|
|
|
// One node of the whitespace AST.
PWS_AST :: struct {
	// Linked-list links, promoted into this struct. The parser uses prev/next for
	// neighboring nodes and first/last for the children of a Line node.
	using links : DLL_NodeFull(PWS_AST),
	type        : PWS_AST_Type,

	// Copied from the token the node was built from.
	line        : u32,
	column      : u32,
	content     : StrRunesPair,
}
|
|
|
|
|
2024-03-05 10:17:27 -05:00
|
|
|
// A parse diagnostic: the offending token plus a message.
// NOTE(review): no code visible in this file populates these yet.
PWS_ParseError :: struct {
	token : ^PWS_Token,
	msg   : string,
}
|
|
|
|
|
2024-03-19 23:25:48 -04:00
|
|
|
// Fixed length of PWS_ParseResult.errors.
PWS_ParseError_Max         :: 32

// Initial reservations handed to array_init_reserve for the token, node and line arrays.
// NOTE(review): presumably these are element counts; the node value is expressed in
// Kilobyte units - confirm against array_init_reserve.
PWS_TokenArray_ReserveSize :: 128
PWS_NodeArray_ReserveSize  :: 32 * Kilobyte
PWS_LineArray_ReserveSize  :: 32
|
2024-03-02 18:19:27 -05:00
|
|
|
|
2024-03-05 10:17:27 -05:00
|
|
|
// Output of pws_parser_parse.
//
// TODO(Ed) : The ast arrays should be handled by a slab allocator dedicated to PWS_ASTs
// This can grow in undeterministic ways, persistent will get very polluted otherwise.
PWS_ParseResult :: struct {
	// Source text. NOTE(review): pws_parser_parse, as visible in this file, never assigns it.
	content : string,
	// Token stream taken over from the lexer.
	tokens  : Array(PWS_Token),
	// Backing storage for every AST node, children and lines alike.
	nodes   : Array(PWS_AST), // Nodes should be dumped in a pool.
	// Pointers to the Line nodes stored in `nodes`, in source order.
	lines   : Array( ^PWS_AST),
	errors  : [PWS_ParseError_Max] PWS_ParseError,
}
|
|
|
|
|
2024-03-10 10:31:21 -04:00
|
|
|
// Working state of pws_parser_lex. The lexer publishes a pointer to it through
// context.user_ptr so its nested helper procs can reach it.
PWS_LexerData :: struct {
	// Promoted so `tokens` can be used directly; this is what the lexer returns.
	using result : PWS_LexResult,

	// Text being lexed.
	content       : string,

	// Rune seen on the previous iteration, and the one being classified now.
	previous_rune : rune,
	current_rune  : rune,
	// Type assigned to the previous rune.
	previous      : PWS_TokenType,

	// Running position counters.
	line          : u32,
	column        : u32,

	// Byte offset into `content` where the pending token starts, and its running length.
	start         : int,
	length        : int,

	// Token under construction; appended to `tokens` by make_token.
	current       : PWS_Token,
}
|
2024-03-02 18:19:27 -05:00
|
|
|
|
2024-03-10 10:31:21 -04:00
|
|
|
// Lexes `text` into a flat stream of Visible / Spaces / Tabs / New_Line tokens.
//
// Consecutive runes of the same type are merged into one token. A line feed always
// ends the pending New_Line token unless it completes a CRLF pair, so "\n\n" yields two
// tokens and "\r\n" yields one.
//
// text      : Source to lex. An empty string trips an ensure and yields an empty result.
// allocator : Backs the token array.
//
// Returns the token stream and the first allocator error encountered (.None on success).
pws_parser_lex :: proc ( text : string, allocator : Allocator ) -> ( PWS_LexResult, AllocatorError )
{
	bytes := transmute([]byte) text
	log( str_fmt_tmp( "lexing: %v ...", (len(text) > 30 ? transmute(string) bytes[ :30] : text) ))

	profile(#procedure)
	using lexer : PWS_LexerData
	// Nested procs do not capture locals, so the lexer state is handed to them through the context.
	context.user_ptr = & lexer
	content          = text

	if len(text) == 0 {
		ensure( false, "Attempted to lex nothing")
		return result, .None
	}

	// Classifies a single code point. Reads previous_rune from the lexer state.
	rune_type :: proc( codepoint : rune ) -> PWS_TokenType
	{
		using self := context_ext( PWS_LexerData)

		switch codepoint
		{
			case Rune_Space:
				return PWS_TokenType.Spaces

			case Rune_Tab:
				return PWS_TokenType.Tabs

			case Rune_Line_Feed:
				return PWS_TokenType.New_Line

			// Support for CRLF format
			case Rune_Carriage_Return:
			{
				// A carriage return before any other rune has been seen is not accepted.
				if previous_rune == 0 {
					return PWS_TokenType.Invalid
				}

				// Assume for now its a new line
				return PWS_TokenType.New_Line
			}
		}

		// Everything that isn't the supported whitespace code points is considered 'visible'
		// Eventually we should support other types of whitespace
		return PWS_TokenType.Visible
	}

	alloc_error : AllocatorError
	// tokens, alloc_error = array_init_reserve( PWS_Token, allocator, Kilobyte * 4 )
	tokens, alloc_error = array_init_reserve( PWS_Token, allocator, PWS_TokenArray_ReserveSize )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to allocate token's array")
		return result, alloc_error
	}

	line   = 0
	column = 0

	// Finalizes the pending token, which spans content[start : byte_offset], appends it to
	// `tokens`, and starts the next token at `byte_offset`.
	make_token :: proc ( byte_offset : int ) -> AllocatorError
	{
		self := context_ext( PWS_LexerData); using self

		if previous_rune == Rune_Carriage_Return && current_rune != Rune_Line_Feed {
			ensure(false, "Rouge Carriage Return")
		}

		// The extent must be measured in bytes: offsets from string iteration are byte
		// offsets, so counting runes would truncate tokens containing multi-byte UTF-8.
		length       = byte_offset - start
		token_slice := content[ start : start + length ]

		current.content = str_intern( token_slice )

		start   = byte_offset
		length  = 0
		line   += cast(u32) (current.type == .New_Line)
		// NOTE(review): column restarts after every token (not only after New_Line), and
		// current.column is rewritten for every rune, so a token records the column of its
		// last rune relative to the token start. Left as-is; confirm what consumers expect.
		column  = 0

		return array_append( & tokens, current )
	}

	for codepoint, byte_offset in text
	{
		// Must run before previous_rune is updated below; rune_type inspects it.
		type := rune_type( codepoint )
		current_rune = codepoint

		type_changed      := current.type != type && previous != .Invalid
		line_feed_pending := previous_rune != Rune_Carriage_Return && current.type == .New_Line
		if type_changed || line_feed_pending
		{
			alloc_error = make_token( byte_offset )
			if alloc_error != AllocatorError.None {
				ensure(false, "Failed to append token to token array")
				return result, alloc_error
			}
		}

		current.type   = type
		current.line   = line
		current.column = column

		column       += 1
		previous      = current.type
		previous_rune = codepoint
	}

	// Flush the token still pending at the end of the text; it runs to the end of `text`.
	alloc_error = make_token( len(text) )
	if alloc_error != AllocatorError.None {
		ensure(false, "Failed to append token to token array")
	}
	return result, alloc_error
}
|
2024-03-02 18:19:27 -05:00
|
|
|
|
2024-03-10 10:31:21 -04:00
|
|
|
// Working state of pws_parser_parse, reached by its nested helper through context.user_ptr.
PWS_ParseData :: struct {
	// Must stay the first field: the parser has published the address of `result`
	// and read it back as a PWS_ParseData.
	using result : PWS_ParseResult,

	// Tokens not yet consumed, and a cursor to the next one.
	left      : u32,
	head      : [^]PWS_Token,

	// Line node being assembled; copied into `nodes` by eat_line, then reset.
	line      : PWS_AST,
	// Most recently completed Line node (points into `nodes`).
	prev_line : ^PWS_AST,
}
|
|
|
|
|
|
|
|
// Lexes `text` and builds the whitespace AST from the resulting tokens.
//
// Every token becomes a node in `nodes`. Each New_Line token closes the line being
// assembled: a Line node is appended to `nodes` and its address is recorded in `lines`.
// Nodes left over after the last New_Line are closed into a final, unterminated line.
//
// text      : Source to parse. An empty string trips an ensure and yields an empty result.
// allocator : Backs the token, node and line arrays.
//
// Returns the parse result and the last allocator error observed (allocation failures
// are also checked with verify).
//
// NOTE(review): node and line pointers refer to elements of nodes.data; presumably they
// would dangle if array_append ever relocates that storage - confirm (see TODO below).
pws_parser_parse :: proc( text : string, allocator : Allocator ) -> ( PWS_ParseResult, AllocatorError )
{
	bytes := transmute([]byte) text

	profile(#procedure)
	using parser : PWS_ParseData
	// eat_line reads this back as a PWS_ParseData, so publish the parser itself rather than
	// relying on `result` being its first field.
	context.user_ptr = & parser

	if len(text) == 0 {
		ensure( false, "Attempted to lex nothing")
		return result, .None
	}

	lex, alloc_error := pws_parser_lex( text, allocator = allocator )
	verify( alloc_error == nil, "Allocation faiure in lex")

	tokens = lex.tokens

	log( str_fmt_tmp( "parsing: %v ...", (len(text) > 30 ? transmute(string) bytes[ :30] : text) ))

	// TODO(Ed): Change this to use a node pool
	nodes, alloc_error = array_init_reserve( PWS_AST, allocator, PWS_NodeArray_ReserveSize )
	verify( alloc_error == nil, "Allocation failure creating nodes array")

	parser.lines, alloc_error = array_init_reserve( ^PWS_AST, allocator, PWS_LineArray_ReserveSize )
	verify( alloc_error == nil, "Allocation failure creating line array")

	//region Helper procs

	// Closes the line being assembled: stores it in `nodes`, links it after the previous
	// line, leaves `prev_line` pointing at it, and resets `line` for the next one.
	eat_line :: #force_inline proc()
	{
		self := context_ext( PWS_ParseData); using self

		line.type = .Line
		if left > 0 {
			// Called from the token loop: head[0] is the New_Line token ending this line.
			tok := cast( ^PWS_Token) head
			line.line    = tok.line
			line.column  = tok.column
			line.content = tok.content
		}
		else {
			// Called after the loop: head is one past the last token and must not be read.
			// Take the position from the final token; there is no terminator, so content stays empty.
			tok := & tokens.data[ tokens.num - 1 ]
			line.line   = tok.line
			line.column = tok.column
		}

		alloc_error := array_append( & nodes, line )
		verify( alloc_error == nil, "Allocation failure appending node")
		node := & nodes.data[ nodes.num - 1 ]

		// TODO(Ed): Review this with multiple line test
		dll_push_back( & prev_line, node )
		prev_line = node

		// Debug build compile error
		// alloc_error = array_append( & lines, prev_line )
		// verify( alloc_error == nil, "Allocation failure appending node")

		line = {}
	}
	//endregion

	head = & tokens.data[0]
	left = u32(tokens.num)

	// Parse Line
	for ; left > 0;
	{
		type : PWS_AST_Type
		#partial switch head[0].type
		{
			case .Tabs:
				type = .Tabs

			case .Spaces:
				type = .Spaces

			case .Visible:
				type = .Visible

			case .New_Line:
				eat_line()

				alloc_error = array_append( & parser.lines, prev_line )
				verify( alloc_error == nil, "Allocation failure appending node")

			case PWS_TokenType.End_Of_File:
		}

		// NOTE(review): `type` is never set to .Line, so this is always true and New_Line
		// (and End_Of_File) tokens also emit a node of type Invalid, which becomes the first
		// node of the following line. Kept as-is since it defines the AST consumers see today.
		if type != .Line
		{
			tok := cast( ^PWS_Token) head
			ast : PWS_AST
			ast.type    = type
			ast.line    = tok.line
			ast.column  = tok.column
			ast.content = tok.content

			// Compiler Error (-Debug)
			// prev_node = array_back( nodes )
			prev_node : ^PWS_AST = nil
			if nodes.num > 0 {
				prev_node = & nodes.data[ nodes.num - 1 ]
			}

			append_error := array_append( & nodes, ast )
			verify( append_error == nil, "Allocation failure appending node")

			node := & nodes.data[ nodes.num - 1 ]

			// dll_push_back( & prev_node, last_node )
			// Link to whichever node precedes this one in `nodes`.
			if prev_node != nil
			{
				node.prev      = prev_node
				prev_node.next = node
			}

			// dll_fl_append( & line, last_node )
			if line.first == nil {
				line.first = node
			}
			line.last = node
		}

		head  = head[ 1:]
		left -= 1
	}

	// Close a final line that was not terminated by a New_Line token.
	if line.first != nil {
		eat_line()

		alloc_error = array_append( & parser.lines, prev_line )
		verify( alloc_error == nil, "Allocation failure appending node")
	}

	return result, alloc_error
}
|