LangStudies/App/RDP/Scripts/Lexer.gd
Ed94 17c3b8fe36 SRegEx works!!!!
It's not a full-fledged transpiler, but it works at least on RDP's lexer. I can expand it on demand.
2022-07-17 07:32:57 -04:00

306 lines
7.1 KiB
GDScript

# Lexer script: turns program source text into a flat list of Token objects.
extends Object
# Instance of the SRegex script; compile_regex() calls its transpile() method
# to turn each SSpec entry into a regular-expression string.
var SRegEx = preload("res://RegM/Scripts/SRegex.gd").new()
class_name Lexer
# Token categories known to the lexer.
# Each key is a short identifier and each value a human-readable name. The values
# (not the keys) are what Spec and SSpec are keyed by, and what tokenize() stores
# in Token.Type.
# tokenize() tries the patterns in the order TokenType.values() yields them and
# takes the first one that matches at the cursor. Assuming values() follows
# declaration order, keywords must stay ahead of sym_Identifier, and op_Equality /
# op_CAssign ahead of op_Assign - do not reorder entries casually.
const TokenType : Dictionary = \
{
	# Comments
	cmt_SL = "Comment Single Line",
	cmt_ML = "Comment Multi-Line",
	# Formatting
	fmt_S = "Formatting String",
	# Delimiters
	delim_Comma = "Comma Delimiter",
	delim_SMR = "Symbol Member Resolution",
	# Statements
	def_End = "Statement End",
	def_BStart = "Block Start",
	def_BEnd = "Block End",
	def_Var = "Variable Declaration",
	def_Class = "Class",
	# Iteration
	def_While = "While",
	def_Do = "Do-While",
	def_For = "For",
	# Procedures
	def_Proc = "Procedure Declaration",
	def_Return = "Return",
	# Conditional
	def_If = "If Statement",
	def_Else = "Else Statement",
	# Expressions
	expr_PStart = "Parenthesis Start",
	expr_PEnd = "Parenthesis End",
	expr_SBStart = "Bracket Start",
	expr_SBEnd = "Bracket End",
	expr_New = "New Expression",
	expr_Super = "Super Expression",
	expr_Extends = "Class Extension",
	# Operators
	# Logical
	op_Relational = "Relational",
	op_Equality = "Equality",
	op_LAnd = "Logical And",
	op_LOr = "Logical Or",
	op_LNot = "Logical Not",
	# Arithmetic
	op_CAssign = "ComplexAssignment",
	op_Assign = "Assignment",
	op_Additive = "AdditiveOperator",
	op_Multiplicative = "MultiplicativeOperator",
	# Literals
	literal_BTrue = "True",
	literal_BFalse = "False",
	literal_Number = "Number",
	literal_String = "String",
	literal_Null = "Null Value",
	# Symbols
	sym_This = "This Reference",
	sym_Identifier = "User Identifier",
}
# Reference regular expression for every token type, keyed by TokenType value.
# Every pattern starts with '^' so it can only match at the beginning of the
# remaining source text.
# compile_regex() does not compile these strings directly: it asserts that the
# SRegEx transpilation of SSpec[type] is equal to Spec[type], so the two tables
# must be kept in sync character for character.
const Spec : Dictionary = \
{
	# Comments
	TokenType.cmt_SL : "^\\/\\/.*",
	TokenType.cmt_ML : "^\\/\\*[\\s\\S]*?\\*\\/",
	# Formatting
	TokenType.fmt_S : "^\\s+",
	# Delimiters
	TokenType.delim_Comma : "^,",
	TokenType.delim_SMR : "^\\.",
	# Statements
	TokenType.def_End : "^;",
	TokenType.def_BStart : "^{",
	TokenType.def_BEnd : "^}",
	TokenType.def_Var : "^\\blet\\b",
	TokenType.def_Class : "^\\bclass\\b",
	# Iteration
	TokenType.def_While : "^\\bwhile\\b",
	TokenType.def_Do : "^\\bdo\\b",
	TokenType.def_For : "^\\bfor\\b",
	# Procedures
	TokenType.def_Proc : "^\\bdef\\b",
	TokenType.def_Return : "^\\breturn\\b",
	# Conditional
	TokenType.def_If : "^\\bif\\b",
	TokenType.def_Else : "^\\belse\\b",
	# Expressions
	TokenType.expr_PStart : "^\\(",
	TokenType.expr_PEnd : "^\\)",
	TokenType.expr_SBStart : "^\\[",
	TokenType.expr_SBEnd : "^\\]",
	TokenType.expr_New : "^\\bnew\\b",
	TokenType.expr_Super : "^\\bsuper\\b",
	TokenType.expr_Extends : "^\\bextends\\b",
	# Operators
	# Logical
	TokenType.op_Relational : "^[><]=?",
	TokenType.op_Equality : "^[=!]=",
	TokenType.op_LAnd : "^&&",
	TokenType.op_LOr : "^\\|\\|",
	TokenType.op_LNot : "^!",
	# Arithmetic
	TokenType.op_CAssign : "^[\\*\\/+-]=",
	TokenType.op_Assign : "^=",
	TokenType.op_Additive : "^[+-]",
	TokenType.op_Multiplicative : "^[\\*\\/]",
	# Literals
	TokenType.literal_BTrue : "^\\btrue\\b",
	TokenType.literal_BFalse : "^\\bfalse\\b",
	TokenType.literal_Number : "^\\d+",
	TokenType.literal_String : "^\"[^\"]*\"",
	TokenType.literal_Null : "^\\bnull\\b",
	# Symbols
	TokenType.sym_This : "^\\bthis\\b",
	TokenType.sym_Identifier : "^\\w+"
}
# The same token patterns as Spec, written in the SRegEx notation and keyed by
# TokenType value.
# compile_regex() passes each entry to SRegEx.transpile(), asserts that the
# result equals the matching Spec entry, and compiles the transpiled string.
# Any edit here therefore needs the corresponding Spec entry updated as well.
const SSpec : Dictionary = \
{
	# Comments
	TokenType.cmt_SL : "start // inline.repeat(0-)",
	TokenType.cmt_ML : "start /* set(whitespace !whitespace).repeat(0-).lazy */",
	# Formatting
	TokenType.fmt_S : "start whitespace.repeat(1-)",
	# Delimiters
	TokenType.delim_Comma : "start ,",
	TokenType.delim_SMR : "start \\.",
	# Statements
	TokenType.def_End : "start ;",
	TokenType.def_BStart : "start {",
	TokenType.def_BEnd : "start }",
	TokenType.def_Var : "start \"let\"",
	TokenType.def_Class : "start \"class\"",
	# Iteration
	TokenType.def_While : "start \"while\"",
	TokenType.def_Do : "start \"do\"",
	TokenType.def_For : "start \"for\"",
	# Procedures
	TokenType.def_Proc : "start \"def\"",
	TokenType.def_Return : "start \"return\"",
	# Conditional
	TokenType.def_If : "start \"if\"",
	TokenType.def_Else : "start \"else\"",
	# Expressions
	TokenType.expr_PStart : "start \\(",
	TokenType.expr_PEnd : "start \\)",
	TokenType.expr_SBStart : "start [",
	TokenType.expr_SBEnd : "start ]",
	TokenType.expr_New : "start \"new\"",
	TokenType.expr_Super : "start \"super\"",
	TokenType.expr_Extends : "start \"extends\"",
	# Operators
	# Logical
	TokenType.op_Relational : "start set(> <) =.repeat(0-1)",
	TokenType.op_Equality : "start set(= \\!) =",
	TokenType.op_LAnd : "start &&",
	TokenType.op_LOr : "start \\| \\|",
	TokenType.op_LNot : "start \\!",
	# Arithmetic
	TokenType.op_CAssign : "start set(* / + \\-) =",
	TokenType.op_Assign : "start =",
	TokenType.op_Additive : "start set(+ \\-)",
	TokenType.op_Multiplicative : "start set(* /)",
	# Literals
	TokenType.literal_BTrue : "start \"true\"",
	TokenType.literal_BFalse : "start \"false\"",
	TokenType.literal_Number : "start digit.repeat(1-)",
	TokenType.literal_String : "start \\\" !set( \\\" ).repeat(0-) \\\"",
	TokenType.literal_Null : "start \"null\"",
	# Symbols
	TokenType.sym_This : "start \"this\"",
	TokenType.sym_Identifier : "start word.repeat(1-)"
}
# One lexeme emitted by tokenize().
class Token:
	# Token category; tokenize() stores one of the TokenType values here.
	var Type : String
	# The exact source text that the token's pattern matched.
	var Value : String
# Program text being tokenized; assigned by init().
var SourceText : String
# Offset into SourceText of the next unread character; advanced by tokenize().
var Cursor : int
# Compiled RegEx per token type, keyed by TokenType value; filled by compile_regex().
var SpecRegex : Dictionary
# Token objects produced by the most recent tokenize() run.
var Tokens : Array
# Index into Tokens of the token that next_Token() returns next.
var TokenIndex : int = 0
# Builds one compiled RegEx per token type and stores it in SpecRegex.
# For every TokenType value the SRegEx source in SSpec is transpiled and checked
# against the hand-written pattern in Spec; the transpiled string is what gets
# compiled. Both a transpile mismatch and a compile failure trip an assert, so a
# broken pattern is reported here instead of surfacing later as unrecognised
# source text in tokenize().
func compile_regex():
	for type in TokenType.values():
		var original = Spec[type]
		var transpiled = SRegEx.transpile(SSpec[type])
		assert(transpiled == original, "transpiled did not match original")

		var regex = RegEx.new()
		# RegEx.compile() reports failure through its return value; do not drop it.
		var status = regex.compile(transpiled)
		assert(status == OK, "failed to compile regex for token type: %s" % type)

		SpecRegex[type] = regex
# Prepares the lexer for a new program and tokenizes it immediately.
# programSrcText: the full source text to lex.
func init(programSrcText):
	# The patterns only need compiling the first time the lexer is used.
	var patternsMissing = SpecRegex.size() == 0
	if patternsMissing:
		compile_regex()

	# Rewind all per-run state before scanning the new text.
	TokenIndex = 0
	Cursor = 0
	SourceText = programSrcText

	tokenize()
# Returns the next unread Token and advances TokenIndex past it,
# or returns null once every token has been handed out.
func next_Token():
	if TokenIndex >= Tokens.size():
		return null

	var current = Tokens[TokenIndex]
	TokenIndex += 1
	return current
# True once Cursor has moved to or past the end of SourceText.
func reached_EndOfText():
	var textLength = SourceText.length()
	return textLength <= Cursor
# Rebuilds Tokens by scanning SourceText from Cursor to the end.
# At each position the token patterns are tried in the order TokenType.values()
# yields them and the first one matching at the cursor is consumed. Comments and
# whitespace advance the cursor without producing a token. If no pattern matches,
# an assert reports the position and the remaining text, and scanning stops.
func tokenize():
	Tokens.clear()

	while not reached_EndOfText():
		var remaining = SourceText.substr(Cursor)
		var recognized = false

		for kind in TokenType.values():
			var found = SpecRegex[kind].search(remaining)
			# Only a match that begins exactly at the cursor counts.
			if found == null or found.get_start() != 0:
				continue

			var lexeme = found.get_string()
			Cursor += lexeme.length()
			recognized = true

			# Comments and whitespace are consumed but never emitted.
			var skipped = kind == TokenType.cmt_SL or kind == TokenType.cmt_ML or kind == TokenType.fmt_S
			if not skipped:
				var token = Token.new()
				token.Type = kind
				token.Value = lexeme
				Tokens.append(token)
			break

		if not recognized:
			var template = "next_token: Source text not understood by tokenizer at Cursor pos: {value} -: {txt}"
			var message = template.format({"value" : Cursor, "txt" : remaining})
			assert(false, message)
			# Cursor cannot advance past unknown text, so stop instead of looping forever.
			return