LangStudies/App/RDP/Scripts/RDP_Lexer.gd

# Regex-based lexer: turns a source string into a flat list of Token objects,
# using one compiled pattern per TokenType entry.
extends Object

# Instance of the SRegex script. compile_regex() uses it to transpile the SSpec
# patterns into regular expression strings.
var SRegEx = preload("res://RegM/Scripts/SRegex.gd").new()


class_name RDP_Lexer


# Human-readable names for every token kind the lexer recognises. The values
# are used as keys into Spec / SSpec / SpecRegex and are stored in Token.Type.
#
# Entry order matters: tokenize() walks TokenType.values() and accepts the
# first pattern that matches at the cursor, so more specific patterns have to
# be listed before the more general ones that would also match.
const TokenType : Dictionary = \
{
	# Comments (listed before the operators so "//" and "/*" are not read as "/")
	cmt_SL = "Comment Single Line",
	cmt_ML = "Comment Multi-Line",
	
	# Formatting (whitespace; consumed by tokenize() without emitting a token)
	fmt_S = "Formatting String",
	
	# Delimiters
	delim_Comma = "Comma Delimiter",
	delim_SMR   = "Symbol Member Resolution",

	# Statements
	def_End    = "Statement End",
	def_BStart = "Block Start",
	def_BEnd   = "Block End",
	def_Var    = "Variable Declaration",
	def_Class  = "Class",

	# Iteration
	def_While  = "While",
	def_Do	   = "Do-While",
	def_For    = "For",

	# Procedures
	def_Proc   = "Procedure Declaration",
	def_Return = "Return",

	# Conditional
	def_If   = "If Statement",
	def_Else = "Else Statement",

	# Expressions
	expr_PStart  = "Parenthesis Start",
	expr_PEnd	 = "Parenthesis End",
	expr_SBStart = "Bracket Start",
	expr_SBEnd   = "Bracket End",
	expr_New     = "New Expression",
	expr_Super   = "Super Expression",
	expr_Extends = "Class Extension",

	# Operators

	# Logical
	# op_Equality precedes op_LNot (here) and op_Assign (below) so that "!="
	# and "==" are not split into two tokens.
	op_Relational = "Relational",
	op_Equality   = "Equality",
	op_LAnd       = "Logical And",
	op_LOr        = "Logical Or",
	op_LNot       = "Logical Not",

	# Arithmetic
	# op_CAssign precedes the plain operators so "+=", "-=", "*=" and "/=" are
	# matched as a single token.
	op_CAssign         = "ComplexAssignment",
	op_Assign          = "Assignment",
	op_Additive        = "AdditiveOperator",
	op_Multiplicative  = "MultiplicativeOperator",

	# Literals
	literal_BTrue  = "True", 
	literal_BFalse = "False",
	literal_Number = "Number",
	literal_String = "String",
	literal_Null   = "Null Value",

	# Symbols
	# sym_Identifier has to stay last: its pattern (one or more word
	# characters) also matches every keyword and number listed above.
	sym_This        = "This Reference",
	sym_Identifier  = "User Identifier",
}

# Reference regular expression for each token type, keyed by TokenType value.
#
# compile_regex() does not compile these directly: it transpiles the matching
# SSpec entry and asserts that the result is identical to the string stored
# here. Every pattern starts with "^" so it can only match at the beginning of
# the remaining source text, and keyword patterns are wrapped in "\b" so they
# only match whole words.
const Spec : Dictionary = \
{
	# Comments
	TokenType.cmt_SL : "^\\/\\/.*",
	# Lazy "[\s\S]*?" spans newlines and stops at the first "*/".
	TokenType.cmt_ML : "^\\/\\*[\\s\\S]*?\\*\\/",

	# Formatting
	TokenType.fmt_S : "^\\s+",

	# Delimiters
	TokenType.delim_Comma : "^,",
	TokenType.delim_SMR   : "^\\.",
	
	# Statements
	TokenType.def_End    : "^;",
	TokenType.def_BStart : "^{",
	TokenType.def_BEnd   : "^}",
	TokenType.def_Var    : "^\\blet\\b",
	TokenType.def_Class  : "^\\bclass\\b",

	# Iteration
	TokenType.def_While : "^\\bwhile\\b",
	TokenType.def_Do    : "^\\bdo\\b",
	TokenType.def_For   : "^\\bfor\\b",

	# Procedures
	TokenType.def_Proc   : "^\\bdef\\b",
	TokenType.def_Return : "^\\breturn\\b",

	# Conditional
	TokenType.def_If     : "^\\bif\\b",
	TokenType.def_Else   : "^\\belse\\b",

	# Expressions
	TokenType.expr_PStart  : "^\\(",
	TokenType.expr_PEnd    : "^\\)",
	TokenType.expr_SBStart : "^\\[",
	TokenType.expr_SBEnd   : "^\\]",
	TokenType.expr_New     : "^\\bnew\\b",
	TokenType.expr_Super   : "^\\bsuper\\b",
	TokenType.expr_Extends : "^\\bextends\\b",

	# Operators

	# Logical
	# Relational: "<", ">", "<=", ">=". Equality: "==" and "!=".
	TokenType.op_Relational : "^[><]=?",
	TokenType.op_Equality   : "^[=!]=",
	TokenType.op_LAnd       : "^&&",
	TokenType.op_LOr        : "^\\|\\|",
	TokenType.op_LNot       : "^!",

	# Arithmetic
	# Complex assignment: "*=", "/=", "+=", "-=".
	TokenType.op_CAssign        : "^[\\*\\/+\\-]=",
	TokenType.op_Assign         : "^=",
	TokenType.op_Additive       : "^[+\\-]",
	TokenType.op_Multiplicative : "^[\\*\\/]",

	# Literals
	TokenType.literal_BTrue  : "^\\btrue\\b",
	TokenType.literal_BFalse : "^\\bfalse\\b",
	# Numbers are unsigned digit runs only.
	TokenType.literal_Number : "^\\d+",
	# Double-quoted strings; the pattern has no escape handling, so a string
	# ends at the next double quote.
	TokenType.literal_String : "^\"[^\"]*\"",
	TokenType.literal_Null   : "^\\bnull\\b",

	# Symbols
	TokenType.sym_This       : "^\\bthis\\b",
	TokenType.sym_Identifier : "^\\w+"
}

# The same token patterns as Spec, written in SRegEx notation and keyed by
# TokenType value.
#
# compile_regex() feeds each entry to SRegEx.compile() and asserts that the
# output equals the matching Spec entry; the transpiled string is what gets
# compiled into SpecRegex.
#
# NOTE(review): the SRegEx grammar is defined in SRegex.gd, not in this file.
# Judging only by the paired Spec entries, `start` appears to correspond to
# "^", a double-quoted word to a "\b"-delimited keyword, `set(...)` to a
# character class and `.repeat(a-b)` to a quantifier. Confirm against SRegex.gd.
const SSpec : Dictionary = \
{
	# Comments
	TokenType.cmt_SL : "start // inline.repeat(0-)",
	TokenType.cmt_ML : "start /* set(whitespace !whitespace).repeat(0-).lazy */",

	# Formatting
	TokenType.fmt_S : "start whitespace.repeat(1-)",

	# Delimiters
	TokenType.delim_Comma : "start ,",
	TokenType.delim_SMR   : "start \\.",

	# Statements
	TokenType.def_End    : "start ;",
	TokenType.def_BStart : "start {",
	TokenType.def_BEnd   : "start }",
	TokenType.def_Var    : "start \"let\"",
	TokenType.def_Class  : "start \"class\"",

	# Iteration
	TokenType.def_While : "start \"while\"",
	TokenType.def_Do    : "start \"do\"",
	TokenType.def_For   : "start \"for\"",

	# Procedures
	TokenType.def_Proc   : "start \"def\"",
	TokenType.def_Return : "start \"return\"",

	# Conditional
	TokenType.def_If     : "start \"if\"",
	TokenType.def_Else   : "start \"else\"",

	# Expressions
	TokenType.expr_PStart  : "start \\(",
	TokenType.expr_PEnd    : "start \\)",
	TokenType.expr_SBStart : "start [",
	TokenType.expr_SBEnd   : "start ]",
	TokenType.expr_New     : "start \"new\"",
	TokenType.expr_Super   : "start \"super\"",
	TokenType.expr_Extends : "start \"extends\"",

	# Operators

	# Logical
	TokenType.op_Relational : "start set(> <) =.repeat(0-1)",
	TokenType.op_Equality   : "start set(= \\!) =",
	TokenType.op_LAnd       : "start &&",
	TokenType.op_LOr        : "start \\| \\|",
	TokenType.op_LNot       : "start \\!",

	# Arithmetic
	TokenType.op_CAssign        : "start set(* / + \\-) =",
	TokenType.op_Assign         : "start =",
	TokenType.op_Additive       : "start set(+ \\-)",
	TokenType.op_Multiplicative : "start set(* /)",

	# Literals
	TokenType.literal_BTrue  : "start \"true\"",
	TokenType.literal_BFalse : "start \"false\"",
	TokenType.literal_Number : "start digit.repeat(1-)",
	TokenType.literal_String : "start \\\" !set( \\\" ).repeat(0-) \\\"",
	TokenType.literal_Null   : "start \"null\"",

	# Symbols
	TokenType.sym_This       : "start \"this\"",
	TokenType.sym_Identifier : "start word.repeat(1-)"
}

# One lexeme produced by tokenize().
class Token:
	# The TokenType value of the pattern that matched.
	var Type  : String
	# The exact source text that was matched.
	var Value : String


# Lexer state.
# Program text handed to init().
var SourceText : String
# Offset into SourceText of the next character tokenize() has yet to consume.
var Cursor     : int
# TokenType value -> compiled RegEx, filled once by compile_regex().
var SpecRegex  : Dictionary
# Tokens produced by the last tokenize() run, in source order.
var Tokens     : Array
# Index into Tokens of the token next_Token() will hand out next.
var TokenIndex : int = 0


# Fills SpecRegex with one compiled RegEx per token type.
#
# Each SSpec entry is transpiled through SRegEx and checked against the
# hand-written pattern in Spec before the transpiled form is compiled.
func compile_regex():
	for tokenType in TokenType.values() :
		var expected = Spec[tokenType]
		var pattern  = SRegEx.compile(SSpec[tokenType])

		# Guards against the transpiler drifting from the reference patterns.
		assert(pattern == expected, "transpiled did not match original")

		var compiled = RegEx.new()
		compiled.compile( pattern )

		SpecRegex[tokenType] = compiled

# Loads a new program into the lexer and tokenizes it immediately.
#
# programSrcText: the source text to tokenize.
# The read position and the next_Token() index are reset; the regex table is
# compiled only the first time, while SpecRegex is still empty.
func init(programSrcText):
	TokenIndex = 0
	Cursor     = 0
	SourceText = programSrcText

	var regexTableReady = SpecRegex.size() != 0
	if not regexTableReady :
		compile_regex()

	tokenize()

# Hands out the tokens gathered by tokenize() one at a time.
#
# Returns the token at TokenIndex and advances the index, or null once every
# token has been handed out.
func next_Token():
	if TokenIndex >= Tokens.size() :
		return null

	var current = Tokens[TokenIndex]
	TokenIndex += 1

	return current

# Returns true once Cursor has moved to or past the end of SourceText.
func reached_EndOfText():
	var textLength = SourceText.length()
	return Cursor >= textLength

# Converts SourceText into Tokens, reading from the current Cursor to the end.
#
# At each position the token types are tried in the order returned by
# TokenType.values() and the first pattern that matches at the cursor wins.
# Comments and whitespace advance the cursor without producing a token.
#
# When no pattern matches, the failure is reported and tokenizing stops; the
# tokens gathered up to that point stay in Tokens.
func tokenize():
	Tokens.clear()

	while reached_EndOfText() == false :
		var srcLeft = SourceText.substr(Cursor)
		var matched = false

		for type in TokenType.values() :
			var result = SpecRegex[type].search( srcLeft )
			if  result == null || result.get_start() != 0 :
				continue

			var lexeme = result.get_string()

			# Comments and whitespace are consumed but never emitted, so a
			# Token is only allocated for everything else.
			var skipped = type == TokenType.cmt_SL || type == TokenType.cmt_ML || type == TokenType.fmt_S
			if not skipped :
				var token = Token.new()

				token.Type  = type
				token.Value = lexeme

				Tokens.append( token )

			Cursor += lexeme.length()
			matched = true
			break

		if not matched :
			var errorTmplt = "tokenize: Source text not understood by tokenizer at Cursor pos: {value} -: {txt}"
			var errorStr   = errorTmplt.format({"value" : Cursor, "txt" : srcLeft})

			# assert() is stripped from release builds, so log the failure as
			# well; otherwise a truncated token list would go unnoticed there.
			push_error(errorStr)
			assert(false, errorStr)
			return
Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00			`extends Object`

SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`var SRegEx = preload("res://RegM/Scripts/SRegex.gd").new()`

Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00
EOI: Lecture 7 complete 2022-07-23 23:56:37 -07:00			`class_name RDP_Lexer`
Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00

			`const TokenType : Dictionary = \`
			`{`
			`# Comments`
			`cmt_SL = "Comment Single Line",`
			`cmt_ML = "Comment Multi-Line",`

			`# Formatting`
			`fmt_S = "Formatting String",`

			`# Delimiters`
			`delim_Comma = "Comma Delimiter",`
			`delim_SMR = "Symbol Member Resolution",`

			`# Statements`
			`def_End = "Statement End",`
			`def_BStart = "Block Start",`
			`def_BEnd = "Block End",`
			`def_Var = "Variable Declaration",`
			`def_Class = "Class",`

			`# Iteration`
			`def_While = "While",`
			`def_Do = "Do-While",`
			`def_For = "For",`

			`# Procedures`
			`def_Proc = "Procedure Declaration",`
			`def_Return = "Return",`

			`# Conditional`
			`def_If = "If Statement",`
			`def_Else = "Else Statement",`

			`# Expressions`
			`expr_PStart = "Parenthesis Start",`
			`expr_PEnd = "Parenthesis End",`
			`expr_SBStart = "Bracket Start",`
			`expr_SBEnd = "Bracket End",`
			`expr_New = "New Expression",`
			`expr_Super = "Super Expression",`
			`expr_Extends = "Class Extension",`

			`# Operators`

			`# Logical`
			`op_Relational = "Relational",`
			`op_Equality = "Equality",`
			`op_LAnd = "Logical And",`
			`op_LOr = "Logical Or",`
			`op_LNot = "Logical Not",`

			`# Arithmetic`
			`op_CAssign = "ComplexAssignment",`
			`op_Assign = "Assignment",`
			`op_Additive = "AdditiveOperator",`
			`op_Multiplicative = "MultiplicativeOperator",`

			`# Literals`
			`literal_BTrue = "True",`
			`literal_BFalse = "False",`
			`literal_Number = "Number",`
			`literal_String = "String",`
			`literal_Null = "Null Value",`

			`# Symbols`
			`sym_This = "This Reference",`
			`sym_Identifier = "User Identifier",`
			`}`

			`const Spec : Dictionary = \`
			`{`
			`# Comments`
			`TokenType.cmt_SL : "^\\/\\/.*",`
			`TokenType.cmt_ML : "^\\/\\*[\\s\\S]*?\\*\\/",`

			`# Formatting`
			`TokenType.fmt_S : "^\\s+",`

			`# Delimiters`
			`TokenType.delim_Comma : "^,",`
			`TokenType.delim_SMR : "^\\.",`

			`# Statements`
			`TokenType.def_End : "^;",`
			`TokenType.def_BStart : "^{",`
			`TokenType.def_BEnd : "^}",`
			`TokenType.def_Var : "^\\blet\\b",`
			`TokenType.def_Class : "^\\bclass\\b",`

			`# Iteration`
			`TokenType.def_While : "^\\bwhile\\b",`
			`TokenType.def_Do : "^\\bdo\\b",`
			`TokenType.def_For : "^\\bfor\\b",`

			`# Procedures`
			`TokenType.def_Proc : "^\\bdef\\b",`
			`TokenType.def_Return : "^\\breturn\\b",`

			`# Conditional`
			`TokenType.def_If : "^\\bif\\b",`
			`TokenType.def_Else : "^\\belse\\b",`

			`# Expressions`
			`TokenType.expr_PStart : "^\\(",`
			`TokenType.expr_PEnd : "^\\)",`
			`TokenType.expr_SBStart : "^\\[",`
			`TokenType.expr_SBEnd : "^\\]",`
			`TokenType.expr_New : "^\\bnew\\b",`
			`TokenType.expr_Super : "^\\bsuper\\b",`
			`TokenType.expr_Extends : "^\\bextends\\b",`

			`#Operators`

			`# Logical`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.op_Relational : "^[><]=?",`
Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00			`TokenType.op_Equality : "^[=!]=",`
			`TokenType.op_LAnd : "^&&",`
			`TokenType.op_LOr : "^\\|\\|",`
			`TokenType.op_LNot : "^!",`

			`# Arithmetic`
Lecture 6 complete. 2022-07-20 11:57:26 -07:00			`TokenType.op_CAssign : "^[\\*\\/+\\-]=",`
Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00			`TokenType.op_Assign : "^=",`
Lecture 6 complete. 2022-07-20 11:57:26 -07:00			`TokenType.op_Additive : "^[+\\-]",`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.op_Multiplicative : "^[\\*\\/]",`
Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00
			`# Literals`
			`TokenType.literal_BTrue : "^\\btrue\\b",`
			`TokenType.literal_BFalse : "^\\bfalse\\b",`
			`TokenType.literal_Number : "^\\d+",`
			`TokenType.literal_String : "^\"[^\"]*\"",`
			`TokenType.literal_Null : "^\\bnull\\b",`

			`# Symbols`
			`TokenType.sym_This : "^\\bthis\\b",`
			`TokenType.sym_Identifier : "^\\w+"`
			`}`

SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`const SSpec : Dictionary = \`
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00			`{`
			`# Comments`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.cmt_SL : "start // inline.repeat(0-)",`
			`TokenType.cmt_ML : "start /* set(whitespace !whitespace).repeat(0-).lazy */",`
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00
			`# Formatting`
			`TokenType.fmt_S : "start whitespace.repeat(1-)",`

			`# Delimiters`
			`TokenType.delim_Comma : "start ,",`
			`TokenType.delim_SMR : "start \\.",`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00			`# Statements`
			`TokenType.def_End : "start ;",`
			`TokenType.def_BStart : "start {",`
			`TokenType.def_BEnd : "start }",`
			`TokenType.def_Var : "start \"let\"",`
			`TokenType.def_Class : "start \"class\"",`

			`# Iteration`
			`TokenType.def_While : "start \"while\"",`
			`TokenType.def_Do : "start \"do\"",`
			`TokenType.def_For : "start \"for\"",`

			`# Procedures`
			`TokenType.def_Proc : "start \"def\"",`
			`TokenType.def_Return : "start \"return\"",`

			`# Conditional`
			`TokenType.def_If : "start \"if\"",`
			`TokenType.def_Else : "start \"else\"",`

			`# Expressions`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.expr_PStart : "start \\(",`
			`TokenType.expr_PEnd : "start \\)",`
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00			`TokenType.expr_SBStart : "start [",`
			`TokenType.expr_SBEnd : "start ]",`
			`TokenType.expr_New : "start \"new\"",`
			`TokenType.expr_Super : "start \"super\"",`
			`TokenType.expr_Extends : "start \"extends\"",`

			`#Operators`

			`# Logical`
			`TokenType.op_Relational : "start set(> <) =.repeat(0-1)",`
			`TokenType.op_Equality : "start set(= \\!) =",`
			`TokenType.op_LAnd : "start &&",`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.op_LOr : "start \\| \\|",`
			`TokenType.op_LNot : "start \\!",`
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00
			`# Arithmetic`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.op_CAssign : "start set(* / + \\-) =",`
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00			`TokenType.op_Assign : "start =",`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.op_Additive : "start set(+ \\-)",`
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00			`TokenType.op_Multiplicative : "start set(* /)",`

			`# Literals`
			`TokenType.literal_BTrue : "start \"true\"",`
			`TokenType.literal_BFalse : "start \"false\"",`
			`TokenType.literal_Number : "start digit.repeat(1-)",`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00			`TokenType.literal_String : "start \\\" !set( \\\" ).repeat(0-) \\\"",`
Worked on SRegex transpiler to RegEx, 2022-07-17 00:09:42 -07:00			`TokenType.literal_Null : "start \"null\"",`

			`# Symbols`
			`TokenType.sym_This : "start \"this\"",`
			`TokenType.sym_Identifier : "start word.repeat(1-)"`
			`}`
Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00
			`class Token:`
			`var Type : String`
			`var Value : String`


			`var SourceText : String`
			`var Cursor : int`
			`var SpecRegex : Dictionary`
			`var Tokens : Array`
			`var TokenIndex : int = 0`


			`func compile_regex():`
			`for type in TokenType.values() :`
			`var \`
			`regex = RegEx.new()`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00
			`var original = Spec[type]`
Lecture 6 complete. 2022-07-20 11:57:26 -07:00			`var transpiled = SRegEx.compile(SSpec[type])`
SRegEx works!!!! Its not a full flegged transpiler but it works at least on RDP's lexer. I can expand on demand. 2022-07-17 04:32:57 -07:00
			`assert(transpiled == original, "transpiled did not match original")`

			`regex.compile( transpiled )`
Renamed BAPFS -> RDP, RDP completed. 2022-07-14 14:12:25 -07:00
			`SpecRegex[type] = regex`

			`func init(programSrcText):`
			`SourceText = programSrcText`
			`Cursor = 0`
			`TokenIndex = 0`

			`if SpecRegex.size() == 0 :`
			`compile_regex()`

			`tokenize()`

			`func next_Token():`

			`var nextToken = null`

			`if Tokens.size() > TokenIndex :`
			`nextToken = Tokens[TokenIndex]`
			`TokenIndex += 1`

			`return nextToken`

			`func reached_EndOfText():`
			`return Cursor >= SourceText.length()`

			`func tokenize():`
			`Tokens.clear()`

			`while reached_EndOfText() == false :`
			`var srcLeft = SourceText.substr(Cursor)`
			`var token = Token.new()`

			`var error = true`
			`for type in TokenType.values() :`
			`var result = SpecRegex[type].search( srcLeft )`
			`if result == null || result.get_start() != 0 :`
			`continue`

			`# Skip Comments`
			`if type == TokenType.cmt_SL || type == TokenType.cmt_ML :`
			`Cursor += result.get_string().length()`
			`error = false`
			`break`

			`# Skip Whitespace`
			`if type == TokenType.fmt_S :`
			`var addVal = result.get_string().length()`

			`Cursor += addVal`
			`error = false`
			`break`

			`token.Type = type`
			`token.Value = result.get_string()`
			`Cursor += ( result.get_string().length() )`

			`Tokens.append( token )`

			`error = false`
			`break;`

			`if error :`
			`var assertStrTmplt = "next_token: Source text not understood by tokenizer at Cursor pos: {value} -: {txt}"`
			`var assertStr = assertStrTmplt.format({"value" : Cursor, "txt" : srcLeft})`
			`assert(true != true, assertStr)`
			`return`