SRegEx works!!!!

It's not a full-fledged transpiler, but it works at least on RDP's lexer. I can expand on demand.
This commit is contained in:
Edward R. Gonzalez 2022-07-17 07:32:57 -04:00
parent 5ae405e284
commit 17c3b8fe36
6 changed files with 653 additions and 490 deletions

View File

@ -1,5 +1,7 @@
extends Object
var SRegEx = preload("res://RegM/Scripts/SRegex.gd").new()
class_name Lexer
@ -118,17 +120,17 @@ const Spec : Dictionary = \
#Operators
# Logical
TokenType.op_Relational : "^[>\\<]=?",
TokenType.op_Relational : "^[><]=?",
TokenType.op_Equality : "^[=!]=",
TokenType.op_LAnd : "^&&",
TokenType.op_LOr : "^\\|\\|",
TokenType.op_LNot : "^!",
# Arithmetic
TokenType.op_CAssign : "^[*\\/\\+\\-]=",
TokenType.op_CAssign : "^[\\*\\/+-]=",
TokenType.op_Assign : "^=",
TokenType.op_Additive : "^[+\\-]",
TokenType.op_Multiplicative : "^[*\\/]",
TokenType.op_Additive : "^[+-]",
TokenType.op_Multiplicative : "^[\\*\\/]",
# Literals
TokenType.literal_BTrue : "^\\btrue\\b",
@ -142,11 +144,11 @@ const Spec : Dictionary = \
TokenType.sym_Identifier : "^\\w+"
}
const SSpec : Dictonary =
const SSpec : Dictionary = \
{
# Comments
TokenType.cmt_SL : "start // inline.repeat()",
TokenType.cmt_ML : "start /* set(whitespace !whitespace).repeat.lazy */",
TokenType.cmt_SL : "start // inline.repeat(0-)",
TokenType.cmt_ML : "start /* set(whitespace !whitespace).repeat(0-).lazy */",
# Formatting
TokenType.fmt_S : "start whitespace.repeat(1-)",
@ -176,8 +178,8 @@ const SSpec : Dictonary =
TokenType.def_Else : "start \"else\"",
# Expressions
TokenType.expr_PStart : "start \(",
TokenType.expr_PEnd : "start \)",
TokenType.expr_PStart : "start \\(",
TokenType.expr_PEnd : "start \\)",
TokenType.expr_SBStart : "start [",
TokenType.expr_SBEnd : "start ]",
TokenType.expr_New : "start \"new\"",
@ -190,20 +192,20 @@ const SSpec : Dictonary =
TokenType.op_Relational : "start set(> <) =.repeat(0-1)",
TokenType.op_Equality : "start set(= \\!) =",
TokenType.op_LAnd : "start &&",
TokenType.op_LOr : "start \\\| \\\|",
TokenType.op_LNot : "start \\\!",
TokenType.op_LOr : "start \\| \\|",
TokenType.op_LNot : "start \\!",
# Arithmetic
TokenType.op_CAssign : "start set(* / + -) =",
TokenType.op_CAssign : "start set(* / + \\-) =",
TokenType.op_Assign : "start =",
TokenType.op_Additive : "start set(+ -)",
TokenType.op_Additive : "start set(+ \\-)",
TokenType.op_Multiplicative : "start set(* /)",
# Literals
TokenType.literal_BTrue : "start \"true\"",
TokenType.literal_BFalse : "start \"false\"",
TokenType.literal_Number : "start digit.repeat(1-)",
TokenType.literal_String : "start \\\" !set( \\\" ).repeat(1-) \\\" ",
TokenType.literal_String : "start \\\" !set( \\\" ).repeat(0-) \\\"",
TokenType.literal_Null : "start \"null\"",
# Symbols
@ -227,10 +229,15 @@ func compile_regex():
for type in TokenType.values() :
var \
regex = RegEx.new()
regex.compile( Spec[type] )
var original = Spec[type]
var transpiled = SRegEx.transpile(SSpec[type])
assert(transpiled == original, "transpiled did not match original")
regex.compile( transpiled )
SpecRegex[type] = regex
# SpecRegex[type].compile( Spec[type] )
func init(programSrcText):
SourceText = programSrcText

View File

@ -1,7 +1,7 @@
## Concatenation
Regex : `/^AB$/`
Pseudo: `start str(AB) end`
Pseudo: `start AB end`
Machine:
```
@ -13,7 +13,7 @@ Submachine_A --epsilon--> Submachine_B
## Union
Regex : `/^A|B$/`
Pseudo: `start glyph(A) | glyph(B) end`
Pseudo: `start A | B end`
Machine:
```
@ -27,7 +27,7 @@ Machine:
## Kleene Closure
Regex : `/^A*$/`
Pseudo: `start glyph(A).repeating end`
Pseudo: `start A.repeat(0-) end`
Machine:
```

View File

@ -0,0 +1,30 @@
# Complex Machines
Ex:
RegEx : `/xy*|z/`
SRegEx: `x y.repeat(0-) | z`
## Decomposition
### Stage 1: Union
```
->o.start (o)
\epsilon-> o --xy*-> o -epsilon-->/
\epsilon-> o --z---> o -epsilon->/
```
### Stage 2: Concatenation
```
->o.start (o)
\epsilon -> o --x--> o -epsilon-> o --y* -epsilon->/
\epsilon -> o --z--> o -epsilon------------------>/
```
### Stage 3: Kleene Closure
```
|<------------<|
->epsi -> o -x-> o -epsi-> o -epsi-> o -y-> -epsi-> o ->epsi->|
| |>---------------------->| /
->o.start (o)
\epsi -> o -z-> o -epsi------------------------------------>/
```

View File

@ -0,0 +1,11 @@
# Syntactic Sugar
Ex:
RegEx : `/a+|[0-3]/`
SRegEx: `a.repeat(1-) | set(0-3)`
`A+` === `AA*` === `A.repeat(1-)` === `AA.repeat(0-)`
`A?` === `A|ε` === `A.repeat(0-1)`
`[0-9]` === `0|1|2|3|4|5|6|7|8|9` === `set(0-9)`

View File

@ -96,8 +96,6 @@ func union_pair(a : NFA, b : NFA):
return NFA.new(start, accepting)
func test():
var state_1 = State.new(false)
var state_2 = State.new(true)

View File

@ -12,27 +12,30 @@ extends Object
# Lexer
const TokenType = \
const TokenType : Dictionary = \
{
fmt_S = "Formatting",
str_start = "String Start",
str_end = "String End",
glyph_bPOpen = "\\(",
glyph_bPClose = "\\)",
expr_PStart = "Parenthesis Start",
expr_PEnd = "Parenthesis End",
glyph = "Glyph",
glyph_between = "Glyphs Between",
glyph_digit = "Digit",
glyph_inline = "inline",
glyph_word = "Word",
glyph_ws = "Whitespace",
glyph_dash = "-"
glyph_dash = "-",
glyph_dot = ". dot",
glyph_excla = "! Mark",
glyph_vertS = "|",
glyph_bPOpen = "(",
glyph_bPClose = ")",
glyph_dQuote = "\""
glyph_vertS = "\\|",
glyph_dQuote = "\"",
op_lazy = "Lazy Operator",
op_look = "Lookahead",
@ -43,47 +46,49 @@ const TokenType = \
ref = "Backreference Group",
set = "Set",
str_start = "String Start",
str_end = "String End",
string = "String"
string = "String",
glyph = "Glyph",
}
const TokenSpec = \
const Spec : Dictionary = \
{
TokenType.fmt_S = "^\\s",
TokenType.fmt_S : "^\\s",
TokenType.string = "^\"[^\"]*\"",
TokenType.str_start : "^\\bstart\\b",
TokenType.str_end : "^\\bend\\b",
TokenType.expr_PStart = "^\\(",
TokenType.expr_PEnd = "^\\)",
TokenType.string : "^\"[^\"]*\"",
TokenType.glyph_between = "^\\-"
TokenType.glyph_digit = "^\\bdigit\\b",
TokenType.glyph_inline = "^\\binline\\b",
TokenType.glyph_word = "^\\bword\\b",
TokenType.glyph_ws = "^\\bwhitespace\\b",
TokenType.glyph_bPOpen : "^\\\\\\(",
TokenType.glyph_bPClose : "^\\\\\\)",
TokenType.op_lazy = "^\\b.lazy\\b",
TokenType.op_repeat = "^\\b\\.repeat\\b",
TokenType.expr_PStart : "^\\(",
TokenType.expr_PEnd : "^\\)",
TokenType.glyph_dash = "^\\\-"
TokenType.glyph_dot = "^\\\.",
TokenType.glyph_excla = "^\\\!",
TokenType.glyph_vertS = "^\\\|",
TokenType.glyph_bPOpen = "^\\\(",
TokenType.glyph_bPClose = "^\\\)",
TokenType.glpyh_dQuote = "^\\\"",
TokenType.glyph_between : "^\\-",
TokenType.glyph_digit : "^\\bdigit\\b",
TokenType.glyph_inline : "^\\binline\\b",
TokenType.glyph_word : "^\\bword\\b",
TokenType.glyph_ws : "^\\bwhitespace\\b",
TokenType.op_look = "^\\blook\\b",
TokenType.op_not = "^\\!",
TokenType.op_union = "^\\|",
TokenType.op_lazy : "^\\.\\blazy\\b",
TokenType.op_repeat : "^\\.\\brepeat\\b",
TokenType.ref = "^\\bbackref\\b",
TokenType.set = "^\\bset\\b",
TokenType.str_start = "^\\bstart\\b",
TokenType.str_end = "^\\bend\\b",
TokenType.glyph_dash : "^\\\\\\-",
TokenType.glyph_dot : "^\\\\\\.",
TokenType.glyph_excla : "^\\\\\\!",
TokenType.glyph_vertS : "^\\\\\\|",
TokenType.glyph_dQuote : "^\\\\\"",
TokenType.glyph = "^[\\w\\d]"
TokenType.op_look : "^\\blook\\b",
TokenType.op_not : "^\\!",
TokenType.op_union : "^\\|",
TokenType.ref : "^\\bbackref\\b",
TokenType.set : "^\\bset\\b",
TokenType.glyph : "^[^\\s]"
}
@ -103,6 +108,7 @@ func compile_regex():
for type in TokenType.values() :
var \
regex = RegEx.new()
var _spec = Spec[type]
regex.compile( Spec[type] )
SpecRegex[type] = regex
@ -161,7 +167,7 @@ func tokenize():
break;
if error :
var assertStrTmplt = "next_token: Source text not understood by tokenizer at Cursor pos: {value} -: {txt}"
var assertStrTmplt = "next_Token: Source text not understood by tokenizer at Cursor pos: {value} -: {txt}"
var assertStr = assertStrTmplt.format({"value" : Cursor, "txt" : srcLeft})
assert(true != true, assertStr)
return
@ -245,23 +251,23 @@ const NodeType = \
{
expression = "Expression",
between = "Glyphs Between Set"
between = "Glyphs Between Set",
capture = "Capture Group",
lazy = "Lazy",
look = "Lookahead",
op_not = "Not Operator",
ref = "Backreference Group",
repeat = "Repeat",
set = "Set",
union = "Union",
inline = "Inline",
digit = "Digit",
inline = "Any Inline"
inline = "Any Inline",
word = "Word",
whitespace = "Whitespace",
string = "String"
strStart = "String Start",
strEnd = "String End",
string = "String",
str_start = "String Start",
str_end = "String End",
glyph = "Glyph",
}
@ -286,19 +292,30 @@ func eat(tokenType):
return currToken
func is_Glyph() :
match NextToken:
func is_Glyph(glyph = NextToken) :
match glyph.Type:
TokenType.glyph :
return true
TokenType.glyph_digit :
return true
TokenType.glyph_inline :
return true
TokenType.glyph_word :
return true
TokenType.glyph_ws :
return true
TokenType.glyph_dash :
return true
TokenType.glyph_dot :
return true
TokenType.glyph_excla :
return true
TokenType.glyph_vertS :
return true
TokenType.glyph_bPOpen :
return true
TokenType.glyph_bPClose :
return true
TokenType.glyph_dQuote :
return true
@ -307,6 +324,39 @@ func is_Glyph() :
func is_GlyphOrStr() :
return is_Glyph() || NextToken.Type == TokenType.string
func is_GroupToken() :
if NextToken.Value.length() == 2 && NextToken.Value[0] == "\\" :
match NextToken.Value[1] :
"0" : continue
"1" : continue
"2" : continue
"3" : continue
"4" : continue
"5" : continue
"6" : continue
"7" : continue
"8" : continue
"9" : continue
_:
return true
return false
func is_RegExToken() :
match NextToken.Value :
"^" :
return true
"$" :
return true
"*" :
return true
"[" :
return true
"]" :
return true
"?" :
return true
return
# --------------------------------------------------------------------- HELPERS
# > Union
@ -314,10 +364,10 @@ func is_GlyphOrStr() :
# : expression | expression ..
# | expression
# ;
func parse_OpUnion():
var expression = parse_Expression(TokenType.union)
func parse_OpUnion(endToken : Token):
var expression = parse_Expression(endToken)
if NextToken.Type != TokenType.union :
if NextToken == null || NextToken.Type != TokenType.union :
return expression
eat(TokenType.op_union)
@ -325,7 +375,7 @@ func parse_OpUnion():
var \
node = ASTNode.new()
node.Type = NodeType.union
node.Value = [ expression, parse_union() ]
node.Value = [ expression, parse_OpUnion(endToken) ]
return node
@ -333,14 +383,18 @@ func parse_OpUnion():
# Expression
# : EVERYTHING (Almost)
# ;
func parse_Expression(end_token : Token):
func parse_Expression(endToken : Token):
var \
node = ASTNode.new()
node.Type = NodeType.Expression
node.Type = NodeType.expression
node.Value = []
while NextToken != null && NextToken.Type != end_token :
match NextToken.Type
var sentinel = endToken != null
if sentinel :
sentinel = NextToken.Type == endToken.Type
while NextToken != null && !sentinel :
match NextToken.Type :
TokenType.str_start :
node.Value.append( parse_StrStart() )
@ -414,7 +468,7 @@ func parse_StrStart():
var \
node = ASTNode.new()
node.Type = NodeType.strStart
node.Type = NodeType.str_start
return node
@ -424,7 +478,7 @@ func parse_StrEnd():
var \
node = ASTNode.new()
node.Type = NodeType.strEnd
node.Type = NodeType.str_end
return node
@ -434,9 +488,46 @@ func parse_StrEnd():
# | glyph - glyph
# ;
func parse_Between():
var glyph = parse_Glyph()
var glyph
if NextToken.Type != TokenType.between :
match NextToken.Type :
TokenType.glyph :
glyph = parse_Glyph()
TokenType.glyph_digit :
glyph = parse_GlyphDigit()
TokenType.glyph_inline :
glyph = parse_GlyphInline()
TokenType.glyph_word :
glyph = parse_GlyphWord()
TokenType.glyph_ws :
glyph = parse_GlyphWhitespace()
TokenType.glyph_dash :
glyph = parse_GlyphDash()
TokenType.glyph_dot :
glyph = parse_GlyphDot()
TokenType.glyph_excla :
glyph = parse_GlyphExclamation()
TokenType.glyph_vertS :
glyph = parse_GlyphVertS()
TokenType.glyph_bPOpen :
glyph = parse_Glyph_bPOpen()
TokenType.glyph_bPClose :
glyph = parse_Glyph_bPClose()
TokenType.glyph_dQuote :
glyph = parse_Glyph_DQuote()
if NextToken.Type != TokenType.glyph_between :
return glyph
var \
@ -449,7 +540,7 @@ func parse_Between():
if NextToken.Type == TokenType.glyph_between:
eat(TokenType.glyph_between)
if is_Glyph()
if is_Glyph() :
node.Value.append( parse_Glyph() )
return node
@ -464,7 +555,7 @@ func parse_CaptureGroup():
var \
node = ASTNode.new()
node.Type = NodeType.capture
node.Value = parse_union(TokenType.expr_PEnd)
node.Value = parse_OpUnion(TokenType.expr_PEnd)
eat(TokenType.expr_PEnd)
@ -476,13 +567,21 @@ func parse_CaptureGroup():
# : glyph
# ;
func parse_Glyph():
eat(TokenType.glyph)
var \
node = ASTNode.new()
node.Type = NodeType.glyph
if NextToken.Value == "/" :
node.Value = "\\/"
elif is_RegExToken() :
node.Value = "\\" + NextToken.Value
elif is_GroupToken() :
node.Value = "\\\\" + NextToken.Value[1]
else :
node.Value = NextToken.Value
eat(TokenType.glyph)
return node
func parse_GlyphDigit():
@ -501,7 +600,7 @@ func parse_GlyphInline():
var \
node = ASTNode.new()
node.Type = NodeType.inline
node.Value = "\."
node.Value = "."
return node
@ -550,8 +649,8 @@ func parse_GlyphExclamation():
var \
node = ASTNode.new()
ndoe.Type = NodeType.glyph
node.Value = "\\!"
node.Type = NodeType.glyph
node.Value = "!"
return node
@ -591,7 +690,7 @@ func parse_Glyph_DQuote():
var \
node = ASTNode.new()
node.Type = NodeType.glyph
node.Value = "\\\""
node.Value = "\""
return node
@ -636,7 +735,7 @@ func parse_OpNot():
var \
node = ASTNode.new()
node.Type = NodeType.op_Not
node.Type = NodeType.op_not
match NextToken.Type:
TokenType.expr_PStart:
@ -651,7 +750,7 @@ func parse_OpNot():
TokenType.glyph_ws:
node.Value = parse_GlyphWhitespace()
TokenType.look:
TokenType.op_look:
node.Value = parse_OpLook()
TokenType.string:
@ -673,19 +772,19 @@ func parse_OpRepeat():
node = ASTNode.new()
node.Type = NodeType.repeat
var range = null
var vrange = null
var lazy = null
eat(TokenType.expr_PStart)
range = parse_Between()
vrange = parse_Between()
eat(TokenType.expr_PEnd)
if NextToken.Type == TokenType.lazy :
if NextToken && NextToken.Type == TokenType.op_lazy :
lazy = parse_OpLazy();
node.Value = [ range, lazy ]
node.Value = [ vrange, lazy ]
return node
@ -699,7 +798,7 @@ func parse_Backreference():
eat(TokenType.expr_PStart)
var assertStrTmplt = "Error when parsing a backreference expression: Expected digit but got: {value}"
var assertStr = assertStrTmplt.format({"value" : NextToken.Value)
var assertStr = assertStrTmplt.format({"value" : NextToken.Value})
assert(NextToken.Type == TokenType.glyph_digit, assertStr)
node.Value = NextToken.Value
@ -718,7 +817,15 @@ func parse_Set():
eat(TokenType.expr_PStart)
while is_Glyph() :
while is_Glyph() || NextToken.Type == TokenType.op_not :
if NextToken.Type == TokenType.op_not :
var possibleGlyph = parse_OpNot()
if is_Glyph(possibleGlyph.Value) :
node.Value.append( possibleGlyph )
continue
assert(true == false, "Bad ! operator in set.")
node.Value.append( parse_Between() )
eat(TokenType.expr_PEnd)
@ -726,12 +833,19 @@ func parse_Set():
return node
func parse_String():
var string = ""
var index = 1
while NextToken.Value[index] != "\"" :
string += NextToken.Value[index]
index += 1
var \
node = ASTNode.new()
node.Type = NodeType.string
node.Value = NextToken.Value
node.Value = string
eat(TokenType.str)
eat(TokenType.string)
return node
@ -746,17 +860,20 @@ var RegexResult : String
func transpile(expression : String):
init( expression )
NextToken = next_token()
ExprAST = parse_union()
NextToken = next_Token()
ExprAST = parse_OpUnion(null)
return transiple_Union(ExprAST)
func transiple_Union(node : ASTNode):
var result = String
var expressionLeft = node.Value[0]
var result = ""
var expressionLeft = node.Value
for entry in expressionLeft
match entry :
if node.Type == NodeType.union :
expressionLeft = node.Value[0]
for entry in expressionLeft :
match entry.Type :
NodeType.str_start:
result += "^"
NodeType.str_end:
@ -775,13 +892,13 @@ func transiple_Union(node : ASTNode):
NodeType.glyph:
result += entry.Value
NodeType.glyph_inline:
NodeType.inline:
result += entry.Value
NodeType.glyph_digit:
NodeType.digit:
result += entry.Value
NodeType.glyph_word:
NodeType.word:
result += entry.Value
NodeType.glyph_ws:
NodeType.whitespace:
result += entry.Value
NodeType.string:
@ -791,21 +908,12 @@ func transiple_Union(node : ASTNode):
result += transpile_OpNot(entry)
if node.Value[1] != null :
if node.Type == NodeType.union && node.Value[1] != null :
result += "|"
result += transiple_Union(node.Value[1])
return result
func transpile_Between(node : ASTNode):
var \
result : "["
result += node.Value[0]
result += node.Value[1]
result += "]"
return result
func transpile_CaptureGroup(node : ASTNode, negate : bool):
var result = ""
@ -830,6 +938,8 @@ func transpile_LookAhead(node : ASTNode, negate : bool):
result += transiple_Union(node.Value)
result += ")"
return result
func transpile_Backreference(node : ASTNode):
var \
result = "\\"
@ -837,31 +947,31 @@ func transpile_Backreference(node : ASTNode):
return result
func transpile_Repeat(node : ASTNode)
func transpile_Repeat(node : ASTNode):
var result = ""
var range = node.Value[0]
var vrange = node.Value[0]
var lazy = node.Value[1]
if range.Type == NodeType.between :
if range.Value.length() == 1 :
if range.Value[0] == "0" :
if vrange.Type == NodeType.between :
if vrange.Value.size() == 1 :
if vrange.Value[0].Value == "0" :
result += "*"
if range.Value[0] == "1" :
if vrange.Value[0].Value == "1" :
result += "+"
if range.Value.length() == 2 :
if range.Vlaue[0] == "0" && range.Value[1] == "1" :
if vrange.Value.size() == 2 :
if vrange.Value[0].Value == "0" && vrange.Value[1].Value == "1" :
result += "?"
else :
result += "{" + range.Value[0] + "," + range.Value[1] + "}"
result += "{" + vrange.Value[0].Value[0] + "," + vrange.Value[0].Value[1] + "}"
else :
result += "{" + range.Value[0] + "}"
result += "{" + vrange.Value[0] + "}"
if lazy != null :
result += "?"
return result
func transpile_Set(node : ASTNode, negate : bool)
func transpile_Set(node : ASTNode, negate : bool):
var result = ""
if negate :
@ -870,6 +980,13 @@ func transpile_Set(node : ASTNode, negate : bool)
result += "["
for entry in node.Value :
if entry.Type == NodeType.op_not :
result += transpile_OpNot(entry)
elif entry.Type == NodeType.between :
result += entry.Value[0]
result += "-"
result += entry.Value[1]
else :
result += entry.Value
result += "]"
@ -898,18 +1015,18 @@ func transpile_OpNot(node : ASTNode):
var entry = node.Value
match entry :
match entry.Type :
NodeType.capture:
result += transpile_CaptureGroup(entry, true)
NodeType.glyph_digit:
NodeType.digit:
result += "\\D"
NodeType.glyph_word:
NodeType.word:
result += "\\W"
NodeType.glyph_ws:
NodeType.whitespace:
result += "\\S"
NodeType.glyph_look:
NodeType.look:
result += transpile_LookAhead(entry, true)
NodType.string:
NodeType.string:
result += transpile_String(entry, true)
NodeType.set:
result += transpile_Set(entry, true)