mirror of
https://github.com/Ed94/LangStudies.git
synced 2025-01-22 12:33:45 -08:00
1075 lines
20 KiB
GDScript
1075 lines
20 KiB
GDScript
extends Object
|
|
|
|
|
|
# Simple Regular Expressions
|
|
# This is a "high-level" language and transpiler for regex
|
|
# That makes it easier to write out and read
|
|
# than the original notation or syntax.
|
|
#
|
|
# The main interface function is compile( <string> )
|
|
# Which can take any valid string from gdscript.
|
|
|
|
|
|
# Lexer
|
|
|
|
# Names of every token category the lexer knows about.
# tokenize() tries these values in declaration order and the first pattern
# that matches wins, so the order of the entries is significant.
const TokenType : Dictionary = \
{
	# Consumed by tokenize() without producing a token
	fmt_S  = "Formatting",
	cmt_SL = "Comment Single Line",

	# String anchors
	str_start = "String Start",
	str_end   = "String End",

	# Escaped parentheses
	glyph_bPOpen  = "\\(",
	glyph_bPClose = "\\)",

	# Grouping parentheses
	expr_PStart = "Parenthesis Start",
	expr_PEnd   = "Parenthesis End",

	# Range separator and keyword glyphs
	glyph_between = "Glyphs Between",
	glyph_digit   = "Digit",
	glyph_inline  = "inline",
	glyph_space   = "Space",
	glyph_word    = "Word",
	glyph_ws      = "Whitespace",

	# Escaped punctuation glyphs
	glyph_dash   = "-",
	glyph_dot    = ". dot",
	glyph_excla  = "! Mark",
	glyph_vertS  = "\\|",
	glyph_dQuote = "\"",

	# Operators
	op_lazy   = "Lazy Operator",
	op_look   = "Lookahead",
	op_not    = "Not Operator",
	op_repeat = "Repeating Operator",
	op_union  = "Union Operator",

	ref = "Backreference Group",
	set = "Set",

	string = "String",

	# Any other single non-whitespace character
	glyph = "Glyph",
}
|
|
|
|
# Regular expression source for each token type, keyed by the TokenType value.
# Every pattern is anchored with ^ because tokenize() matches against the
# text that remains after the cursor.
const Spec : Dictionary = \
{
	TokenType.fmt_S  : "^\\s",
	TokenType.cmt_SL : "^(\\(\\?\\#).+(\\))",

	TokenType.str_start : "^\\bstart\\b",
	TokenType.str_end   : "^\\bend\\b",

	TokenType.string : "^\"[^\"]*\"",

	TokenType.glyph_bPOpen  : "^\\\\\\(",
	TokenType.glyph_bPClose : "^\\\\\\)",

	TokenType.expr_PStart : "^\\(",
	TokenType.expr_PEnd   : "^\\)",

	TokenType.glyph_between : "^\\-",
	TokenType.glyph_digit   : "^\\bdigit\\b",
	TokenType.glyph_inline  : "^\\binline\\b",
	TokenType.glyph_space   : "^\\bspace\\b",
	TokenType.glyph_word    : "^\\bword\\b",
	TokenType.glyph_ws      : "^\\bwhitespace\\b",

	TokenType.op_lazy   : "^\\.\\blazy\\b",
	TokenType.op_repeat : "^\\.\\brepeat\\b",

	TokenType.glyph_dash   : "^\\\\\\-",
	TokenType.glyph_dot    : "^\\\\\\.",
	TokenType.glyph_excla  : "^\\\\\\!",
	TokenType.glyph_vertS  : "^\\\\\\|",
	TokenType.glyph_dQuote : "^\\\\\"",

	TokenType.op_look  : "^\\blook\\b",
	TokenType.op_not   : "^\\!",
	TokenType.op_union : "^\\|",

	TokenType.ref : "^\\bbackref\\b",
	TokenType.set : "^\\bset\\b",

	TokenType.glyph : "^[^\\s]"
}
|
|
|
|
|
|
# One lexeme produced by tokenize().
class Token:
	var Type  : String # The TokenType value whose pattern matched.
	var Value : String # The exact source text that was matched.
|
|
|
|
|
|
# Lexer state
var SourceText : String     # Text handed to init().
var Cursor     : int        # Offset into SourceText where tokenize() matches next.
var SpecRegex  : Dictionary # Compiled RegEx per TokenType value, filled by compile_regex().
var Tokens     : Array      # Tokens gathered by tokenize().
var TokenIndex : int = 0    # Index of the token next_Token() hands out next.
|
|
|
|
|
|
# Compiles the Spec pattern of every token type and stores the resulting
# RegEx object in SpecRegex under the same key.
func compile_regex():
	for tokenKind in TokenType.values() :
		var pattern  = Spec[tokenKind]
		var compiled = RegEx.new()
		compiled.compile( pattern )

		SpecRegex[tokenKind] = compiled
|
|
|
|
# Resets the lexer for `programSrcText` and tokenizes it right away.
func init(programSrcText):
	SourceText = programSrcText
	Cursor     = 0
	TokenIndex = 0

	# The token patterns only have to be compiled the first time through.
	if SpecRegex.empty() :
		compile_regex()

	tokenize()
|
|
|
|
# Hands out the token at TokenIndex and advances the index.
# Returns null once every token has been handed out.
func next_Token():
	if TokenIndex >= Tokens.size() :
		return null

	var fetched = Tokens[TokenIndex]
	TokenIndex += 1

	return fetched
|
|
|
|
# True once the cursor has moved past the last character of SourceText.
func reached_EndOfText():
	var textLength = SourceText.length()
	return Cursor >= textLength
|
|
|
|
# Splits SourceText into Tokens, starting at Cursor.
#
# At every position the TokenType values are tried in declaration order and
# the first pattern that matches at the cursor wins. Whitespace and comments
# advance the cursor without producing a token. When nothing matches, an
# assertion reports the position and tokenizing stops, leaving the tokens
# gathered so far in Tokens.
func tokenize():
	Tokens.clear()

	while reached_EndOfText() == false :
		var srcLeft = SourceText.substr(Cursor)
		var matched = false

		for type in TokenType.values() :
			var result = SpecRegex[type].search( srcLeft )
			if result == null || result.get_start() != 0 :
				continue

			var lexeme = result.get_string()

			Cursor += lexeme.length()
			matched = true

			# Comments and whitespace are skipped; everything else becomes a token.
			if type != TokenType.cmt_SL && type != TokenType.fmt_S :
				var token = Token.new()
				token.Type  = type
				token.Value = lexeme

				Tokens.append( token )

			break

		if matched == false :
			var assertStrTmplt = "tokenize: Source text not understood by tokenizer at Cursor pos: {value} -: {txt}"
			var assertStr      = assertStrTmplt.format({"value" : Cursor, "txt" : srcLeft})
			assert(false, assertStr)
			# Asserts are stripped from release builds, so bail out explicitly.
			return
|
|
|
|
# End : Lexer
|
|
|
|
|
|
|
|
# Parser
|
|
|
|
# Node of the syntax tree built by the parse_* functions.
class ASTNode:
	var Type : String
	var Value # Left untyped (Variant): the parser stores strings, nodes or arrays here.

	# Returns a copy of `array` in which nested arrays are serialized
	# recursively and every object entry is replaced by the result of calling
	# `fn_objSerializer` on it. Any other entry is copied as it is.
	# `fn_objSerializer` is a FuncRef whose function name is already set;
	# its instance is re-bound to each object entry before the call.
	func array_Serialize(array, fn_objSerializer) :
		var serialized = []

		for item in array :
			match typeof(item) :
				TYPE_ARRAY :
					serialized.append( array_Serialize( item, fn_objSerializer ) )

				TYPE_OBJECT :
					fn_objSerializer.set_instance(item)
					serialized.append( fn_objSerializer.call_func() )

				_ :
					serialized.append( item )

		return serialized

	# Returns this node as a nested array of the form [ Type, Value ], with
	# child nodes converted the same way.
	func to_SExpression():
		match typeof(Value) :
			TYPE_ARRAY :
				var serializer = FuncRef.new()
				serializer.set_function("to_SExpression")

				return [ Type, array_Serialize( self.Value, serializer ) ]

			TYPE_OBJECT :
				return [ Type, Value.to_SExpression() ]

		return [ Type, Value ]

	# Returns this node as a dictionary with the keys "Type" and "Value",
	# with child nodes converted the same way.
	func to_Dictionary():
		var payload = self.Value

		match typeof(Value) :
			TYPE_ARRAY :
				var serializer = FuncRef.new()
				serializer.set_function("to_Dictionary")

				payload = array_Serialize( self.Value, serializer )

			TYPE_OBJECT :
				payload = self.Value.to_Dictionary()

		return \
		{
			Type  = self.Type,
			Value = payload
		}
|
|
|
|
# Names of the syntax tree node kinds, stored in ASTNode.Type.
const NodeType : Dictionary = \
{
	expression = "Expression",

	# Composite nodes
	between = "Glyphs Between Set",
	capture = "Capture Group",
	lazy    = "Lazy",
	look    = "Lookahead",
	op_not  = "Not Operator",
	ref     = "Backreference Group",
	repeat  = "Repeat",
	set     = "Set",
	union   = "Union",

	# Leaf nodes
	digit      = "Digit",
	inline     = "Any Inline",
	space      = "Space",
	word       = "Word",
	whitespace = "Whitespace",
	string     = "String",
	str_start  = "String Start",
	str_end    = "String End",

	glyph = "Glyph",
}
|
|
|
|
|
|
# Lookahead token of the parser: the parse_* functions inspect it and eat()
# replaces it with the following token. It is null once the tokens run out.
var NextToken : Token
|
|
|
|
# --------------------------------------------------------------------- HELPERS
|
|
|
|
# Gets the next token only if the current token is the specified intended token (tokenType)
|
|
# Consumes the lookahead token, which must be of type `tokenType`, and
# advances NextToken to the following token.
# Returns the consumed token. Asserts when there is no lookahead or when its
# type differs from `tokenType`.
func eat(tokenType):
	var consumed = NextToken

	assert(consumed != null, "eat: NextToken was null")

	var mismatchMsg = "eat: Unexpected token: {value}, expected: {type}".format({"value" : consumed.Value, "type" : tokenType})
	assert(consumed.Type == tokenType, mismatchMsg)

	NextToken = next_Token()

	return consumed
|
|
|
|
# True when the Type of `glyph` (the lookahead token by default) is one of
# the glyph token types listed below. Note that glyph_space is not among them.
func is_Glyph(glyph = NextToken) :
	var glyphTypes = \
	[
		TokenType.glyph,
		TokenType.glyph_digit,
		TokenType.glyph_inline,
		TokenType.glyph_word,
		TokenType.glyph_ws,
		TokenType.glyph_dash,
		TokenType.glyph_dot,
		TokenType.glyph_excla,
		TokenType.glyph_vertS,
		TokenType.glyph_bPOpen,
		TokenType.glyph_bPClose,
		TokenType.glyph_dQuote,
	]

	return glyph.Type in glyphTypes
|
|
|
|
# True when the lookahead token is a glyph or a string literal.
func is_GlyphOrStr() :
	if is_Glyph() :
		return true

	return NextToken.Type == TokenType.string
|
|
|
|
# True when the lookahead token's text is exactly two characters long and
# starts with a backslash.
#
# NOTE(review): the previous form matched the second character against "0".."9"
# with `continue` on every arm. In a GDScript match, `continue` moves on to the
# patterns below, so every arm ended at the wildcard and returned true; the
# second character never affected the result. Presumably only \0..\9 was meant
# to qualify - confirm before tightening this check.
func is_GroupToken() :
	var lexeme = NextToken.Value

	return lexeme.length() == 2 && lexeme[0] == "\\"
|
|
|
|
# True when the lookahead token's text begins with a digit.
func is_Number() :
	var leadingDigit = RegEx.new()
	leadingDigit.compile("^\\d")

	var found = leadingDigit.search(NextToken.Value)

	return found != null
|
|
|
|
# True when the lookahead token's text is one of the regex metacharacters
# that parse_Glyph() escapes with a backslash: ^ $ * [ ] ?
# Always returns a bool; the fall-through used to be a bare `return` (null).
func is_RegExToken() :
	return NextToken.Value in [ "^", "$", "*", "[", "]", "?" ]
|
|
|
|
# --------------------------------------------------------------------- HELPERS
|
|
|
|
# > Union
|
|
# Union
|
|
# : expression | expression ..
|
|
# | expression
|
|
# ;
|
|
# Parses one expression and, when it is followed by the union operator, the
# alternatives after it.
# `endToken` is forwarded to parse_Expression() as the token type that ends
# the expression (null for none).
# Returns the expression node itself when there is no union operator,
# otherwise a Union node whose Value is [ left expression, remaining union ].
func parse_OpUnion(endToken):
	var lhs = parse_Expression(endToken)

	var hasAlternative = NextToken != null && NextToken.Type == TokenType.op_union
	if hasAlternative == false :
		return lhs

	eat(TokenType.op_union)

	var rhs = parse_OpUnion(endToken)

	var unionNode = ASTNode.new()
	unionNode.Type  = NodeType.union
	unionNode.Value = [ lhs, rhs ]

	return unionNode
|
|
|
|
# > Union
|
|
# Expression
|
|
# : EVERYTHING (Almost)
|
|
# ;
|
|
# Parses a sequence of terms into an Expression node whose Value is the
# array of parsed child nodes.
#
# Parsing stops at the end of the tokens, at a union operator, or at a token
# of type `endToken` (pass null to disable that check); the stopping token is
# left unconsumed.
#
# A token that cannot start a term (for example a bare `-` or `.lazy`) is
# reported through an assertion and ends the expression. Without that arm the
# token was never consumed and the loop below never terminated.
func parse_Expression(endToken):
	var \
	node = ASTNode.new()
	node.Type  = NodeType.expression
	node.Value = []

	while NextToken != null && NextToken.Type != TokenType.op_union :
		if endToken != null && NextToken.Type == endToken :
			break

		match NextToken.Type :
			TokenType.str_start :
				node.Value.append( parse_StrStart() )

			TokenType.str_end :
				node.Value.append( parse_StrEnd() )

			TokenType.expr_PStart :
				node.Value.append( parse_CaptureGroup() )

			TokenType.glyph :
				node.Value.append( parse_Glyph() )

			TokenType.glyph_digit :
				node.Value.append( parse_GlyphDigit() )

			TokenType.glyph_inline :
				node.Value.append( parse_GlyphInline() )

			TokenType.glyph_space :
				node.Value.append( parse_GlyphSpace() )

			TokenType.glyph_word :
				node.Value.append( parse_GlyphWord() )

			TokenType.glyph_ws :
				node.Value.append( parse_GlyphWhitespace() )

			TokenType.glyph_dash :
				node.Value.append( parse_GlyphDash() )

			TokenType.glyph_dot :
				node.Value.append( parse_GlyphDot() )

			TokenType.glyph_excla :
				node.Value.append( parse_GlyphExclamation() )

			TokenType.glyph_vertS :
				node.Value.append( parse_GlyphVertS() )

			TokenType.glyph_bPOpen :
				node.Value.append( parse_Glyph_bPOpen() )

			TokenType.glyph_bPClose :
				node.Value.append( parse_Glyph_bPClose() )

			TokenType.glyph_dQuote :
				node.Value.append( parse_Glyph_DQuote() )

			TokenType.op_look :
				node.Value.append( parse_OpLook() )

			TokenType.op_not :
				node.Value.append( parse_OpNot() )

			TokenType.op_repeat:
				node.Value.append( parse_OpRepeat() )

			TokenType.ref :
				node.Value.append( parse_Backreference() )

			TokenType.set :
				node.Value.append( parse_Set() )

			TokenType.string :
				node.Value.append( parse_String() )

			_ :
				# Nothing above consumed the token, so looping again would never end.
				var assertStrTmplt = "parse_Expression: Unexpected token: {value}"
				var assertStr      = assertStrTmplt.format({"value" : NextToken.Value})
				assert(false, assertStr)
				return node

	return node
|
|
|
|
# > Expression
|
|
# Consumes the `start` keyword and returns a String Start node (no Value).
func parse_StrStart():
	eat(TokenType.str_start)

	var anchor = ASTNode.new()
	anchor.Type = NodeType.str_start

	return anchor
|
|
|
|
# > Expression
|
|
# Consumes the `end` keyword and returns a String End node (no Value).
func parse_StrEnd():
	eat(TokenType.str_end)

	var anchor = ASTNode.new()
	anchor.Type = NodeType.str_end

	return anchor
|
|
|
|
# > Expression
|
|
# Between
|
|
# : glyph
|
|
# | glyph - glyph
|
|
# ;
|
|
# Parses a single glyph, or a `glyph - glyph` range.
#
# `quantifier` is forwarded to parse_Glyph(); when true, consecutive plain
# glyph tokens are merged into one node (used for the numbers in .repeat).
#
# Returns the glyph node on its own when no `-` follows it, otherwise a
# Between node whose Value holds the first glyph and, if one follows the `-`,
# the second glyph. Returns null without consuming anything when the
# lookahead is not one of the glyph kinds handled below.
func parse_Between(quantifier : bool = false):
	var glyph = null

	match NextToken.Type :
		TokenType.glyph :
			glyph = parse_Glyph(quantifier)
		# TokenType.glyph_digit :
		#	glyph = parse_GlyphDigit()
		TokenType.glyph_inline :
			glyph = parse_GlyphInline()
		# TokenType.glyph_word :
		#	glyph = parse_GlyphWord()
		TokenType.glyph_ws :
			glyph = parse_GlyphWhitespace()
		TokenType.glyph_dash :
			glyph = parse_GlyphDash()
		TokenType.glyph_dot :
			glyph = parse_GlyphDot()
		TokenType.glyph_excla :
			glyph = parse_GlyphExclamation()
		TokenType.glyph_vertS :
			glyph = parse_GlyphVertS()
		TokenType.glyph_bPOpen :
			glyph = parse_Glyph_bPOpen()
		TokenType.glyph_bPClose :
			glyph = parse_Glyph_bPClose()
		TokenType.glyph_dQuote :
			glyph = parse_Glyph_DQuote()

	# The lookahead is null when the input ended right after the glyph.
	if NextToken == null || NextToken.Type != TokenType.glyph_between :
		return glyph

	eat(TokenType.glyph_between)

	var \
	node = ASTNode.new()
	node.Type  = NodeType.between
	node.Value = [ glyph ]

	# The upper end of the range is optional.
	if NextToken != null && is_Glyph() :
		node.Value.append( parse_Glyph(quantifier) )

	return node
|
|
|
|
# > Expression
|
|
# CaptureGroup
|
|
# : ( OpUnion )
|
|
# ;
|
|
# Parses `( union )` and returns a Capture Group node whose Value is the
# parsed union/expression between the parentheses.
func parse_CaptureGroup():
	eat(TokenType.expr_PStart)
	var inner = parse_OpUnion(TokenType.expr_PEnd)
	eat(TokenType.expr_PEnd)

	var group = ASTNode.new()
	group.Type  = NodeType.capture
	group.Value = inner

	return group
|
|
|
|
# > Expression
|
|
# > Between
|
|
# Glyph
|
|
# : glyph
|
|
# ;
|
|
# Parses plain glyph tokens into a Glyph node whose Value is the regex text
# for them.
#
# With `numerical` false exactly one glyph token is consumed (if the lookahead
# is one). With `numerical` true every consecutive glyph token is consumed and
# concatenated into the same node.
#
# "/" and the characters accepted by is_RegExToken() are prefixed with a
# backslash; tokens accepted by is_GroupToken() get their backslash doubled.
func parse_Glyph(numerical = false):
	var \
	node = ASTNode.new()
	node.Type  = NodeType.glyph
	node.Value = ""

	# The null check matters in numerical mode: after the last token is eaten
	# the loop condition is evaluated again with no lookahead left.
	while NextToken != null && NextToken.Type == TokenType.glyph :
		if NextToken.Value == "/" :
			node.Value += "\\/"
		elif is_RegExToken() :
			node.Value += "\\" + NextToken.Value
		elif is_GroupToken() :
			node.Value += "\\\\" + NextToken.Value[1]
		else :
			node.Value += NextToken.Value

		eat(TokenType.glyph)

		if numerical == false :
			break

	return node
|
|
|
|
# Consumes the `digit` keyword and returns a Digit node with the Value \d.
func parse_GlyphDigit():
	eat(TokenType.glyph_digit)

	var digit = ASTNode.new()
	digit.Type  = NodeType.digit
	digit.Value = "\\d"

	return digit
|
|
|
|
# Consumes the `inline` keyword and returns an Any Inline node with the Value "."
func parse_GlyphInline():
	eat(TokenType.glyph_inline)

	var anyInline = ASTNode.new()
	anyInline.Type  = NodeType.inline
	anyInline.Value = "."

	return anyInline
|
|
|
|
# Consumes the `space` keyword and returns a Space node.
#
# The Value starts as a single space. When the keyword is followed by a
# parenthesized number, that many further spaces are appended to it.
func parse_GlyphSpace():
	eat(TokenType.glyph_space)

	var \
	node = ASTNode.new()
	node.Type  = NodeType.space
	node.Value = " "

	# `space` may be the very last token, in which case there is no lookahead.
	if NextToken != null && NextToken.Type == TokenType.expr_PStart :
		eat(TokenType.expr_PStart)

		var numGlyph = parse_Glyph(true)
		for _n in range(int(numGlyph.Value)) :
			node.Value += " "

		eat(TokenType.expr_PEnd)

	return node
|
|
|
|
# Consumes the `word` keyword and returns a Word node with the Value \w.
func parse_GlyphWord():
	eat(TokenType.glyph_word)

	var word = ASTNode.new()
	word.Type  = NodeType.word
	word.Value = "\\w"

	return word
|
|
|
|
# Consumes the `whitespace` keyword and returns a Whitespace node with the Value \s.
func parse_GlyphWhitespace():
	eat(TokenType.glyph_ws)

	var whitespace = ASTNode.new()
	whitespace.Type  = NodeType.whitespace
	whitespace.Value = "\\s"

	return whitespace
|
|
|
|
# Consumes an escaped dash token and returns a Glyph node with the Value \-
func parse_GlyphDash():
	eat(TokenType.glyph_dash)

	var dash = ASTNode.new()
	dash.Type  = NodeType.glyph
	dash.Value = "\\-"

	return dash
|
|
|
|
# Consumes an escaped dot token and returns a Glyph node with the Value \.
func parse_GlyphDot():
	eat(TokenType.glyph_dot)

	var dot = ASTNode.new()
	dot.Type  = NodeType.glyph
	dot.Value = "\\."

	return dot
|
|
|
|
# Consumes an escaped exclamation mark token and returns a Glyph node with the Value !
func parse_GlyphExclamation():
	eat(TokenType.glyph_excla)

	var exclamation = ASTNode.new()
	exclamation.Type  = NodeType.glyph
	exclamation.Value = "!"

	return exclamation
|
|
|
|
# Consumes an escaped vertical bar token and returns a Glyph node with the Value \|
func parse_GlyphVertS():
	eat(TokenType.glyph_vertS)

	var bar = ASTNode.new()
	bar.Type  = NodeType.glyph
	bar.Value = "\\|"

	return bar
|
|
|
|
# Consumes an escaped opening parenthesis token and returns a Glyph node with the Value \(
func parse_Glyph_bPOpen():
	eat(TokenType.glyph_bPOpen)

	var paren = ASTNode.new()
	paren.Type  = NodeType.glyph
	paren.Value = "\\("

	return paren
|
|
|
|
# Consumes an escaped closing parenthesis token and returns a Glyph node with the Value \)
func parse_Glyph_bPClose():
	eat(TokenType.glyph_bPClose)

	var paren = ASTNode.new()
	paren.Type  = NodeType.glyph
	paren.Value = "\\)"

	return paren
|
|
|
|
# Consumes an escaped double quote token and returns a Glyph node whose Value is a double quote.
func parse_Glyph_DQuote():
	eat(TokenType.glyph_dQuote)

	var quote = ASTNode.new()
	quote.Type  = NodeType.glyph
	quote.Value = "\""

	return quote
|
|
|
|
# > Expression
|
|
# : .lazy
|
|
# ;
|
|
# Consumes the `.lazy` operator and returns a Lazy node (no Value).
func parse_OpLazy():
	eat(TokenType.op_lazy)

	var lazyNode = ASTNode.new()
	lazyNode.Type = NodeType.lazy

	return lazyNode
|
|
|
|
# > Expression
|
|
# > OpNot
|
|
# Look
|
|
# : look ( Expression )
|
|
# ;
|
|
# Parses `look ( ... )` and returns a Lookahead node whose Value is the
# Capture Group node parsed from the parentheses.
func parse_OpLook():
	eat(TokenType.op_look)

	var group = parse_CaptureGroup()

	var lookNode = ASTNode.new()
	lookNode.Type  = NodeType.look
	lookNode.Value = group

	return lookNode
|
|
|
|
# > Expression
|
|
# OpNot
|
|
# : !
|
|
# | CaptureGroup
|
|
# | GlyphDigit
|
|
# | GlyphWord
|
|
# | GlyphWhitespace
|
|
# | OpLook
|
|
# | String
|
|
# | Set
|
|
# ;
|
|
# Parses `!` followed by its operand and returns a Not Operator node whose
# Value is the operand node.
#
# Supported operands: capture group, digit, word, whitespace, lookahead,
# string and set. A missing or unsupported operand is reported through an
# assertion; the returned node then has a null Value.
func parse_OpNot():
	eat(TokenType.op_not)

	var \
	node = ASTNode.new()
	node.Type = NodeType.op_not

	if NextToken == null :
		assert(false, "parse_OpNot: Expected an operand after ! but reached the end of the expression")
		return node

	match NextToken.Type:
		TokenType.expr_PStart:
			node.Value = parse_CaptureGroup()

		TokenType.glyph_digit:
			node.Value = parse_GlyphDigit()

		TokenType.glyph_word:
			node.Value = parse_GlyphWord()

		TokenType.glyph_ws:
			node.Value = parse_GlyphWhitespace()

		TokenType.op_look:
			node.Value = parse_OpLook()

		TokenType.string:
			node.Value = parse_String()

		TokenType.set:
			node.Value = parse_Set()

		_:
			# Report the problem here rather than as a null access later on.
			var assertStrTmplt = "parse_OpNot: The ! operator cannot be applied to: {value}"
			var assertStr      = assertStrTmplt.format({"value" : NextToken.Value})
			assert(false, assertStr)

	return node
|
|
|
|
# > Expression
|
|
# OpRepeat
|
|
# : .repeat ( opt# optBetween opt# ) opt.lazy
|
|
# ;
|
|
# Parses `.repeat( bounds )` with an optional trailing `.lazy`.
# Returns a Repeat node whose Value is [ bounds node, Lazy node or null ].
func parse_OpRepeat():
	eat(TokenType.op_repeat)

	eat(TokenType.expr_PStart)
	var bounds = parse_Between(true)
	eat(TokenType.expr_PEnd)

	var laziness = null
	if NextToken != null && NextToken.Type == TokenType.op_lazy :
		laziness = parse_OpLazy()

	var repeatNode = ASTNode.new()
	repeatNode.Type  = NodeType.repeat
	repeatNode.Value = [ bounds, laziness ]

	return repeatNode
|
|
|
|
# Parses `backref( glyph )` and returns a Backreference Group node whose
# Value is the text of the glyph token between the parentheses.
# Asserts when the token after the opening parenthesis is not a plain glyph.
func parse_Backreference():
	eat(TokenType.ref)
	eat(TokenType.expr_PStart)

	var assertStr = "Error when parsing a backreference expression: Expected digit but got: {value}".format({"value" : NextToken.Value})
	assert(NextToken.Type == TokenType.glyph, assertStr)

	var refNode = ASTNode.new()
	refNode.Type  = NodeType.ref
	refNode.Value = NextToken.Value

	eat(TokenType.glyph)
	eat(TokenType.expr_PEnd)

	return refNode
|
|
|
|
# Parses `set( ... )` and returns a Set node whose Value is the array of its
# entries: glyph nodes, Between nodes, and Not Operator nodes wrapping a glyph.
#
# Entries are read for as long as the lookahead is a glyph or `!`. A `!` whose
# operand is not a glyph, or a glyph kind that parse_Between() does not
# handle, is reported through an assertion and ends the entry list.
func parse_Set():
	eat(TokenType.set)

	var \
	node = ASTNode.new()
	node.Type  = NodeType.set
	node.Value = []

	eat(TokenType.expr_PStart)

	while NextToken != null && ( is_Glyph() || NextToken.Type == TokenType.op_not ) :
		if NextToken.Type == TokenType.op_not :
			var possibleGlyph = parse_OpNot()
			if possibleGlyph.Value != null && is_Glyph(possibleGlyph.Value) :
				node.Value.append( possibleGlyph )
				continue

			assert(false, "Bad ! operator in set.")
			break

		var entry = parse_Between()
		if entry == null :
			# parse_Between() consumed nothing, so looping again would never end.
			var assertStrTmplt = "parse_Set: Unsupported glyph in set: {value}"
			var assertStr      = assertStrTmplt.format({"value" : NextToken.Value})
			assert(false, assertStr)
			break

		node.Value.append( entry )

	eat(TokenType.expr_PEnd)

	return node
|
|
|
|
# Consumes a string literal token and returns a String node whose Value is
# the text between the quotes.
func parse_String():
	# The string pattern guarantees an opening and a closing quote with no
	# quote in between, so the content is everything but the two end characters.
	var literal = NextToken.Value
	var content = literal.substr(1, literal.length() - 2)

	var stringNode = ASTNode.new()
	stringNode.Type  = NodeType.string
	stringNode.Value = content

	eat(TokenType.string)

	return stringNode
|
|
|
|
# End: Parser
|
|
|
|
|
|
# Transpiling
|
|
|
|
# Transpiler state
var ExprAST     : ASTNode # Root of the tree built by the most recent compile() call.
var RegexResult : String  # NOTE(review): nothing in this script assigns it - confirm it is still needed.
|
|
|
|
# Tokenizes and parses `expression`, keeps the resulting tree in ExprAST and
# returns the regex text generated from it.
func compile(expression : String):
	init( expression )

	# Prime the lookahead before the parser starts.
	NextToken = next_Token()
	ExprAST   = parse_OpUnion(null)

	var regexText = compile_Union(ExprAST)

	return regexText
|
|
|
|
# Generates the regex text for an Expression node or a Union node.
#
# For an Expression node its entries are emitted one after another. For a
# Union node the entries of the left expression are emitted, followed by "|"
# and the text generated for the right side.
func compile_Union(node : ASTNode):
	var isUnion = node.Type == NodeType.union

	var entries = node.Value
	if isUnion :
		entries = node.Value[0].Value

	var result = ""

	for entry in entries :
		match entry.Type :
			NodeType.str_start :
				result += "^"
			NodeType.str_end :
				result += "$"

			NodeType.capture :
				result += compile_CaptureGroup(entry, false)
			NodeType.look :
				result += compile_LookAhead(entry, false)
			NodeType.ref :
				result += compile_Backreference(entry)
			NodeType.repeat :
				result += compile_Repeat(entry)
			NodeType.set :
				result += compile_Set(entry, false)

			# These nodes already carry their regex text.
			NodeType.glyph, NodeType.inline, NodeType.digit, NodeType.space, NodeType.word, NodeType.whitespace :
				result += entry.Value

			NodeType.string :
				result += compile_String(entry, false)

			NodeType.op_not :
				result += compile_OpNot(entry)

	if isUnion && node.Value[1] != null :
		result += "|" + compile_Union(node.Value[1])

	return result
|
|
|
|
# Generates the regex text for a Capture Group node: its content wrapped in
# "(" ... ")", or in "(?:" ... ")" when `negate` is true.
func compile_CaptureGroup(node : ASTNode, negate : bool):
	var opener = "(?:" if negate else "("

	return opener + compile_Union(node.Value) + ")"
|
|
|
|
# Generates the regex text for a Lookahead node: the content of its capture
# group wrapped in "(?=" ... ")", or in "(?!" ... ")" when `negate` is true.
func compile_LookAhead(node : ASTNode, negate : bool):
	var opener = "(?!" if negate else "(?="

	return opener + compile_Union(node.Value.Value) + ")"
|
|
|
|
# Generates the regex text for a Backreference Group node: a backslash
# followed by the node's Value.
func compile_Backreference(node : ASTNode):
	return "\\" + node.Value
|
|
|
|
# Generates the quantifier text for a Repeat node.
#
#   single glyph n            -> {n}
#   range with only a lower   -> *  for 0,  +  for 1,  otherwise {n,}
#   range with both ends      -> ?  for 0-1,  otherwise {n,m}
#
# A trailing "?" is added when the node carries a Lazy node.
func compile_Repeat(node : ASTNode):
	var bounds = node.Value[0]
	var isLazy = node.Value[1] != null

	var quantifier = ""

	if bounds.Type != NodeType.between :
		quantifier += "{" + bounds.Value + "}"
	else :
		var limits = bounds.Value

		if limits.size() == 1 :
			var atLeast = limits[0].Value

			if atLeast == "0" :
				quantifier += "*"
			elif atLeast == "1" :
				quantifier += "+"
			else :
				quantifier += "{" + atLeast + "," + "}"

		elif limits.size() == 2 :
			if limits[0].Value == "0" && limits[1].Value == "1" :
				quantifier += "?"
			else :
				quantifier += "{" + limits[0].Value + "," + limits[1].Value + "}"

	if isLazy :
		quantifier += "?"

	return quantifier
|
|
|
|
# Generates the regex text for a Set node: its entries wrapped in "[" ... "]",
# or in "[^" ... "]" when `negate` is true. Between entries are emitted as
# "first-second", Not Operator entries through compile_OpNot(), and anything
# else as its Value.
func compile_Set(node : ASTNode, negate : bool):
	var result = "[^" if negate else "["

	for entry in node.Value :
		match entry.Type :
			NodeType.op_not :
				result += compile_OpNot(entry)
			NodeType.between :
				result += entry.Value[0].Value + "-" + entry.Value[1].Value
			_ :
				result += entry.Value

	return result + "]"
|
|
|
|
# Generates the regex text for a String node: its Value surrounded by \b on
# both sides, or by \B on both sides when `negate` is true.
func compile_String(node : ASTNode, negate : bool):
	var boundary = "\\B" if negate else "\\b"

	return boundary + node.Value + boundary
|
|
|
|
# Generates the regex text for the negated form of a Not Operator node's
# operand. Returns an empty string for an operand kind not listed below.
func compile_OpNot(node : ASTNode):
	var operand = node.Value

	match operand.Type :
		NodeType.capture :
			return compile_CaptureGroup(operand, true)
		NodeType.digit :
			return "\\D"
		NodeType.word :
			return "\\W"
		NodeType.whitespace :
			return "\\S"
		NodeType.look :
			return compile_LookAhead(operand, true)
		NodeType.string :
			return compile_String(operand, true)
		NodeType.set :
			return compile_Set(operand, true)

	return ""
|