Merge pull request #5209 from Feoramund/regex-fixes

Fix RegEx iterator, remove `.Global`, make patterns unanchored by default (breaking change)
This commit is contained in:
Jeroen van Rijn
2025-05-24 15:38:26 +02:00
committed by GitHub
6 changed files with 254 additions and 43 deletions
-3
View File
@@ -15,8 +15,6 @@ MAX_PROGRAM_SIZE :: int(max(i16))
MAX_CLASSES :: int(max(u8))
Flag :: enum u8 {
// Global: try to match the pattern anywhere in the string.
Global,
// Multiline: treat `^` and `$` as if they also match newlines.
Multiline,
// Case Insensitive: treat `a-z` as if it was also `A-Z`.
@@ -36,7 +34,6 @@ Flags :: bit_set[Flag; u8]
@(rodata)
Flag_To_Letter := #sparse[Flag]u8 {
.Global = 'g',
.Multiline = 'm',
.Case_Insensitive = 'i',
.Ignore_Whitespace = 'x',
+9 -5
View File
@@ -401,7 +401,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
pc_open := 0
add_global: if .Global in flags {
optimize_opening: {
// Check if the opening to the pattern is predictable.
// If so, use one of the optimized Wait opcodes.
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
@@ -412,7 +412,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
break add_global
break optimize_opening
case .Rune:
operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
@@ -420,24 +420,28 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
pc_open += size_of(Opcode)
inject_raw(&code, pc_open, operand)
pc_open += size_of(rune)
break add_global
break optimize_opening
case .Rune_Class:
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
break add_global
break optimize_opening
case .Rune_Class_Negated:
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
break add_global
break optimize_opening
case .Save:
continue
case .Assert_Start:
break optimize_opening
case:
break seek_loop
}
+62 -9
View File
@@ -77,6 +77,8 @@ Match_Iterator :: struct {
vm: virtual_machine.Machine,
idx: int,
temp: runtime.Allocator,
threads: int,
done: bool,
}
/*
@@ -101,7 +103,6 @@ create :: proc(
permanent_allocator := context.allocator,
temporary_allocator := context.temp_allocator,
) -> (result: Regular_Expression, err: Error) {
// For the sake of speed and simplicity, we first run all the intermediate
// processes such as parsing and compilation through the temporary
// allocator.
@@ -166,7 +167,6 @@ to escape the delimiter if found in the middle of the string.
All runes after the closing delimiter will be parsed as flags:
- 'g': Global
- 'm': Multiline
- 'i': Case_Insensitive
- 'x': Ignore_Whitespace
@@ -243,7 +243,6 @@ create_by_user :: proc(
// to `end` here.
for r in pattern[start + end:] {
switch r {
case 'g': flags += { .Global }
case 'm': flags += { .Multiline }
case 'i': flags += { .Case_Insensitive }
case 'x': flags += { .Ignore_Whitespace }
@@ -282,8 +281,6 @@ create_iterator :: proc(
permanent_allocator := context.allocator,
temporary_allocator := context.temp_allocator,
) -> (result: Match_Iterator, err: Error) {
flags := flags
flags += {.Global} // We're iterating over a string, so the next match could start anywhere
if .Multiline in flags {
return {}, .Unsupported_Flag
@@ -294,6 +291,7 @@ create_iterator :: proc(
result.temp = temporary_allocator
result.vm = virtual_machine.create(result.regex.program, str)
result.vm.class_data = result.regex.class_data
result.threads = max(1, virtual_machine.opcode_count(result.vm.code) - 1)
return
}
@@ -457,8 +455,27 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
assert(len(it.capture.pos) >= common.MAX_CAPTURE_GROUPS,
"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
// Guard against situations in which the iterator should finish.
if it.done {
return
}
runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
if it.idx > 0 {
// Reset the state needed to `virtual_machine.run` again.
it.vm.top_thread = 0
it.vm.current_rune = rune(0)
it.vm.current_rune_size = 0
for i in 0..<it.threads {
it.vm.threads[i] = {}
it.vm.next_threads[i] = {}
}
}
// Take note of where the string pointer is before we start.
sp_before := it.vm.string_pointer
saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
{
context.allocator = it.temp
@@ -469,6 +486,28 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
}
}
if !ok {
// Match failed, bail out.
return
}
if it.vm.string_pointer == sp_before {
// The string pointer did not move, but there was a match.
//
// At this point, the pattern supplied to the iterator will infinitely
// loop if we do not intervene.
it.done = true
}
if it.vm.string_pointer == len(it.vm.memory) {
// The VM hit the end of the string.
//
// We do not check at the start, because a match of pattern `$`
// against string "" is valid and must return a match.
//
// This check prevents a double-match of `$` against a non-empty string.
it.done = true
}
str := string(it.vm.memory)
num_groups: int
@@ -488,9 +527,7 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
num_groups = n
}
defer if ok {
it.idx += 1
}
defer it.idx += 1
if num_groups > 0 {
result = {it.capture.pos[:num_groups], it.capture.groups[:num_groups]}
@@ -504,8 +541,24 @@ match :: proc {
match_iterator,
}
/*
Reset an iterator, allowing it to be run again as if new.
Inputs:
- it: The iterator to reset.
*/
reset :: proc(it: ^Match_Iterator) {
it.idx = 0
it.done = false
it.idx = 0
it.vm.string_pointer = 0
it.vm.top_thread = 0
it.vm.current_rune = rune(0)
it.vm.current_rune_size = 0
for i in 0..<it.threads {
it.vm.threads[i] = {}
it.vm.next_threads[i] = {}
}
}
/*
@@ -329,10 +329,10 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
when UNICODE_MODE {
vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer:])
} else {
if len(vm.memory) > 0 {
vm.next_rune = cast(rune)vm.memory[0]
vm.next_rune = cast(rune)vm.memory[vm.string_pointer]
vm.next_rune_size = 1
}
}
@@ -652,4 +652,4 @@ destroy :: proc(vm: Machine, allocator := context.allocator) {
delete(vm.busy_map)
free(vm.threads)
free(vm.next_threads)
}
}
@@ -103,9 +103,11 @@ expensive_for_backtrackers :: proc(t: ^testing.T) {
@test
global_capture_end_word :: proc(t: ^testing.T) {
// NOTE: The previous behavior of `.Global`, which was to automatically
// insert `.*?` at the start of the pattern, is now default.
EXPR :: `Hellope World!`
rex, err := regex.create(EXPR, { .Global })
rex, err := regex.create(EXPR, { /*.Global*/ })
if !testing.expect_value(t, err, nil) {
return
}
@@ -145,7 +147,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) {
EXPR :: `こにちは`
needle := string(EXPR)
rex, err := regex.create(EXPR, { .Global, .Unicode })
rex, err := regex.create(EXPR, { /*.Global,*/ .Unicode })
if !testing.expect_value(t, err, nil) {
return
}
@@ -185,7 +187,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) {
alternations :: proc(t: ^testing.T) {
EXPR :: `a(?:bb|cc|dd|ee|ff)`
rex, err := regex.create(EXPR, { .No_Capture, .Global })
rex, err := regex.create(EXPR, { .No_Capture, /*.Global*/ })
if !testing.expect_value(t, err, nil) {
return
}
@@ -219,7 +221,7 @@ classes :: proc(t: ^testing.T) {
EXPR :: `[\w\d]+`
NEEDLE :: "0123456789abcdef"
rex, err := regex.create(EXPR, { .Global })
rex, err := regex.create(EXPR, { /*.Global*/ })
if !testing.expect_value(t, err, nil) {
return
}
+174 -19
View File
@@ -51,13 +51,13 @@ check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex
}
check_expression :: proc(t: ^testing.T, pattern, haystack: string, needles: ..string, extra_flags := regex.Flags{}, loc := #caller_location) {
check_expression_with_flags(t, pattern, { .Global } + extra_flags,
check_expression_with_flags(t, pattern, extra_flags,
haystack, ..needles, loc = loc)
check_expression_with_flags(t, pattern, { .Global, .No_Optimization } + extra_flags,
check_expression_with_flags(t, pattern, { .No_Optimization } + extra_flags,
haystack, ..needles, loc = loc)
check_expression_with_flags(t, pattern, { .Global, .Unicode } + extra_flags,
check_expression_with_flags(t, pattern, { .Unicode } + extra_flags,
haystack, ..needles, loc = loc)
check_expression_with_flags(t, pattern, { .Global, .Unicode, .No_Optimization } + extra_flags,
check_expression_with_flags(t, pattern, { .Unicode, .No_Optimization } + extra_flags,
haystack, ..needles, loc = loc)
}
@@ -72,17 +72,18 @@ expect_error :: proc(t: ^testing.T, pattern: string, expected_error: typeid, fla
testing.expect_value(t, variant_ti, expected_ti, loc = loc)
}
check_capture :: proc(t: ^testing.T, got, expected: regex.Capture, loc := #caller_location) {
testing.expect_value(t, len(got.pos), len(got.groups), loc = loc)
testing.expect_value(t, len(got.pos), len(expected.pos), loc = loc)
testing.expect_value(t, len(got.groups), len(expected.groups), loc = loc)
check_capture :: proc(t: ^testing.T, got, expected: regex.Capture, loc := #caller_location) -> (ok: bool) {
testing.expect_value(t, len(got.pos), len(got.groups), loc = loc) or_return
testing.expect_value(t, len(got.pos), len(expected.pos), loc = loc) or_return
testing.expect_value(t, len(got.groups), len(expected.groups), loc = loc) or_return
if len(got.pos) == len(expected.pos) {
for i in 0..<len(got.pos) {
testing.expect_value(t, got.pos[i], expected.pos[i], loc = loc)
testing.expect_value(t, got.groups[i], expected.groups[i], loc = loc)
testing.expect_value(t, got.pos[i], expected.pos[i], loc = loc) or_return
testing.expect_value(t, got.groups[i], expected.groups[i], loc = loc) or_return
}
}
return true
}
@test
@@ -212,6 +213,12 @@ test_alternation :: proc(t: ^testing.T) {
check_expression(t, EXPR, "cc", "cc")
}
@test
test_alternation_order :: proc(t: ^testing.T) {
check_expression(t, "a|ab", "ab", "a")
check_expression(t, "ab|a", "ab", "ab")
}
@test
test_optional :: proc(t: ^testing.T) {
EXPR :: "a?a?a?aaa"
@@ -516,7 +523,7 @@ test_pos_index_explicitly :: proc(t: ^testing.T) {
STR :: "This is an island."
EXPR :: `\bis\b`
rex, err := regex.create(EXPR, { .Global })
rex, err := regex.create(EXPR)
if !testing.expect_value(t, err, nil) {
return
}
@@ -642,9 +649,9 @@ test_unicode_explicitly :: proc(t: ^testing.T) {
}
{
EXPR :: "こにちは!"
check_expression_with_flags(t, EXPR, { .Global, .Unicode },
check_expression_with_flags(t, EXPR, { .Unicode },
"Hello こにちは!", "こにちは!")
check_expression_with_flags(t, EXPR, { .Global, .Unicode, .No_Optimization },
check_expression_with_flags(t, EXPR, { .Unicode, .No_Optimization },
"Hello こにちは!", "こにちは!")
}
}
@@ -901,12 +908,12 @@ test_everything_at_once :: proc(t: ^testing.T) {
@test
test_creation_from_user_string :: proc(t: ^testing.T) {
{
USER_EXPR :: `/^hellope$/gmixun-`
USER_EXPR :: `/^hellope$/mixun-`
STR :: "hellope"
rex, err := regex.create_by_user(USER_EXPR)
defer regex.destroy(rex)
testing.expect_value(t, err, nil)
testing.expect_value(t, rex.flags, regex.Flags{ .Global, .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization })
testing.expect_value(t, rex.flags, regex.Flags{ .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization })
_, ok := regex.match(rex, STR)
testing.expectf(t, ok, "expected user-provided RegEx %v to match %q", rex, STR)
@@ -1102,24 +1109,121 @@ Iterator_Test :: struct {
iterator_vectors := []Iterator_Test{
{
`xxab32ab52xx`, `(ab\d{1})`, {}, // {.Global} implicitly added by the iterator
`xxab32ab52xx`, `(ab\d{1})`, {},
{
{pos = {{2, 5}, {2, 5}}, groups = {"ab3", "ab3"}},
{pos = {{6, 9}, {6, 9}}, groups = {"ab5", "ab5"}},
},
},
{
`xxfoobarxfoobarxx`, `f(o)ob(ar)`, {.Global},
`xxfoobarxfoobarxx`, `f(o)ob(ar)`, {},
{
{pos = {{2, 8}, {3, 4}, {6, 8}}, groups = {"foobar", "o", "ar"}},
{pos = {{9, 15}, {10, 11}, {13, 15}}, groups = {"foobar", "o", "ar"}},
},
},
{
`aaa`, `a`, {},
{
{pos = {{0, 1}}, groups = {"a"}},
{pos = {{1, 2}}, groups = {"a"}},
{pos = {{2, 3}}, groups = {"a"}},
},
},
{
`aaa`, `\w`, {},
{
{pos = {{0, 1}}, groups = {"a"}},
{pos = {{1, 2}}, groups = {"a"}},
{pos = {{2, 3}}, groups = {"a"}},
},
},
{
`aかか`, `.`, {.Unicode},
{
{pos = {{0, 1}}, groups = {"a"}},
{pos = {{1, 4}}, groups = {"か"}},
{pos = {{4, 7}}, groups = {"か"}},
},
},
{
`ゆめ.`, `.`, {.Unicode},
{
{pos = {{0, 3}}, groups = {"ゆ"}},
{pos = {{3, 6}}, groups = {"め"}},
{pos = {{6, 7}}, groups = {"."}},
},
},
// This pattern is not anchored, so this is valid per the new behavior.
{
`ababa`, `(?:a|ab)`, {},
{
{pos = {{0, 1}}, groups = {"a"}},
{pos = {{2, 3}}, groups = {"a"}},
{pos = {{4, 5}}, groups = {"a"}},
},
},
// Ensure alternation follows expected ordering of left-higher priority.
{
`ababa`, `(?:ab|a)`, {},
{
{pos = {{0, 2}}, groups = {"ab"}},
{pos = {{2, 4}}, groups = {"ab"}},
{pos = {{4, 5}}, groups = {"a"}},
},
},
// This one is anchored, so only one match.
//
// This test ensures the behavior of `Assert_Start` is checking against the
// start of the string and we haven't just slid the memory string itself.
{
`ababa`, `^(?:a|b)`, {},
{
{pos = {{0, 1}}, groups = {"a"}},
},
},
{
`ababa`, `a$`, {},
{
{pos = {{4, 5}}, groups = {"a"}},
},
},
// A blank pattern on a blank string is valid and must not loop infinitely.
{
``, ``, {},
{
{pos = {{0, 0}}, groups = {""}},
},
},
// These too are valid.
//
// The iterator must bail out when it encounters any situation which would
// loop infinitely, but only after giving at least one valid match if one
// exists, as this would accord with attempting to singularly match the
// pattern against the string and provide a positive result.
{
`a`, ``, {},
{
{pos = {{0, 0}}, groups = {""}},
},
},
{
``, `$`, {},
{
{pos = {{0, 0}}, groups = {""}},
},
},
{
`aaa`, `$`, {},
{
{pos = {{3, 3}}, groups = {""}},
},
},
}
@test
test_match_iterator :: proc(t: ^testing.T) {
for test in iterator_vectors {
vector: for test in iterator_vectors {
it, err := regex.create_iterator(test.haystack, test.pattern, test.flags)
defer regex.destroy(it)
@@ -1128,10 +1232,61 @@ test_match_iterator :: proc(t: ^testing.T) {
for capture, idx in regex.match(&it) {
if idx >= len(test.expected) {
log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
continue vector
}
if !check_capture(t, capture, test.expected[idx]) {
log.errorf("capture check failed on string %q against pattern %q", test.haystack, test.pattern)
}
}
testing.expectf(t, it.idx == len(test.expected), "expected idx to be %i, got %i on string %q against pattern %q", len(test.expected), it.idx, test.haystack, test.pattern)
}
}
@test
test_iterator_reset :: proc(t: ^testing.T) {
test := Iterator_Test{
`aaa`, `a`, {},
{
{pos = {{0, 1}}, groups = {"a"}},
{pos = {{1, 2}}, groups = {"a"}},
{pos = {{2, 3}}, groups = {"a"}},
},
}
out: {
it, err := regex.create_iterator(`aaa`, `a`)
defer regex.destroy(it)
testing.expect_value(t, err, nil)
(err == nil) or_break out
for capture, idx in regex.match(&it) {
if idx >= len(test.expected) {
log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
break
}
check_capture(t, capture, test.expected[idx])
}
testing.expect_value(t, it.idx, len(test.expected))
// Do it again.
iterations := 0
regex.reset(&it)
// Mind that this loop can do nothing if it wasn't reset but leave us
// with the expected `idx` state.
//
// That's why we count iterations this time around.
for capture, idx in regex.match(&it) {
iterations += 1
if idx >= len(test.expected) {
log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
break
}
check_capture(t, capture, test.expected[idx])
}
testing.expect_value(t, it.idx, len(test.expected))
testing.expect_value(t, iterations, len(test.expected))
}
}
}