From fedb9efb413dda3f2d9d4bbb39050219f8c0f394 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 23 May 2025 20:20:59 -0400 Subject: [PATCH 1/3] Make RegEx VM restartable and fix iterator infinite loop --- core/text/regex/regex.odin | 67 +++++++++++++++++-- .../virtual_machine/virtual_machine.odin | 6 +- .../core/text/regex/test_core_text_regex.odin | 5 +- 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index c805740f7..90aa34946 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -77,6 +77,8 @@ Match_Iterator :: struct { vm: virtual_machine.Machine, idx: int, temp: runtime.Allocator, + threads: int, + done: bool, } /* @@ -101,7 +103,6 @@ create :: proc( permanent_allocator := context.allocator, temporary_allocator := context.temp_allocator, ) -> (result: Regular_Expression, err: Error) { - // For the sake of speed and simplicity, we first run all the intermediate // processes such as parsing and compilation through the temporary // allocator. @@ -294,6 +295,7 @@ create_iterator :: proc( result.temp = temporary_allocator result.vm = virtual_machine.create(result.regex.program, str) result.vm.class_data = result.regex.class_data + result.threads = max(1, virtual_machine.opcode_count(result.vm.code) - 1) return } @@ -457,8 +459,27 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok: assert(len(it.capture.pos) >= common.MAX_CAPTURE_GROUPS, "Pre-allocated RegEx capture `pos` must be at least 10 elements long.") + // Guard against situations in which the iterator should finish. + if it.done { + return + } + runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + if it.idx > 0 { + // Reset the state needed to `virtual_machine.run` again. + it.vm.top_thread = 0 + it.vm.current_rune = rune(0) + it.vm.current_rune_size = 0 + for i in 0.. (result: Capture, index: int, ok: } } + if !ok { + // Match failed, bail out. + return + } + + if it.vm.string_pointer == sp_before { + // The string pointer did not move, but there was a match. + // + // At this point, the pattern supplied to the iterator will infinitely + // loop if we do not intervene. + it.done = true + } + if it.vm.string_pointer == len(it.vm.memory) { + // The VM hit the end of the string. + // + // We do not check at the start, because a match of pattern `$` + // against string "" is valid and must return a match. + // + // This check prevents a double-match of `$` against a non-empty string. + it.done = true + } + str := string(it.vm.memory) num_groups: int @@ -488,9 +531,7 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok: num_groups = n } - defer if ok { - it.idx += 1 - } + defer it.idx += 1 if num_groups > 0 { result = {it.capture.pos[:num_groups], it.capture.groups[:num_groups]} @@ -504,8 +545,24 @@ match :: proc { match_iterator, } +/* +Reset an iterator, allowing it to be run again as if new. + +Inputs: +- it: The iterator to reset. +*/ reset :: proc(it: ^Match_Iterator) { - it.idx = 0 + it.done = false + it.idx = 0 + it.vm.string_pointer = 0 + + it.vm.top_thread = 0 + it.vm.current_rune = rune(0) + it.vm.current_rune_size = 0 + for i in 0.. (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check { when UNICODE_MODE { - vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory) + vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer:]) } else { if len(vm.memory) > 0 { - vm.next_rune = cast(rune)vm.memory[0] + vm.next_rune = cast(rune)vm.memory[vm.string_pointer] vm.next_rune_size = 1 } } @@ -652,4 +652,4 @@ destroy :: proc(vm: Machine, allocator := context.allocator) { delete(vm.busy_map) free(vm.threads) free(vm.next_threads) -} \ No newline at end of file +} diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index 913e716e5..8b4e3f997 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -1119,7 +1119,7 @@ iterator_vectors := []Iterator_Test{ @test test_match_iterator :: proc(t: ^testing.T) { - for test in iterator_vectors { + vector: for test in iterator_vectors { it, err := regex.create_iterator(test.haystack, test.pattern, test.flags) defer regex.destroy(it) @@ -1128,7 +1128,8 @@ test_match_iterator :: proc(t: ^testing.T) { for capture, idx in regex.match(&it) { if idx >= len(test.expected) { - break + log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture) + continue vector } check_capture(t, capture, test.expected[idx]) } From 37d6491300269502341d7f5ea455ae089a36ce06 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sat, 24 May 2025 06:02:50 -0400 Subject: [PATCH 2/3] Remove `Global` RegEx flag, default to unanchored patterns --- core/text/regex/common/common.odin | 3 --- core/text/regex/compiler/compiler.odin | 14 +++++++---- core/text/regex/regex.odin | 4 ---- .../benchmark/text/regex/benchmark_regex.odin | 10 ++++---- .../core/text/regex/test_core_text_regex.odin | 24 +++++++++---------- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/core/text/regex/common/common.odin b/core/text/regex/common/common.odin index 4a303e0a3..e60bef58f 100644 --- a/core/text/regex/common/common.odin +++ b/core/text/regex/common/common.odin @@ -15,8 +15,6 @@ MAX_PROGRAM_SIZE :: int(max(i16)) MAX_CLASSES :: int(max(u8)) Flag :: enum u8 { - // Global: try to match the pattern anywhere in the string. - Global, // Multiline: treat `^` and `$` as if they also match newlines. Multiline, // Case Insensitive: treat `a-z` as if it was also `A-Z`. @@ -36,7 +34,6 @@ Flags :: bit_set[Flag; u8] @(rodata) Flag_To_Letter := #sparse[Flag]u8 { - .Global = 'g', .Multiline = 'm', .Case_Insensitive = 'i', .Ignore_Whitespace = 'x', diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin index b3ded0104..07ace7b5d 100644 --- a/core/text/regex/compiler/compiler.odin +++ b/core/text/regex/compiler/compiler.odin @@ -401,7 +401,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: pc_open := 0 - add_global: if .Global in flags { + optimize_opening: { // Check if the opening to the pattern is predictable. // If so, use one of the optimized Wait opcodes. iter := virtual_machine.Opcode_Iterator{ code[:], 0 } @@ -412,7 +412,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: pc_open += size_of(Opcode) inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open])) pc_open += size_of(u8) - break add_global + break optimize_opening case .Rune: operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1]) @@ -420,24 +420,28 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: pc_open += size_of(Opcode) inject_raw(&code, pc_open, operand) pc_open += size_of(rune) - break add_global + break optimize_opening case .Rune_Class: inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class) pc_open += size_of(Opcode) inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open])) pc_open += size_of(u8) - break add_global + break optimize_opening case .Rune_Class_Negated: inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated) pc_open += size_of(Opcode) inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open])) pc_open += size_of(u8) - break add_global + break optimize_opening case .Save: continue + + case .Assert_Start: + break optimize_opening + case: break seek_loop } diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 90aa34946..94a4b163a 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -167,7 +167,6 @@ to escape the delimiter if found in the middle of the string. All runes after the closing delimiter will be parsed as flags: -- 'g': Global - 'm': Multiline - 'i': Case_Insensitive - 'x': Ignore_Whitespace @@ -244,7 +243,6 @@ create_by_user :: proc( // to `end` here. for r in pattern[start + end:] { switch r { - case 'g': flags += { .Global } case 'm': flags += { .Multiline } case 'i': flags += { .Case_Insensitive } case 'x': flags += { .Ignore_Whitespace } @@ -283,8 +281,6 @@ create_iterator :: proc( permanent_allocator := context.allocator, temporary_allocator := context.temp_allocator, ) -> (result: Match_Iterator, err: Error) { - flags := flags - flags += {.Global} // We're iterating over a string, so the next match could start anywhere if .Multiline in flags { return {}, .Unsupported_Flag diff --git a/tests/benchmark/text/regex/benchmark_regex.odin b/tests/benchmark/text/regex/benchmark_regex.odin index 8d29888a3..73d19ad0f 100644 --- a/tests/benchmark/text/regex/benchmark_regex.odin +++ b/tests/benchmark/text/regex/benchmark_regex.odin @@ -103,9 +103,11 @@ expensive_for_backtrackers :: proc(t: ^testing.T) { @test global_capture_end_word :: proc(t: ^testing.T) { + // NOTE: The previous behavior of `.Global`, which was to automatically + // insert `.*?` at the start of the pattern, is now default. EXPR :: `Hellope World!` - rex, err := regex.create(EXPR, { .Global }) + rex, err := regex.create(EXPR, { /*.Global*/ }) if !testing.expect_value(t, err, nil) { return } @@ -145,7 +147,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) { EXPR :: `こにちは` needle := string(EXPR) - rex, err := regex.create(EXPR, { .Global, .Unicode }) + rex, err := regex.create(EXPR, { /*.Global,*/ .Unicode }) if !testing.expect_value(t, err, nil) { return } @@ -185,7 +187,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) { alternations :: proc(t: ^testing.T) { EXPR :: `a(?:bb|cc|dd|ee|ff)` - rex, err := regex.create(EXPR, { .No_Capture, .Global }) + rex, err := regex.create(EXPR, { .No_Capture, /*.Global*/ }) if !testing.expect_value(t, err, nil) { return } @@ -219,7 +221,7 @@ classes :: proc(t: ^testing.T) { EXPR :: `[\w\d]+` NEEDLE :: "0123456789abcdef" - rex, err := regex.create(EXPR, { .Global }) + rex, err := regex.create(EXPR, { /*.Global*/ }) if !testing.expect_value(t, err, nil) { return } diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index 8b4e3f997..696a2dc48 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -51,13 +51,13 @@ check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex } check_expression :: proc(t: ^testing.T, pattern, haystack: string, needles: ..string, extra_flags := regex.Flags{}, loc := #caller_location) { - check_expression_with_flags(t, pattern, { .Global } + extra_flags, + check_expression_with_flags(t, pattern, extra_flags, haystack, ..needles, loc = loc) - check_expression_with_flags(t, pattern, { .Global, .No_Optimization } + extra_flags, + check_expression_with_flags(t, pattern, { .No_Optimization } + extra_flags, haystack, ..needles, loc = loc) - check_expression_with_flags(t, pattern, { .Global, .Unicode } + extra_flags, + check_expression_with_flags(t, pattern, { .Unicode } + extra_flags, haystack, ..needles, loc = loc) - check_expression_with_flags(t, pattern, { .Global, .Unicode, .No_Optimization } + extra_flags, + check_expression_with_flags(t, pattern, { .Unicode, .No_Optimization } + extra_flags, haystack, ..needles, loc = loc) } @@ -516,7 +516,7 @@ test_pos_index_explicitly :: proc(t: ^testing.T) { STR :: "This is an island." EXPR :: `\bis\b` - rex, err := regex.create(EXPR, { .Global }) + rex, err := regex.create(EXPR) if !testing.expect_value(t, err, nil) { return } @@ -642,9 +642,9 @@ test_unicode_explicitly :: proc(t: ^testing.T) { } { EXPR :: "こにちは!" - check_expression_with_flags(t, EXPR, { .Global, .Unicode }, + check_expression_with_flags(t, EXPR, { .Unicode }, "Hello こにちは!", "こにちは!") - check_expression_with_flags(t, EXPR, { .Global, .Unicode, .No_Optimization }, + check_expression_with_flags(t, EXPR, { .Unicode, .No_Optimization }, "Hello こにちは!", "こにちは!") } } @@ -901,12 +901,12 @@ test_everything_at_once :: proc(t: ^testing.T) { @test test_creation_from_user_string :: proc(t: ^testing.T) { { - USER_EXPR :: `/^hellope$/gmixun-` + USER_EXPR :: `/^hellope$/mixun-` STR :: "hellope" rex, err := regex.create_by_user(USER_EXPR) defer regex.destroy(rex) testing.expect_value(t, err, nil) - testing.expect_value(t, rex.flags, regex.Flags{ .Global, .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization }) + testing.expect_value(t, rex.flags, regex.Flags{ .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization }) _, ok := regex.match(rex, STR) testing.expectf(t, ok, "expected user-provided RegEx %v to match %q", rex, STR) @@ -1102,14 +1102,14 @@ Iterator_Test :: struct { iterator_vectors := []Iterator_Test{ { - `xxab32ab52xx`, `(ab\d{1})`, {}, // {.Global} implicitly added by the iterator + `xxab32ab52xx`, `(ab\d{1})`, {}, { {pos = {{2, 5}, {2, 5}}, groups = {"ab3", "ab3"}}, {pos = {{6, 9}, {6, 9}}, groups = {"ab5", "ab5"}}, }, }, { - `xxfoobarxfoobarxx`, `f(o)ob(ar)`, {.Global}, + `xxfoobarxfoobarxx`, `f(o)ob(ar)`, {}, { {pos = {{2, 8}, {3, 4}, {6, 8}}, groups = {"foobar", "o", "ar"}}, {pos = {{9, 15}, {10, 11}, {13, 15}}, groups = {"foobar", "o", "ar"}}, @@ -1135,4 +1135,4 @@ test_match_iterator :: proc(t: ^testing.T) { } testing.expect_value(t, it.idx, len(test.expected)) } -} \ No newline at end of file +} From 5d01acc04f5f90925c94a64dca0508d104b6241d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sat, 24 May 2025 06:33:16 -0400 Subject: [PATCH 3/3] Add more RegEx tests --- .../core/text/regex/test_core_text_regex.odin | 166 +++++++++++++++++- 1 file changed, 160 insertions(+), 6 deletions(-) diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index 696a2dc48..aed3091e1 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -72,17 +72,18 @@ expect_error :: proc(t: ^testing.T, pattern: string, expected_error: typeid, fla testing.expect_value(t, variant_ti, expected_ti, loc = loc) } -check_capture :: proc(t: ^testing.T, got, expected: regex.Capture, loc := #caller_location) { - testing.expect_value(t, len(got.pos), len(got.groups), loc = loc) - testing.expect_value(t, len(got.pos), len(expected.pos), loc = loc) - testing.expect_value(t, len(got.groups), len(expected.groups), loc = loc) +check_capture :: proc(t: ^testing.T, got, expected: regex.Capture, loc := #caller_location) -> (ok: bool) { + testing.expect_value(t, len(got.pos), len(got.groups), loc = loc) or_return + testing.expect_value(t, len(got.pos), len(expected.pos), loc = loc) or_return + testing.expect_value(t, len(got.groups), len(expected.groups), loc = loc) or_return if len(got.pos) == len(expected.pos) { for i in 0..= len(test.expected) { + log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture) + break + } check_capture(t, capture, test.expected[idx]) } testing.expect_value(t, it.idx, len(test.expected)) + + // Do it again. + iterations := 0 + regex.reset(&it) + + // Mind that this loop can do nothing if it wasn't reset but leave us + // with the expected `idx` state. + // + // That's why we count iterations this time around. + for capture, idx in regex.match(&it) { + iterations += 1 + if idx >= len(test.expected) { + log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture) + break + } + check_capture(t, capture, test.expected[idx]) + } + testing.expect_value(t, it.idx, len(test.expected)) + testing.expect_value(t, iterations, len(test.expected)) } }