# Regex regression tests. Each [[test]] table pairs a pattern and a haystack
# with the match spans that are expected to be reported.

# Malformed patterns (a repetition operator or '?' with nothing valid to apply
# to). Each is marked 'compiles = false'; going by the names, the point is that
# rejecting them must not crash.
[[test]]
name = "invalid-regex-no-crash-100"
regex = '(*)'
haystack = ""
matches = []
compiles = false
[[test]]
name = "invalid-regex-no-crash-200"
regex = '(?:?)'
haystack = ""
matches = []
compiles = false
[[test]]
name = "invalid-regex-no-crash-300"
regex = '(?)'
haystack = ""
matches = []
compiles = false
[[test]]
name = "invalid-regex-no-crash-400"
regex = '*'
haystack = ""
matches = []
compiles = false
# Case-insensitive, non-Unicode class holding one letter and '_'. The letter
# must match in its other case and '_' must still match, in both directions
# (lowercase in the class vs. uppercase in the class).
[[test]]
name = "unsorted-binary-search-100"
regex = '(?i-u)[a_]+'
haystack = "A_"
matches = [[0, 2]]
[[test]]
name = "unsorted-binary-search-200"
regex = '(?i-u)[A_]+'
haystack = "a_"
matches = [[0, 2]]
# Under (?i), \p{Ll} is expected to match the uppercase Greek letters too.
# The haystack is five two-byte characters, so the span [0, 10] shows that
# spans are byte offsets.
[[test]]
name = "unicode-case-lower-nocase-flag"
regex = '(?i)\p{Ll}+'
haystack = "ΛΘΓΔα"
matches = [[0, 10]]
# A negated class under (?i) must exclude both cases of the letter: neither
# "x" nor "X" may match.
[[test]]
name = "negated-char-class-100"
regex = '(?i)[^x]'
haystack = "x"
matches = []
[[test]]
name = "negated-char-class-200"
regex = '(?i)[^x]'
haystack = "X"
matches = []
# '_' is expected to be a member of the [[:word:]] class.
[[test]]
name = "ascii-word-underscore"
regex = '[[:word:]]'
haystack = "_"
matches = [[0, 1]]
# One match with capture groups: [overall, group 1, group 2]. The repeated
# group 1 reports its last iteration ([1, 2]); the named group 'foo' is
# group 2.
[[test]]
name = "captures-repeat"
regex = '([a-f]){2}(?P<foo>[x-z])'
haystack = "abx"
matches = [
  [[0, 3], [1, 2], [2, 3]],
]
# 'ab?' matches "a" at offset 0; the '$' branch then yields an empty match at
# the end of the haystack.
[[test]]
name = "alt-in-alt-100"
regex = 'ab?|$'
haystack = "az"
matches = [[0, 1], [2, 2]]
# The lazy '.*?' stops at the first line terminator, and '\r\n?' consumes the
# lone '\r', giving [0, 3].
[[test]]
name = "alt-in-alt-200"
regex = '^(?:.*?)(?:\n|\r\n?|$)'
haystack = "ab\rcd"
matches = [[0, 3]]
# 'z*' matches zero times at the start; the whole haystack is the match.
[[test]]
name = "leftmost-first-prefix"
regex = 'z*azb'
haystack = "azb"
matches = [[0, 3]]
# Only the last of the eleven alternates, 'int', occurs in the haystack.
[[test]]
name = "many-alternates"
regex = '1|2|3|4|5|6|7|8|9|10|int'
haystack = "int"
matches = [[0, 3]]
# A bare '\b' is expected to produce an empty match at each edge of every run
# of word characters, and nowhere else (e.g. not between '(' and the space
# before it).
[[test]]
name = "word-boundary-alone-100"
regex = '\b'
haystack = "Should this (work?)"
matches = [[0, 0], [6, 6], [7, 7], [11, 11], [13, 13], [17, 17]]
# With single-letter words every offset is a word boundary.
[[test]]
name = "word-boundary-alone-200"
regex = '\b'
haystack = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
# Both tests set 'unicode = false' and 'utf8 = false'. The haystack is a single
# four-byte character, and '\B' is expected at every byte offset from 0
# through 4, including the offsets inside the character's encoding.
[[test]]
name = "word-boundary-ascii-no-capture"
regex = '\B'
haystack = "\U00028F3E"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
unicode = false
utf8 = false
# Same expectation with '\B' wrapped in a non-capturing group.
[[test]]
name = "word-boundary-ascii-capture"
regex = '(?:\B)'
haystack = "\U00028F3E"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
unicode = false
utf8 = false
# '^' binds only to the 'a' branch: the unanchored 'b' branch matches at
# offset 0, and the 'a' at offset 1 is not at the start, so it is not reported.
[[test]]
name = "partial-anchor"
regex = '^a|b'
haystack = "ba"
matches = [[0, 1]]
# Each haystack below is a single four-byte character. In all three tests the
# only expected match is the empty match at the end of the haystack (offset 4),
# produced by the end-of-haystack/end-of-line branch.
[[test]]
name = "endl-or-word-boundary"
regex = '(?m:$)|(?-u:\b)'
haystack = "\U0006084E"
matches = [[4, 4]]
[[test]]
name = "zero-or-end"
regex = '(?i-u:\x00)|$'
haystack = "\U000E682F"
matches = [[4, 4]]
[[test]]
name = "y-or-endl"
regex = '(?i-u:y)|(?m:$)'
haystack = "\U000B4331"
matches = [[4, 4]]
# A word boundary (Unicode in the first test, ASCII in the second) followed by
# '^' is satisfied at offset 0, directly before the word character 'X'.
[[test]]
name = "word-boundary-start-x"
regex = '(?u:\b)^(?-u:X)'
haystack = "X"
matches = [[0, 1]]
[[test]]
name = "word-boundary-ascii-start-x"
regex = '(?-u:\b)^(?-u:X)'
haystack = "X"
matches = [[0, 1]]
# The haystack is two four-byte characters (8 bytes). '$\B' is expected to
# match only at the very end, offset 8.
[[test]]
name = "end-not-word-boundary"
regex = '$\B'
haystack = "\U0005C124\U000B576C"
matches = [[8, 8]]
unicode = false
utf8 = false
# 'a' is present but not at the anchored position (start in the first test,
# end in the second), and the unanchored 'z' branch never occurs: no match.
[[test]]
name = "partial-anchor-alternate-begin"
regex = '^a|z'
haystack = "yyyyya"
matches = []
[[test]]
name = "partial-anchor-alternate-end"
regex = 'a$|z'
haystack = "ayyyyy"
matches = []
# Of the three literal alternates only 'CDA' fits before 'X'.
[[test]]
name = "lits-unambiguous-100"
regex = '(?:ABC|CDA|BC)X'
haystack = "CDAX"
matches = [[0, 4]]

# Capture groups that do not participate are written as []. Here group 2
# (the '..._' prefix alternates) is unset, while groups 1 and 3 both cover
# "CIMG" and the named group 'n' covers the digits.
[[test]]
name = "lits-unambiguous-200"
regex = '((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$'
haystack = "CIMG2341"
matches = [
  [[0, 8], [0, 4], [], [0, 4], [4, 8]],
]
#
# 2022-09-19: This has now been "properly" fixed in that empty character
# classes are fully supported as something that can never match. This test
# used to be marked as 'compiles = false', but now it works.
[[test]]
name = "negated-full-byte-range"
regex = '[^\x00-\xFF]'
haystack = ""
matches = []
# Stated explicitly: the class excludes every byte value, yet the pattern is
# expected to compile and simply never match.
compiles = true
unicode = false
utf8 = false
# An anchor under a counted repetition, placed next to a literal 'a'. Both
# patterns are expected to compile (no 'compiles = false') and to find nothing
# in the empty haystack.
[[test]]
name = "strange-anchor-non-complete-prefix"
regex = 'a^{2}'
haystack = ""
matches = []
[[test]]
name = "strange-anchor-non-complete-suffix"
regex = '${2}a'
haystack = ""
matches = []
# The optional group cannot complete: after the b's, neither 'X' nor the end
# of the haystack follows. Only 'a' matches and both groups stay unset.
[[test]]
name = "captures-after-dfa-premature-end-100"
regex = 'a(b*(X|$))?'
haystack = "abcbX"
matches = [
  [[0, 1], [], []],
]

# Same outcome when the group is 'bc*' followed by 'X' or the end.
[[test]]
name = "captures-after-dfa-premature-end-200"
regex = 'a(bc*(X|$))?'
haystack = "abcbX"
matches = [
  [[0, 1], [], []],
]

# "aa" is never followed by the end of the haystack, so the optional group
# never participates and an empty match is expected at every offset.
[[test]]
name = "captures-after-dfa-premature-end-300"
regex = '(aa$)?'
haystack = "aaz"
matches = [
  [[0, 0], []],
  [[1, 1], []],
  [[2, 2], []],
  [[3, 3], []],
]
# Plucked from "Why aren’t regular expressions a lingua franca? an empirical
# study on the re-use and portability of regular expressions", The ACM Joint
# European Software Engineering Conference and Symposium on the Foundations of
# Software Engineering (ESEC/FSE), 2019.
#
[[test]]
name = "captures-after-dfa-premature-end-400"
regex = '(a)\d*\.?\d+\b'
haystack = "a0.0c"
# '\d+' takes the single '0' and '\b' holds before '.', so the overall match
# is "a0" ([0, 2]) with group 1 on 'a'.
matches = [
  [[0, 2], [0, 1]],
]
# No part of the pattern occurs in the haystack. Going by the name, the search
# must simply report no match rather than panic.
[[test]]
name = "literal-panic"
regex = 'typename type\-parameter\-[0-9]+\-[0-9]+::.+'
haystack = "test"
matches = []
# A flag-only expression nested inside groups matches the empty string.
[[test]]
name = "empty-flag-expr"
regex = '(?:(?:(?x)))'
haystack = ""
matches = [[0, 0]]
#[[tests]]
#name = "blank-matches-nothing-between-space-and-tab"
#regex = '[[:blank:]]'
#input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
#match = false
#unescape = true
#[[tests]]
#name = "blank-matches-nothing-between-space-and-tab-inverted"
#regex = '^[[:^blank:]]+$'
#input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
#match = true
#unescape = true
# A counted repetition applied directly to a bare flag group; marked as a
# pattern that must fail to compile.
[[test]]
name = "invalid-repetition"
regex = '(?m){1,1}'
haystack = ""
matches = []
compiles = false
# (?i) is scoped to its group: "foo" and "Foo" match, but the 'Bar' branch
# stays case sensitive, so "bar" at [8, 11] is skipped.
[[test]]
name = "flags-are-unset"
regex = '(?:(?i)foo)|Bar'
haystack = "foo Foo bar Bar"
matches = [[0, 3], [4, 7], [12, 15]]
# Note that 'Ј' is not 'j', but cyrillic Je
#
[[test]]
name = "empty-group-with-unicode"
regex = '(?:)Ј01'
haystack = 'zЈ01'
# 'z' is one byte and 'Ј' is two, so the match covers bytes [1, 5].
matches = [[1, 5]]
# '..' may consume non-word characters as long as both ends of the match sit
# on a word boundary, e.g. ", " at [9, 11] and " 2" at [17, 19].
[[test]]
name = "word-boundary-weird"
regex = '\b..\b'
haystack = "I have 12, he has 2!"
matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
# Same pattern, haystack and expectation with 'unicode = false' and
# 'utf8 = false'.
[[test]]
name = "word-boundary-weird-ascii"
regex = '\b..\b'
haystack = "I have 12, he has 2!"
matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
unicode = false
utf8 = false
# Reduced haystack: "az" and ",," both sit between word boundaries.
[[test]]
name = "word-boundary-weird-minimal-ascii"
regex = '\b..\b'
haystack = "az,,b"
matches = [[0, 2], [2, 4]]
unicode = false
utf8 = false
# Only the window "230000" at [4, 10] fits: three characters from the class
# followed by the literal "000".
[[test]]
name = "reverse-suffix-100"
regex = '[0-4][0-4][0-4]000'
haystack = "153.230000"
matches = [[4, 10]]
# Same window with a wider digit class and a trailing newline in the haystack.
[[test]]
name = "reverse-suffix-200"
regex = '[0-9][0-9][0-9]000'
haystack = "153.230000\n"
matches = [[4, 10]]
# This is a tricky case for the reverse suffix optimization, because it
# finds the 'foobar' match but the reverse scan must fail to find a match by
# correctly dealing with the word boundary following the 'foobar' literal when
# computing the start state.
#
# This test exists because I tried to break the following assumption that
# is currently in the code: that if a suffix is found and the reverse scan
# succeeds, then it's guaranteed that there is an overall match. Namely, the
# 'is_match' routine does *not* do another forward scan in this case because of
# this assumption.
[[test]]
name = "reverse-suffix-300"
regex = '\w+foobar\b'
haystack = "xyzfoobarZ"
# 'Z' directly follows "foobar", so the trailing '\b' fails: no match.
matches = []
unicode = false
utf8 = false
# The haystack is written with an '\xE4' escape and 'unescape = true'
# (presumably the harness decodes it to the single byte 0xE4 — confirm against
# the harness). Whatever follows 's' is neither 'a' nor 'b', so no match is
# expected, with either a default or an explicitly ASCII word boundary.
[[test]]
name = "stops"
regex = '\bs(?:[ab])'
haystack = 's\xE4'
matches = []
unescape = true
utf8 = false
[[test]]
name = "stops-ascii"
regex = '(?-u:\b)s(?:[ab])'
haystack = 's\xE4'
matches = []
unescape = true
utf8 = false
# Multi-line mode: each line is matched on its own, and the newline at
# offset 5 belongs to neither span.
[[test]]
name = "adjacent-line-boundary-100"
regex = '(?m)^(?:[^ ]+?)$'
haystack = "line1\nline2"
matches = [[0, 5], [6, 11]]
# Continued: a second, smaller case of the adjacent-line-boundary test above.
[[test]]
name = "adjacent-line-boundary-200"
regex = '(?m)^(?:[^ ]+?)$'
haystack = "A\nB"
# Single-character lines: one match per line, excluding the newline.
matches = [[0, 1], [2, 3]]
# There is no issue for this bug.
[[test]]
name = "anchored-prefix-100"
regex = '^a[[:^space:]]'
haystack = "a "
# The character after 'a' is a space, which [[:^space:]] rejects.
matches = []
# There is no issue for this bug.
[[test]]
name = "anchored-prefix-200"
regex = '^a[[:^space:]]'
haystack = "foo boo a"
# The only 'a' is at the end of the haystack, not at the start where '^'
# requires it.
matches = []
# There is no issue for this bug.
[[test]]
name = "anchored-prefix-300"
regex = '^-[a-z]'
haystack = "r-f"
# '-' is not the first character, so the '^'-anchored pattern cannot match.
matches = []
# Tests that a possible Aho-Corasick optimization works correctly. It only
# kicks in when we have a lot of literals. By "works correctly," we mean that
# leftmost-first match semantics are properly respected. That is, samwise
# should match, not sam.
#
# There is no issue for this bug.
[[test]]
name = "aho-corasick-100"
# 54 literal alternates: 'samwise', 'sam', then every ASCII letter in both
# cases. 'samwise' is listed first and must win over 'sam' and 's'.
regex = 'samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z'
haystack = "samwise"
matches = [[0, 7]]
# 'a$' needs the haystack to end right after 'a', yet 'b' must follow it, so
# the pattern can never match.
[[test]]
name = "interior-anchor-capture"
regex = '(a$)b$'
haystack = 'ab'
matches = []
# I found this bug in the course of adding some of the regexes that Ruff uses
# to rebar. It turns out that the lazy DFA was finding a match that was being
# rejected by the one-pass DFA. Yikes. I then minimized the regex and haystack.
#
[[test]]
name = "ruff-whitespace-around-keywords"
regex = '^(a|ab)$'
haystack = "ab"
anchored = true
unicode = false
utf8 = true
# Both alternates start with 'a', but only 'ab' satisfies '$', so group 1
# equals the whole match.
matches = [[[0, 2], [0, 2]]]
# '\b' is the first alternate and holds at both ends of "h", so two empty
# matches are expected instead of one match that consumes 'h'.
[[test]]
name = "i429-0"
regex = '(?:(?-u:\b)|(?u:h))+'
haystack = "h"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]
# Unicode '\B' is expected to match nowhere in or around this single
# character.
[[test]]
name = "i429-1"
regex = '(?u:\B)'
haystack = "鋸"
unicode = true
utf8 = false
matches = []
# At offset 0 the '\b' alternate wins with an empty match; at offset 1 there
# is no boundary, so '.' consumes 'B'.
[[test]]
name = "i429-2"
regex = '(?:(?u:\b)|(?s-u:.))+'
haystack = "oB"
unicode = true
utf8 = false
matches = [[0, 0], [1, 2]]
# With 'utf8 = false', an empty match is expected at every byte offset of the
# four-byte character, 0 through 4.
[[test]]
name = "i429-3"
regex = '(?:(?-u:\B)|(?su:.))+'
haystack = "\U000FEF80"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
# Same pattern and haystack as the previous test but with 'utf8 = true': only
# offsets 0 and 4 (the character's edges) are expected, not the offsets inside
# its encoding.
[[test]]
name = "i429-3-utf8"
regex = '(?:(?-u:\B)|(?su:.))+'
haystack = "\U000FEF80"
unicode = true
utf8 = true
matches = [[0, 0], [4, 4]]
# Both line anchors hold at offset 0 (start of the haystack, directly before
# '\n'), and (?s) lets '.' consume the newline.
[[test]]
name = "i429-4"
regex = '(?m:$)(?m:^)(?su:.)'
haystack = "\n‣"
unicode = true
utf8 = false
matches = [[0, 1]]

# The middle '^' is not multi-line, which pins the empty match to offset 0.
[[test]]
name = "i429-5"
regex = '(?m:$)^(?m:^)'
haystack = "\n"
unicode = true
utf8 = false
matches = [[0, 0]]

# Only the second "do" is followed by end-of-line, so the named group
# participates in the last match only; the first two matches are empty.
[[test]]
name = "i429-6"
regex = '(?P<kp>(?iu:do)(?m:$))*'
haystack = "dodo"
unicode = true
utf8 = false
matches = [
  [[0, 0], []],
  [[1, 1], []],
  [[2, 4], [2, 4]],
]

# Unicode '\B' is expected to match nowhere in or around this single
# character.
[[test]]
name = "i429-7"
regex = '(?u:\B)'
haystack = "䡁"
unicode = true
utf8 = false
matches = []
# '\b' is the first alternate and holds at both ends of "0", so two empty
# matches are expected instead of one match that consumes the digit.
[[test]]
name = "i429-8"
regex = '(?:(?-u:\b)|(?u:[\u{0}-W]))+'
haystack = "0"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]

# The final '$' is not multi-line and so only holds at the end of the
# haystack; that rules out a match starting at offset 0 and leaves [1, 2].
[[test]]
name = "i429-9"
regex = '((?m:$)(?-u:\B)(?s-u:.)(?-u:\B)$)'
haystack = "\n\n"
unicode = true
utf8 = false
matches = [
  [[1, 2], [1, 2]],
]

# All three anchors hold at offset 0, and (?s) lets '.' consume the newline.
[[test]]
name = "i429-10"
regex = '(?m:$)(?m:$)^(?su:.)'
haystack = "\n\u0081¨\u200a"
unicode = true
utf8 = false
matches = [[0, 1]]

# Offset 0 is a line start but also a word boundary (before '0'), so '\B'
# fails there; the line start after '\n' at offset 2 is the only match.
[[test]]
name = "i429-11"
regex = '(?-u:\B)(?m:^)'
haystack = "0\n"
unicode = true
utf8 = false
matches = [[2, 2]]

# As in i429-8: the '\b' alternate yields empty matches at both ends of "0".
[[test]]
name = "i429-12"
regex = '(?:(?u:\b)|(?-u:.))+'
haystack = "0"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]
# NOTE(review): 'bounds = [4, 6]' presumably restricts the search to the
# "cd" slice of the haystack — confirm against the harness. The expected span
# [4, 6] is expressed in offsets of the full haystack.
[[test]]
name = "i969"
regex = 'c.*d\z'
haystack = "ababcd"
bounds = [4, 6]
search-kind = "earliest"
matches = [[4, 6]]
# I found this during the regex-automata migration. This is the fowler basic
# 154 test, but without anchored = true and without a match limit.
#
# This test caught a subtle bug in the hybrid reverse DFA search, where it
# would skip over the termination condition if it entered a start state. This
# was a double bug. Firstly, the reverse DFA shouldn't have had start states
# specialized in the first place, and thus it shouldn't have been possible to detect
# that the DFA had entered a start state. The second bug was that the start
# state handling was incorrect by jumping over the termination condition.
[[test]]
name = "fowler-basic154-unanchored"
regex = '''a([bc]*)c*'''
haystack = '''abc'''
# Group 1 greedily takes "bc" ([1, 3]), leaving the trailing 'c*' empty.
matches = [[[0, 3], [1, 3]]]
#
# This was never really a problem in the new architecture because the
# regex-automata engines are far more principled about how they deal with
# look-around. (This was one of the many reasons I wanted to re-work the
# original regex crate engines.)
[[test]]
name = "word-boundary-interact-poorly-with-literal-optimizations"
regex = '(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))'
haystack = 'ubi-Darwin-x86_64.tar.gz'
# The only "win" in the haystack is inside "Darwin", preceded by the word
# character 'r', so neither '\b' nor '_' can come before it: no match.
matches = []
# This was found during fuzz testing of regex. It provoked a panic in the meta
# engine as a result of the reverse suffix optimization. Namely, it hit a case
# where a suffix match was found, a corresponding reverse match was found, but
# the forward search turned up no match. The forward search should always match
# if the suffix and reverse search match.
#
# This in turn uncovered an inconsistency between the PikeVM and the DFA (lazy
# and fully compiled) engines. It was caused by a mishandling of the collection
# of NFA state IDs in the generic determinization code (which is why both types
# of DFA were impacted). Namely, when a fail state was encountered (that's the
# `[^\s\S]` in the pattern below), then it would just stop collecting states.
# But that's not correct since a later state could lead to a match.
[[test]]
name = "impossible-branch"
regex = '.*[^\s\S]A|B'
haystack = "B"
# The first branch can never match, but the second branch 'B' must still be
# found.
matches = [[0, 1]]
# This was found during fuzz testing in regex-lite. The regex crate never
# suffered from this bug, but it causes regex-lite to incorrectly compile
# captures.
[[test]]
name = "captures-wrong-order"
regex = '(a){0}(a)'
haystack = 'a'
# Group 1 is repeated zero times and stays unset ([]); group 2 captures 'a'.
matches = [[[0, 1], [], [0, 1]]]
# This tests a bug in how quit states are handled in the DFA. At some point
# during development, the DFAs were tweaked slightly such that if they hit
# a quit state (which means, they hit a byte that the caller configured should
# stop the search), then it might not return an error necessarily. Namely, if a
# match had already been found, then it would be returned instead of an error.
#
# But this is actually wrong! Why? Because even though a match had been found,
# it wouldn't be fully correct to return it once a quit state has been seen
# because you can't determine whether the match offset returned is the correct
# greedy/leftmost-first match. Since you can't complete the search as requested
# by the caller, the DFA should just stop and return an error.
#
# Interestingly, this does seem to produce an unavoidable difference between
# 'try_is_match().unwrap()' and 'try_find().unwrap().is_some()' for the DFAs.
# The former will stop immediately once a match is known to occur and return
# 'Ok(true)', whereas the latter could find the match but quit with an
# 'Err(..)' first.
#
# Thankfully, I believe this inconsistency between 'is_match()' and 'find()'
# cannot be observed in the higher level meta regex API because it specifically
# will try another engine that won't fail in the case of a DFA failing.
#
# This regression happened in the regex crate rewrite, but before anything got
# released.
[[test]]
name = "negated-unicode-word-boundary-dfa-fail"
regex = '\B.*'
haystack = "!\u02D7"
# The whole haystack matches: '!' is one byte and U+02D7 is two, hence [0, 3].
matches = [[0, 3]]
# This failure was found in the *old* regex crate (prior to regex 1.9), but
# I didn't investigate why. My best guess is that it's a literal optimization
# bug. It didn't occur in the rewrite.
[[test]]
name = "missed-match"
regex = 'e..+e.ee>'
haystack = 'Zeee.eZZZZZZZZeee>eeeeeee>'
# The match starts at the first 'e' (offset 1) and, because '.+' is greedy,
# runs to the final '>' at the end of the 26-byte haystack.
matches = [[1, 26]]
# This test came from the 'ignore' crate and tripped a bug in how accelerated
# DFA states were handled in an overlapping search.
[[test]]
name = "regex-to-glob"
# 'regex' is written as a one-element array here instead of a plain string.
regex = ['(?-u)^path1/[^/]*$']
haystack = "path1/foo"
matches = [[0, 9]]
utf8 = false
# NOTE(review): 'match-kind' and 'search-kind' are presumably what select the
# overlapping search described in the comment above — confirm in the harness.
match-kind = "all"
search-kind = "overlapping"
# The optional leading group participates: groups 1-3 are "102", "12" and
# "39", and the overall match is the whole haystack.
[[test]]
name = "reverse-inner-plus-shorter-than-expected"
regex = '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})'
haystack = '102:12:39'
matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
# Like reverse-inner-plus-shorter-than-expected, but using a far simpler regex
# to demonstrate the extent of the rot. Sigh.
#
[[test]]
name = "reverse-inner-short"
regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
haystack = '102:12:39'
# The optional "102:" prefix must be included: groups 1-3 are "102", "12"
# and "39".
matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
# This regression test was found via the RegexSet APIs. It triggered a
# particular code path where a regex was compiled with 'All' match semantics
# (to support overlapping search), but got funneled down into a standard
# leftmost search when calling 'is_match'. This is fine on its own, but the
# leftmost search will use a prefilter and that's where this went awry.
#
# Namely, since 'All' semantics were used, the aho-corasick prefilter was
# incorrectly compiled with 'Standard' semantics. This was wrong because
# 'Standard' immediately attempts to report a match at every position, even if
# that would mean reporting a match past the leftmost match before reporting
# the leftmost match. This breaks the prefilter contract of never having false
# negatives and leads overall to the engine not finding a match.
#
[[test]]
name = "prefilter-with-aho-corasick-standard-semantics"
regex = '(?m)^ *v [0-9]'
haystack = 'v 0'
# The expected match is written as an { id, spans } table here, not as a bare
# span. ' *' matches zero spaces, so the whole haystack [0, 3] is the match.
matches = [
  { id = 0, spans = [[0, 3]] },
]
match-kind = "all"
search-kind = "overlapping"
unicode = true
utf8 = true