# word-boundary.toml
#
# Path: comm-central/third_party/rust/regex/testdata/word-boundary.toml

# Some of these are cribbed from RE2's test suite.

# These test \b. Below are tests for \B.

[[test]]

name = "wb1"

regex = '\b'

haystack = ""

matches = []

unicode = false

[[test]]

name = "wb2"

regex = '\b'

haystack = "a"

matches = [[0, 0], [1, 1]]

unicode = false

[[test]]

name = "wb3"

regex = '\b'

haystack = "ab"

matches = [[0, 0], [2, 2]]

unicode = false

[[test]]

name = "wb4"

regex = '^\b'

haystack = "ab"

matches = [[0, 0]]

unicode = false

[[test]]

name = "wb5"

regex = '\b$'

haystack = "ab"

matches = [[2, 2]]

unicode = false

# No match is possible here: without multi-line mode, ^ only holds at offset 0
# and $ only at offset 2 of "ab", so no single position satisfies both
# anchors, regardless of what \b does.
[[test]]

name = "wb6"

regex = '^\b$'

haystack = "ab"

matches = []

unicode = false

[[test]]

name = "wb7"

regex = '\bbar\b'

haystack = "nobar bar foo bar"

matches = [[6, 9], [14, 17]]

unicode = false

[[test]]

name = "wb8"

regex = 'a\b'

haystack = "faoa x"

matches = [[3, 4]]

unicode = false

[[test]]

name = "wb9"

regex = '\bbar'

haystack = "bar x"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb10"

regex = '\bbar'

haystack = "foo\nbar x"

matches = [[4, 7]]

unicode = false

[[test]]

name = "wb11"

regex = 'bar\b'

haystack = "foobar"

matches = [[3, 6]]

unicode = false

[[test]]

name = "wb12"

regex = 'bar\b'

haystack = "foobar\nxxx"

matches = [[3, 6]]

unicode = false

[[test]]

name = "wb13"

regex = '(?:foo|bar|[A-Z])\b'

haystack = "foo"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb14"

regex = '(?:foo|bar|[A-Z])\b'

haystack = "foo\n"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb15"

regex = '\b(?:foo|bar|[A-Z])'

haystack = "foo"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb16"

regex = '\b(?:foo|bar|[A-Z])\b'

haystack = "X"

matches = [[0, 1]]

unicode = false

# No match: [A-Z] consumes exactly one letter, so in "XY" one side of every
# candidate match sits between two word bytes, where \b cannot hold.
[[test]]

name = "wb17"

regex = '\b(?:foo|bar|[A-Z])\b'

haystack = "XY"

matches = []

unicode = false

[[test]]

name = "wb18"

regex = '\b(?:foo|bar|[A-Z])\b'

haystack = "bar"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb19"

regex = '\b(?:foo|bar|[A-Z])\b'

haystack = "foo"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb20"

regex = '\b(?:foo|bar|[A-Z])\b'

haystack = "foo\n"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb21"

regex = '\b(?:foo|bar|[A-Z])\b'

haystack = "ffoo bbar N x"

matches = [[10, 11]]

unicode = false

[[test]]

name = "wb22"

regex = '\b(?:fo|foo)\b'

haystack = "fo"

matches = [[0, 2]]

unicode = false

[[test]]

name = "wb23"

regex = '\b(?:fo|foo)\b'

haystack = "foo"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb24"

regex = '\b\b'

haystack = ""

matches = []

unicode = false

[[test]]

name = "wb25"

regex = '\b\b'

haystack = "a"

matches = [[0, 0], [1, 1]]

unicode = false

[[test]]

name = "wb26"

regex = '\b$'

haystack = ""

matches = []

unicode = false

[[test]]

name = "wb27"

regex = '\b$'

haystack = "x"

matches = [[1, 1]]

unicode = false

[[test]]

name = "wb28"

regex = '\b$'

haystack = "y x"

matches = [[3, 3]]

unicode = false

# Unlike the surrounding tests, this one does not set 'unicode = false'.
# Instead, only the word boundary is made ASCII-only via the inline (?-u:...)
# flag group, leaving '.' outside of that group.
[[test]]

name = "wb29"

regex = '(?-u:\b).$'

haystack = "x"

matches = [[0, 1]]

[[test]]

name = "wb30"

regex = '^\b(?:fo|foo)\b'

haystack = "fo"

matches = [[0, 2]]

unicode = false

[[test]]

name = "wb31"

regex = '^\b(?:fo|foo)\b'

haystack = "foo"

matches = [[0, 3]]

unicode = false

[[test]]

name = "wb32"

regex = '^\b$'

haystack = ""

matches = []

unicode = false

[[test]]

name = "wb33"

regex = '^\b$'

haystack = "x"

matches = []

unicode = false

[[test]]

name = "wb34"

regex = '^(?-u:\b).$'

haystack = "x"

matches = [[0, 1]]

[[test]]

name = "wb35"

regex = '^(?-u:\b).(?-u:\b)$'

haystack = "x"

matches = [[0, 1]]

[[test]]

name = "wb36"

regex = '^^^^^\b$$$$$'

haystack = ""

matches = []

unicode = false

[[test]]

name = "wb37"

regex = '^^^^^(?-u:\b).$$$$$'

haystack = "x"

matches = [[0, 1]]

[[test]]

name = "wb38"

regex = '^^^^^\b$$$$$'

haystack = "x"

matches = []

unicode = false

[[test]]

name = "wb39"

regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'

haystack = "x"

matches = [[0, 1]]

[[test]]

name = "wb40"

regex = '(?-u:\b).+(?-u:\b)'

haystack = "$$abc$$"

matches = [[2, 5]]

[[test]]

name = "wb41"

regex = '\b'

haystack = "a b c"

matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]

unicode = false

[[test]]

name = "wb42"

regex = '\bfoo\b'

haystack = "zzz foo zzz"

matches = [[4, 7]]

unicode = false

[[test]]

name = "wb43"

regex = '\b^'

haystack = "ab"

matches = [[0, 0]]

unicode = false

[[test]]

name = "wb44"

regex = '$\b'

haystack = "ab"

matches = [[2, 2]]

unicode = false

# Tests for \B. Note that \B is not allowed if UTF-8 mode is enabled, so the
# tests in this section disable UTF-8 mode with 'utf8 = false'. This is
# because \B can match at offsets that are not UTF-8 codepoint boundaries.

[[test]]

name = "nb1"

regex = '\Bfoo\B'

haystack = "n foo xfoox that"

matches = [[7, 10]]

unicode = false

utf8 = false

[[test]]

name = "nb2"

regex = 'a\B'

haystack = "faoa x"

matches = [[1, 2]]

unicode = false

utf8 = false

[[test]]

name = "nb3"

regex = '\Bbar'

haystack = "bar x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb4"

regex = '\Bbar'

haystack = "foo\nbar x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb5"

regex = 'bar\B'

haystack = "foobar"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb6"

regex = 'bar\B'

haystack = "foobar\nxxx"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb7"

regex = '(?:foo|bar|[A-Z])\B'

haystack = "foox"

matches = [[0, 3]]

unicode = false

utf8 = false

[[test]]

name = "nb8"

regex = '(?:foo|bar|[A-Z])\B'

haystack = "foo\n"

matches = []

unicode = false

utf8 = false

# In an empty haystack, offset 0 has no word byte on either side, so it is
# not a word boundary and \B matches there. (Compare wb1, where \b finds
# nothing in the same haystack.)
[[test]]

name = "nb9"

regex = '\B'

haystack = ""

matches = [[0, 0]]

unicode = false

utf8 = false

[[test]]

name = "nb10"

regex = '\B'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb11"

regex = '\B(?:foo|bar|[A-Z])'

haystack = "foo"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb12"

regex = '\B(?:foo|bar|[A-Z])\B'

haystack = "xXy"

matches = [[1, 2]]

unicode = false

utf8 = false

[[test]]

name = "nb13"

regex = '\B(?:foo|bar|[A-Z])\B'

haystack = "XY"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb14"

regex = '\B(?:foo|bar|[A-Z])\B'

haystack = "XYZ"

matches = [[1, 2]]

unicode = false

utf8 = false

[[test]]

name = "nb15"

regex = '\B(?:foo|bar|[A-Z])\B'

haystack = "abara"

matches = [[1, 4]]

unicode = false

utf8 = false

[[test]]

name = "nb16"

regex = '\B(?:foo|bar|[A-Z])\B'

haystack = "xfoo_"

matches = [[1, 4]]

unicode = false

utf8 = false

[[test]]

name = "nb17"

regex = '\B(?:foo|bar|[A-Z])\B'

haystack = "xfoo\n"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb18"

regex = '\B(?:foo|bar|[A-Z])\B'

haystack = "foo bar vNX"

matches = [[9, 10]]

unicode = false

utf8 = false

[[test]]

name = "nb19"

regex = '\B(?:fo|foo)\B'

haystack = "xfoo"

matches = [[1, 3]]

unicode = false

utf8 = false

[[test]]

name = "nb20"

regex = '\B(?:foo|fo)\B'

haystack = "xfooo"

matches = [[1, 4]]

unicode = false

utf8 = false

[[test]]

name = "nb21"

regex = '\B\B'

haystack = ""

matches = [[0, 0]]

unicode = false

utf8 = false

[[test]]

name = "nb22"

regex = '\B\B'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb23"

regex = '\B$'

haystack = ""

matches = [[0, 0]]

unicode = false

utf8 = false

[[test]]

name = "nb24"

regex = '\B$'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb25"

regex = '\B$'

haystack = "y x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb26"

regex = '\B.$'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb27"

regex = '^\B(?:fo|foo)\B'

haystack = "fo"

matches = []

unicode = false

utf8 = false

# Companion to nb27 using the longer haystack "foo", mirroring the wb30/wb31
# pair for \b. (Previously this entry was an exact duplicate of nb27.) There
# is still no match: offset 0 sits directly before a word byte, so it is a
# word boundary and the leading ^\B can never succeed.
[[test]]

name = "nb28"

regex = '^\B(?:fo|foo)\B'

haystack = "foo"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb29"

regex = '^\B'

haystack = ""

matches = [[0, 0]]

unicode = false

utf8 = false

[[test]]

name = "nb30"

regex = '^\B'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb31"

regex = '^\B\B'

haystack = ""

matches = [[0, 0]]

unicode = false

utf8 = false

[[test]]

name = "nb32"

regex = '^\B\B'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb33"

regex = '^\B$'

haystack = ""

matches = [[0, 0]]

unicode = false

utf8 = false

[[test]]

name = "nb34"

regex = '^\B$'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb35"

regex = '^\B.$'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb36"

regex = '^\B.\B$'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb37"

regex = '^^^^^\B$$$$$'

haystack = ""

matches = [[0, 0]]

unicode = false

utf8 = false

[[test]]

name = "nb38"

regex = '^^^^^\B.$$$$$'

haystack = "x"

matches = []

unicode = false

utf8 = false

[[test]]

name = "nb39"

regex = '^^^^^\B$$$$$'

haystack = "x"

matches = []

unicode = false

utf8 = false

# unicode1* and unicode2* work for both Unicode and ASCII because all matches

# are reported as byte offsets, and « and » do not correspond to word

# boundaries at either the character or byte level.

[[test]]

name = "unicode1"

regex = '\bx\b'

haystack = "«x"

matches = [[2, 3]]

[[test]]

name = "unicode1-only-ascii"

regex = '\bx\b'

haystack = "«x"

matches = [[2, 3]]

unicode = false

[[test]]

name = "unicode2"

regex = '\bx\b'

haystack = "x»"

matches = [[0, 1]]

[[test]]

name = "unicode2-only-ascii"

regex = '\bx\b'

haystack = "x»"

matches = [[0, 1]]

unicode = false

# ASCII word boundaries are completely oblivious to Unicode characters, so
# even though β is a word character, an ASCII \b reports a word boundary
# between it and an adjacent ASCII word character. (The ASCII \b only looks
# at the leading byte of β.) For Unicode \b, the tests are precisely inverted.

[[test]]

name = "unicode3"

regex = '\bx\b'

haystack = 'áxβ'

matches = []

[[test]]

name = "unicode3-only-ascii"

regex = '\bx\b'

haystack = 'áxβ'

matches = [[2, 3]]

unicode = false

[[test]]

name = "unicode4"

regex = '\Bx\B'

haystack = 'áxβ'

matches = [[2, 3]]

[[test]]

name = "unicode4-only-ascii"

regex = '\Bx\B'

haystack = 'áxβ'

matches = []

unicode = false

utf8 = false

# Sanity checks for \b on the same haystacks that the unicode5-not* tests
# further down use for \B.

[[test]]

name = "unicode5"

regex = '\b'

haystack = "0\U0007EF5E"

matches = [[0, 0], [1, 1]]

[[test]]

name = "unicode5-only-ascii"

regex = '\b'

haystack = "0\U0007EF5E"

matches = [[0, 0], [1, 1]]

unicode = false

utf8 = false

[[test]]

name = "unicode5-noutf8"

regex = '\b'

haystack = '0\xFF\xFF\xFF\xFF'

matches = [[0, 0], [1, 1]]

unescape = true

utf8 = false

[[test]]

name = "unicode5-noutf8-only-ascii"

regex = '\b'

haystack = '0\xFF\xFF\xFF\xFF'

matches = [[0, 0], [1, 1]]

unescape = true

unicode = false

utf8 = false

# Weird special case to ensure that ASCII \B treats each individual code unit

# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary

# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the

# \w character class.)

[[test]]

name = "unicode5-not"

regex = '\B'

haystack = "0\U0007EF5E"

matches = [[5, 5]]

[[test]]

name = "unicode5-not-only-ascii"

regex = '\B'

haystack = "0\U0007EF5E"

matches = [[2, 2], [3, 3], [4, 4], [5, 5]]

unicode = false

utf8 = false

# This gets no matches since \B only matches in the presence of valid UTF-8

# when Unicode is enabled, even when UTF-8 mode is disabled.

[[test]]

name = "unicode5-not-noutf8"

regex = '\B'

haystack = '0\xFF\xFF\xFF\xFF'

matches = []

unescape = true

utf8 = false

# But this DOES get matches since \B in ASCII mode only looks at individual

# bytes.

[[test]]

name = "unicode5-not-noutf8-only-ascii"

regex = '\B'

haystack = '0\xFF\xFF\xFF\xFF'

matches = [[2, 2], [3, 3], [4, 4], [5, 5]]

unescape = true

unicode = false

utf8 = false

# Some tests of no particular significance.

[[test]]

name = "unicode6"

regex = '\b[0-9]+\b'

haystack = "foo 123 bar 456 quux 789"

matches = [[4, 7], [12, 15], [21, 24]]

[[test]]

name = "unicode7"

regex = '\b[0-9]+\b'

haystack = "foo 123 bar a456 quux 789"

matches = [[4, 7], [22, 25]]

[[test]]

name = "unicode8"

regex = '\b[0-9]+\b'

haystack = "foo 123 bar 456a quux 789"

matches = [[4, 7], [22, 25]]

# A variant of the problem described here:

# https://github.com/google/re2/blob/89567f5de5b23bb5ad0c26cbafc10bdc7389d1fa/re2/dfa.cc#L658-L667

# The search is anchored and limited by 'bounds' to offsets 1 through 2 of
# the haystack, i.e. just the '%'. The \b at offset 1 can only hold because
# of the 'z' that precedes the search start, so this also checks that the
# assertion looks at context outside of the bounds. The expected result is
# the empty match at offset 1, not [1, 2].
[[test]]

name = "alt-with-assertion-repetition"

regex = '(?:\b|%)+'

haystack = "z%"

bounds = [1, 2]

anchored = true

matches = [[1, 1]]