pattern.rs - mozsearch

use core::{cmp, fmt, mem, u16, usize};

use alloc::{boxed::Box, string::String, vec, vec::Vec};

use crate::{

    packed::{api::MatchKind, ext::Pointer},

    PatternID,

};

/// A non-empty collection of non-empty patterns to search for.

///

/// This collection of patterns is what is passed around to both execute

/// searches and to construct the searchers themselves. Namely, this permits

/// searches to avoid copying all of the patterns, and allows us to keep only

/// one copy throughout all packed searchers.

///

/// Note that this collection is not a set. The same pattern can appear more

/// than once.

#[derive(Clone, Debug)]

pub(crate) struct Patterns {

    /// The match semantics supported by this collection of patterns.

///

    /// The match semantics determines the order of the iterator over patterns.

    /// For leftmost-first, patterns are provided in the same order as were

    /// provided by the caller. For leftmost-longest, patterns are provided in

    /// descending order of length, with ties broken by the order in which they

    /// were provided by the caller.

    kind: MatchKind,

    /// The collection of patterns, indexed by their identifier.

    by_id: Vec<Vec<u8>>,

    /// The order of patterns defined for iteration, given by pattern

    /// identifiers. The order of `by_id` and `order` is always the same for

    /// leftmost-first semantics, but may be different for leftmost-longest

    /// semantics.

    order: Vec<PatternID>,

    /// The length of the smallest pattern, in bytes.

    minimum_len: usize,

    /// The total number of pattern bytes across the entire collection. This

    /// is used for reporting total heap usage in constant time.

    total_pattern_bytes: usize,

// BREADCRUMBS: I think we want to experiment with a different bucket

// representation. Basically, each bucket is just a Range<usize> to a single

// contiguous allocation? Maybe length-prefixed patterns or something? The

// idea is to try to get rid of the pointer chasing in verification. I don't

// know that that is the issue, but I suspect it is.

impl Patterns {

    /// Create a new collection of patterns for the given match semantics. The

    /// ID of each pattern is the index of the pattern at which it occurs in

    /// the `by_id` slice.

///

    /// If any of the patterns in the slice given are empty, then this panics.

    /// Similarly, if the number of patterns given is zero, then this also

    /// panics.

    pub(crate) fn new() -> Patterns {

        Patterns {

            kind: MatchKind::default(),

            by_id: vec![],

            order: vec![],

            minimum_len: usize::MAX,

            total_pattern_bytes: 0,

    /// Add a pattern to this collection.

///

    /// This panics if the pattern given is empty.

    pub(crate) fn add(&mut self, bytes: &[u8]) {

        assert!(!bytes.is_empty());

        assert!(self.by_id.len() <= u16::MAX as usize);

        let id = PatternID::new(self.by_id.len()).unwrap();

        self.order.push(id);

        self.by_id.push(bytes.to_vec());

        self.minimum_len = cmp::min(self.minimum_len, bytes.len());

        self.total_pattern_bytes += bytes.len();

    /// Set the match kind semantics for this collection of patterns.

///

    /// If the kind is not set, then the default is leftmost-first.

    pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {

        self.kind = kind;

        match self.kind {

            MatchKind::LeftmostFirst => {

                self.order.sort();

            MatchKind::LeftmostLongest => {

                let (order, by_id) = (&mut self.order, &mut self.by_id);

                order.sort_by(|&id1, &id2| {

                    by_id[id1].len().cmp(&by_id[id2].len()).reverse()

});

    /// Return the number of patterns in this collection.

///

    /// This is guaranteed to be greater than zero.

    pub(crate) fn len(&self) -> usize {

        self.by_id.len()

    /// Returns true if and only if this collection of patterns is empty.

    pub(crate) fn is_empty(&self) -> bool {

        self.len() == 0

    /// Returns the approximate total amount of heap used by these patterns, in

    /// units of bytes.

    pub(crate) fn memory_usage(&self) -> usize {

        self.order.len() * mem::size_of::<PatternID>()

            + self.by_id.len() * mem::size_of::<Vec<u8>>()

            + self.total_pattern_bytes

    /// Clears all heap memory associated with this collection of patterns and

    /// resets all state such that it is a valid empty collection.

    pub(crate) fn reset(&mut self) {

        self.kind = MatchKind::default();

        self.by_id.clear();

        self.order.clear();

        self.minimum_len = usize::MAX;

    /// Returns the length, in bytes, of the smallest pattern.

///

    /// This is guaranteed to be at least one.

    pub(crate) fn minimum_len(&self) -> usize {

        self.minimum_len

    /// Returns the match semantics used by these patterns.

    pub(crate) fn match_kind(&self) -> &MatchKind {

        &self.kind

    /// Return the pattern with the given identifier. If such a pattern does

    /// not exist, then this panics.

    pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {

        Pattern(&self.by_id[id])

    /// Return the pattern with the given identifier without performing bounds

    /// checks.

///

    /// # Safety

///

    /// Callers must ensure that a pattern with the given identifier exists

    /// before using this method.

    pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {

        Pattern(self.by_id.get_unchecked(id.as_usize()))

    /// Return an iterator over all the patterns in this collection, in the

    /// order in which they should be matched.

///

    /// Specifically, in a naive multi-pattern matcher, the following is

    /// guaranteed to satisfy the match semantics of this collection of

    /// patterns:

///

    /// ```ignore

    /// for i in 0..haystack.len():

    ///   for p in patterns.iter():

    ///     if haystack[i..].starts_with(p.bytes()):

    ///       return Match(p.id(), i, i + p.bytes().len())

    /// ```

///

    /// Namely, among the patterns in a collection, if they are matched in

    /// the order provided by this iterator, then the result is guaranteed

    /// to satisfy the correct match semantics. (Either leftmost-first or

    /// leftmost-longest.)

    pub(crate) fn iter(&self) -> PatternIter<'_> {

        PatternIter { patterns: self, i: 0 }

/// An iterator over the patterns in the `Patterns` collection.

///

/// The order of the patterns provided by this iterator is consistent with the

/// match semantics of the originating collection of patterns.

///

/// The lifetime `'p` corresponds to the lifetime of the collection of patterns

/// this is iterating over.

#[derive(Debug)]

pub(crate) struct PatternIter<'p> {

    patterns: &'p Patterns,

    i: usize,

impl<'p> Iterator for PatternIter<'p> {

    type Item = (PatternID, Pattern<'p>);

    fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {

        if self.i >= self.patterns.len() {

            return None;

        let id = self.patterns.order[self.i];

        let p = self.patterns.get(id);

        self.i += 1;

        Some((id, p))

/// A pattern that is used in packed searching.

#[derive(Clone)]

pub(crate) struct Pattern<'a>(&'a [u8]);

impl<'a> fmt::Debug for Pattern<'a> {

    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {

        f.debug_struct("Pattern")

            .field("lit", &String::from_utf8_lossy(&self.0))

            .finish()

impl<'p> Pattern<'p> {

    /// Returns the length of this pattern, in bytes.

    pub(crate) fn len(&self) -> usize {

        self.0.len()

    /// Returns the bytes of this pattern.

    pub(crate) fn bytes(&self) -> &[u8] {

        &self.0

    /// Returns the first `len` low nybbles from this pattern. If this pattern

    /// is shorter than `len`, then this panics.

    pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> {

        let mut nybs = vec![0; len].into_boxed_slice();

        for (i, byte) in self.bytes().iter().take(len).enumerate() {

            nybs[i] = byte & 0xF;

        nybs

    /// Returns true if this pattern is a prefix of the given bytes.

    #[inline(always)]

    pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool {

        is_prefix(bytes, self.bytes())

    /// Returns true if this pattern is a prefix of the haystack given by the

    /// raw `start` and `end` pointers.

///

    /// # Safety

///

    /// * It must be the case that `start < end` and that the distance between

    /// them is at least equal to `V::BYTES`. That is, it must always be valid

    /// to do at least an unaligned load of `V` at `start`.

    /// * Both `start` and `end` must be valid for reads.

    /// * Both `start` and `end` must point to an initialized value.

    /// * Both `start` and `end` must point to the same allocated object and

    /// must either be in bounds or at most one byte past the end of the

    /// allocated object.

    /// * Both `start` and `end` must be _derived from_ a pointer to the same

    /// object.

    /// * The distance between `start` and `end` must not overflow `isize`.

    /// * The distance being in bounds must not rely on "wrapping around" the

    /// address space.

    #[inline(always)]

    pub(crate) unsafe fn is_prefix_raw(

        &self,

        start: *const u8,

        end: *const u8,

    ) -> bool {

        let patlen = self.bytes().len();

        let haylen = end.distance(start);

        if patlen > haylen {

            return false;

        // SAFETY: We've checked that the haystack has length at least equal

        // to this pattern. All other safety concerns are the responsibility

        // of the caller.

        is_equal_raw(start, self.bytes().as_ptr(), patlen)

/// Returns true if and only if `needle` is a prefix of `haystack`.

///

/// This uses a latency optimized variant of `memcmp` internally which *might*

/// make this faster for very short strings.

///

/// # Inlining

///

/// This routine is marked `inline(always)`. If you want to call this function

/// in a way that is not always inlined, you'll need to wrap a call to it in

/// another function that is marked as `inline(never)` or just `inline`.

#[inline(always)]

fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {

    if needle.len() > haystack.len() {

        return false;

    // SAFETY: Our pointers are derived directly from borrowed slices which

    // uphold all of our safety guarantees except for length. We account for

    // length with the check above.

    unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) }

/// Compare corresponding bytes in `x` and `y` for equality.

///

/// That is, this returns true if and only if `x.len() == y.len()` and

/// `x[i] == y[i]` for all `0 <= i < x.len()`.

///

/// Note that this isn't used. We only use it in tests as a convenient way

/// of testing `is_equal_raw`.

///

/// # Inlining

///

/// This routine is marked `inline(always)`. If you want to call this function

/// in a way that is not always inlined, you'll need to wrap a call to it in

/// another function that is marked as `inline(never)` or just `inline`.

///

/// # Motivation

///

/// Why not use slice equality instead? Well, slice equality usually results in

/// a call out to the current platform's `libc` which might not be inlineable

/// or have other overhead. This routine isn't guaranteed to be a win, but it

/// might be in some cases.

#[cfg(test)]

#[inline(always)]

fn is_equal(x: &[u8], y: &[u8]) -> bool {

    if x.len() != y.len() {

        return false;

    // SAFETY: Our pointers are derived directly from borrowed slices which

    // uphold all of our safety guarantees except for length. We account for

    // length with the check above.

    unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }

/// Compare `n` bytes at the given pointers for equality.

///

/// This returns true if and only if `*x.add(i) == *y.add(i)` for all

/// `0 <= i < n`.

///

/// # Inlining

///

/// This routine is marked `inline(always)`. If you want to call this function

/// in a way that is not always inlined, you'll need to wrap a call to it in

/// another function that is marked as `inline(never)` or just `inline`.

///

/// # Motivation

///

/// Why not use slice equality instead? Well, slice equality usually results in

/// a call out to the current platform's `libc` which might not be inlineable

/// or have other overhead. This routine isn't guaranteed to be a win, but it

/// might be in some cases.

///

/// # Safety

///

/// * Both `x` and `y` must be valid for reads of up to `n` bytes.

/// * Both `x` and `y` must point to an initialized value.

/// * Both `x` and `y` must each point to an allocated object and

/// must either be in bounds or at most one byte past the end of the

/// allocated object. `x` and `y` do not need to point to the same allocated

/// object, but they may.

/// * Both `x` and `y` must be _derived from_ a pointer to their respective

/// allocated objects.

/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly

/// for `y` and `y+n`.

/// * The distance being in bounds must not rely on "wrapping around" the

/// address space.

#[inline(always)]

unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {

    // If we don't have enough bytes to do 4-byte at a time loads, then

    // handle each possible length specially. Note that I used to have a

    // byte-at-a-time loop here and that turned out to be quite a bit slower

    // for the memmem/pathological/defeat-simple-vector-alphabet benchmark.

    if n < 4 {

        return match n {

            0 => true,

            1 => x.read() == y.read(),

            2 => {

                x.cast::<u16>().read_unaligned()

                    == y.cast::<u16>().read_unaligned()

            // I also tried copy_nonoverlapping here and it looks like the

            // codegen is the same.

            3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),

            _ => unreachable!(),

};

    // When we have 4 or more bytes to compare, then proceed in chunks of 4 at

    // a time using unaligned loads.

//

    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is

    // that this particular version of memcmp is likely to be called with tiny

    // needles. That means that if we do 8 byte loads, then a higher proportion

    // of memcmp calls will use the slower variant above. With that said, this

    // is a hypothesis and is only loosely supported by benchmarks. There's

    // likely some improvement that could be made here. The main thing here

    // though is to optimize for latency, not throughput.

    // SAFETY: The caller is responsible for ensuring the pointers we get are

    // valid and readable for at least `n` bytes. We also do unaligned loads,

    // so there's no need to ensure we're aligned. (This is justified by this

    // routine being specifically for short strings.)

    let xend = x.add(n.wrapping_sub(4));

    let yend = y.add(n.wrapping_sub(4));

    while x < xend {

        let vx = x.cast::<u32>().read_unaligned();

        let vy = y.cast::<u32>().read_unaligned();

        if vx != vy {

            return false;

        x = x.add(4);

        y = y.add(4);

    let vx = xend.cast::<u32>().read_unaligned();

    let vy = yend.cast::<u32>().read_unaligned();

    vx == vy

#[cfg(test)]

mod tests {

    use super::*;

    #[test]

    fn equals_different_lengths() {

        assert!(!is_equal(b"", b"a"));

        assert!(!is_equal(b"a", b""));

        assert!(!is_equal(b"ab", b"a"));

        assert!(!is_equal(b"a", b"ab"));

    #[test]

    fn equals_mismatch() {

        let one_mismatch = [

            (&b"a"[..], &b"x"[..]),

            (&b"ab"[..], &b"ax"[..]),

            (&b"abc"[..], &b"abx"[..]),

            (&b"abcd"[..], &b"abcx"[..]),

            (&b"abcde"[..], &b"abcdx"[..]),

            (&b"abcdef"[..], &b"abcdex"[..]),

            (&b"abcdefg"[..], &b"abcdefx"[..]),

            (&b"abcdefgh"[..], &b"abcdefgx"[..]),

            (&b"abcdefghi"[..], &b"abcdefghx"[..]),

            (&b"abcdefghij"[..], &b"abcdefghix"[..]),

            (&b"abcdefghijk"[..], &b"abcdefghijx"[..]),

            (&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),

            (&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),

            (&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),

];

        for (x, y) in one_mismatch {

            assert_eq!(x.len(), y.len(), "lengths should match");

            assert!(!is_equal(x, y));

            assert!(!is_equal(y, x));

    #[test]

    fn equals_yes() {

        assert!(is_equal(b"", b""));

        assert!(is_equal(b"a", b"a"));

        assert!(is_equal(b"ab", b"ab"));

        assert!(is_equal(b"abc", b"abc"));

        assert!(is_equal(b"abcd", b"abcd"));

        assert!(is_equal(b"abcde", b"abcde"));

        assert!(is_equal(b"abcdef", b"abcdef"));

        assert!(is_equal(b"abcdefg", b"abcdefg"));

        assert!(is_equal(b"abcdefgh", b"abcdefgh"));

        assert!(is_equal(b"abcdefghi", b"abcdefghi"));

    #[test]

    fn prefix() {

        assert!(is_prefix(b"", b""));

        assert!(is_prefix(b"a", b""));

        assert!(is_prefix(b"ab", b""));

        assert!(is_prefix(b"foo", b"foo"));

        assert!(is_prefix(b"foobar", b"foo"));

        assert!(!is_prefix(b"foo", b"fob"));

        assert!(!is_prefix(b"foobar", b"fob"));

Source code

Revision control

Copy as Markdown

Other Tools