use core::mem::size_of;

use crate::memmem::{
    prefilter::{PrefilterFnTy, PrefilterState},
    vector::Vector,
    NeedleInfo,
};

/// The implementation of the forward vector accelerated candidate finder.
///
/// This is inspired by the "generic SIMD" algorithm described here:
///
/// The main difference is that this is just a prefilter. That is, it reports
/// candidates once they are seen and doesn't attempt to confirm them. Also,
/// the bytes this routine uses to check for candidates are selected based on
/// an a priori background frequency distribution. This means that on most
/// haystacks, this will on average spend more time in vectorized code than you
/// would if you just selected the first and last bytes of the needle.
///
/// Note that a non-prefilter variant of this algorithm can be found in the
/// parent module, but it only works on smaller needles.
///
/// `prestate`, `ninfo`, `haystack` and `needle` are the four prefilter
/// function parameters. `fallback` is a prefilter that is used if the haystack
/// is too small to be handled with the given vector size.
///
/// This routine is not safe because it is intended for callers to specialize
/// this with a particular vector (e.g., __m256i) and then call it with the
/// relevant target feature (e.g., avx2) enabled.
///
/// # Panics
///
/// If `needle.len() <= 1`, then this panics.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn find<V: Vector>(
    prestate: &mut PrefilterState,
    ninfo: &NeedleInfo,
    haystack: &[u8],
    needle: &[u8],
    fallback: PrefilterFnTy,
) -> Option<usize> {
    assert!(needle.len() >= 2, "needle must be at least 2 bytes");
    let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_usize();
    let min_haystack_len = rare2i + size_of::<V>();
    if haystack.len() < min_haystack_len {
        return fallback(prestate, ninfo, haystack, needle);
    }
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let max_ptr = end_ptr.sub(min_haystack_len);
    let mut ptr = start_ptr;
    let rare1chunk = V::splat(needle[rare1i]);
    let rare2chunk = V::splat(needle[rare2i]);
    // N.B. I did experiment with unrolling the loop to deal with size(V)
    // bytes at a time and 2*size(V) bytes at a time. The double unroll
    // was marginally faster while the quadruple unroll was unambiguously
    // slower. In the end, I decided the complexity from unrolling wasn't
    // worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to
    // compare.
    while ptr <= max_ptr {
        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
        if let Some(chunki) = m {
            return Some(matched(prestate, start_ptr, ptr, chunki));
        }
        ptr = ptr.add(size_of::<V>());
    }
    if ptr < end_ptr {
        // This routine immediately quits if a candidate match is found.
        // That means that if we're here, no candidate matches have been
        // found at or before 'ptr'. Thus, we don't need to mask anything
        // out even though we might technically search part of the haystack
        // that we've already searched (because we know it can't match).
        ptr = max_ptr;
        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
        if let Some(chunki) = m {
            return Some(matched(prestate, start_ptr, ptr, chunki));
        }
    }
    prestate.update(haystack.len());
    None
}
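
// As a concrete illustration of the "Safety" contract on `find`: a caller on
// x86_64 might specialize it for AVX2 along the lines of the sketch below.
// This is only a sketch; `find_avx2` and `fallback_find` are hypothetical
// names, and the real callers of this routine may be organized differently.
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn find_avx2(
//         prestate: &mut PrefilterState,
//         ninfo: &NeedleInfo,
//         haystack: &[u8],
//         needle: &[u8],
//     ) -> Option<usize> {
//         use core::arch::x86_64::__m256i;
//
//         // `fallback_find` stands in for whatever scalar prefilter the
//         // caller wants to use when the haystack is shorter than the
//         // minimum length required for a 32-byte __m256i vector.
//         find::<__m256i>(prestate, ninfo, haystack, needle, fallback_find)
//     }
//
// The call to `find` is sound here only because the enclosing function is
// compiled with the avx2 target feature enabled, which is what the vector
// operations behind the `Vector` impl for __m256i require.
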
// Below are two different techniques for checking whether a candidate
// match exists in a given chunk or not. find_in_chunk2 checks two bytes
// whereas find_in_chunk3 checks three bytes. The idea behind checking
// three bytes is that while we do a bit more work per iteration, we
// decrease the chances of a false positive match being reported and thus
// make the search faster overall. This actually works out for the
// memmem/krate/prebuilt/huge-en/never-all-common-bytes benchmark, where
// using find_in_chunk3 is about 25% faster than find_in_chunk2. However,
// it turns out that find_in_chunk2 is faster for all other benchmarks, so
// perhaps the extra check isn't worth it in practice.
//
// For now, we go with find_in_chunk2, but we leave find_in_chunk3 around
// to make it easy to switch to and benchmark when possible.
/// Search for an occurrence of two rare bytes from the needle in the current
/// chunk pointed to by ptr.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i).
#[inline(always)]
unsafe fn find_in_chunk2<V: Vector>(
    ptr: *const u8,
    rare1i: usize,
    rare2i: usize,
    rare1chunk: V,
    rare2chunk: V,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr.add(rare1i));
    let chunk1 = V::load_unaligned(ptr.add(rare2i));
    let eq0 = chunk0.cmpeq(rare1chunk);
    let eq1 = chunk1.cmpeq(rare2chunk);
    let match_offsets = eq0.and(eq1).movemask();
    if match_offsets == 0 {
        return None;
    }
    Some(match_offsets.trailing_zeros() as usize)
}
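
// To make the offset arithmetic above concrete: bit `i` of `match_offsets`
// is set exactly when the byte at `ptr + rare1i + i` equals `needle[rare1i]`
// and the byte at `ptr + rare2i + i` equals `needle[rare2i]`. Both rare
// bytes then line up with a needle occurrence that would start at `ptr + i`,
// so `trailing_zeros` reports the left-most candidate in the chunk. As an
// illustrative trace (not output from real data): with a 16-byte vector and
// the combined comparison succeeding in lanes 3 and 9, `movemask` yields
// 0b0000_0010_0000_1000 and the chunk-relative offset returned is 3.
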
/// Search for an occurrence of two rare bytes and the first byte (even if one
/// of the rare bytes is equivalent to the first byte) from the needle in the
/// current chunk pointed to by ptr.
///
/// firstchunk, rare1chunk and rare2chunk correspond to vectors with the first,
/// rare1 and rare2 bytes repeated in each 8-bit lane, respectively.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at ptr,
/// (ptr + rare1i) and (ptr + rare2i).
#[allow(dead_code)]
#[inline(always)]
unsafe fn find_in_chunk3<V: Vector>(
    ptr: *const u8,
    rare1i: usize,
    rare2i: usize,
    firstchunk: V,
    rare1chunk: V,
    rare2chunk: V,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr);
    let chunk1 = V::load_unaligned(ptr.add(rare1i));
    let chunk2 = V::load_unaligned(ptr.add(rare2i));
    let eq0 = chunk0.cmpeq(firstchunk);
    let eq1 = chunk1.cmpeq(rare1chunk);
    let eq2 = chunk2.cmpeq(rare2chunk);
    let match_offsets = eq0.and(eq1).and(eq2).movemask();
    if match_offsets == 0 {
        return None;
    }
    Some(match_offsets.trailing_zeros() as usize)
}
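
// If find_in_chunk3 is ever switched back on, the natural way to build
// `firstchunk` at the call site would presumably be `V::splat(needle[0])`,
// mirroring how `rare1chunk` and `rare2chunk` are built in `find` above.
// (No such call site exists in this file, so this is an assumption about how
// the hook would be used rather than a description of current behavior.)
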
/// Accepts a chunk-relative offset and returns a haystack relative offset
/// after updating the prefilter state.
///
/// Why do we use this uninlineable function when a search completes? Well,
/// I don't know. Really. Obviously this function was not here initially.
/// When doing profiling, the codegen for the inner loop here looked bad and
/// I didn't know why. There were a couple extra 'add' instructions and an
/// extra 'lea' instruction that I couldn't explain. I hypothesized that the
/// optimizer was having trouble untangling the hot code in the loop from the
/// code that deals with a candidate match. By putting the latter into an
/// uninlineable function, it kind of forces the issue and it had the intended
/// effect: codegen improved measurably. It's good for a ~10% improvement
/// across the board on the memmem/krate/prebuilt/huge-en/ benchmarks.
#[cold]
#[inline(never)]
fn matched(
    prestate: &mut PrefilterState,
    start_ptr: *const u8,
    ptr: *const u8,
    chunki: usize,
) -> usize {
    let found = diff(ptr, start_ptr) + chunki;
    prestate.update(found);
    found
}
/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}