Files
addr2line
adler
aho_corasick
arrayvec
atty
backtrace
bitflags
camino
cargo_metadata
cargo_nextest
cargo_platform
cfg_expr
cfg_if
chrono
clap
clap_derive
color_eyre
config
crossbeam_channel
crossbeam_deque
crossbeam_epoch
crossbeam_utils
ctrlc
datatest_stable
debug_ignore
duct
either
enable_ansi_support
env_logger
eyre
fixedbitset
gimli
guppy
guppy_workspace_hack
hashbrown
humantime
humantime_serde
indent_write
indenter
indexmap
is_ci
itertools
itoa
lazy_static
lexical_core
libc
log
memchr
memoffset
miniz_oxide
nested
nextest_metadata
nextest_runner
nix
nom
num_cpus
num_integer
num_traits
object
once_cell
os_pipe
os_str_bytes
owo_colors
pathdiff
petgraph
proc_macro2
proc_macro_error
proc_macro_error_attr
quick_junit
quick_xml
quote
rayon
rayon_core
regex
regex_syntax
rustc_demangle
ryu
same_file
scopeguard
semver
serde
serde_derive
serde_json
shared_child
shellwords
smallvec
static_assertions
strip_ansi_escapes
strsim
structopt
structopt_derive
supports_color
syn
target_lexicon
target_spec
termcolor
textwrap
time
toml
twox_hash
unicode_xid
utf8parse
vte
vte_generate_state_changes
walkdir
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*
This module implements a "fallback" prefilter that only relies on memchr to
function. While memchr works best when it's explicitly vectorized, its
fallback implementations are fast enough to make a prefilter like this
worthwhile.

The essence of this implementation is to identify two rare bytes in a needle
based on a background frequency distribution of bytes. We then run memchr on the
rarer byte. For each match, we use the second rare byte as a guard to quickly
check if a match is possible. If the position passes the guard test, then we do
a naive memcmp to confirm the match.

In practice, this formulation works amazingly well, primarily because of the
heuristic use of a background frequency distribution. However, it does have a
number of weaknesses where it can get quite slow when its background frequency
distribution doesn't line up with the haystack being searched. This is why we
have specialized vector routines that essentially take this idea and move the
guard check into vectorized code. (Those specialized vector routines do still
make use of the background frequency distribution of bytes though.)

This fallback implementation was originally formulated in regex many moons ago:
https://github.com/rust-lang/regex/blob/3db8722d0b204a85380fe2a65e13d7065d7dd968/src/literal/imp.rs#L370-L501
Prior to that, I'm not aware of anyone using this technique in any prominent
substring search implementation. Although, I'm sure folks have had this same
insight long before me.

Another version of this also appeared in bstr:
https://github.com/BurntSushi/bstr/blob/a444256ca7407fe180ee32534688549655b7a38e/src/search/prefilter.rs#L83-L340
*/

use crate::memmem::{
    prefilter::{PrefilterFnTy, PrefilterState},
    NeedleInfo,
};

// Compile-time check that `find` below has the signature required by
// `PrefilterFnTy`; a mismatch is reported as a type error on this line.
const _: PrefilterFnTy = find;

/// Scan `haystack` for a candidate position at which `needle` might begin.
///
/// The search is driven by the two rare bytes recorded in `ninfo`: memchr
/// locates the first one, and the second one is then checked at its
/// corresponding offset as a cheap guard.
///
/// A returned `Some(pos)` is only a candidate. False positives are
/// permitted, so callers must confirm that a match really starts at `pos`.
/// False negatives are not permitted: `None` is returned only when memchr
/// finds no further occurrence of the first rare byte.
///
/// This should only be used when Freqy is constructed for forward
/// searching.
pub(crate) fn find(
    prestate: &mut PrefilterState,
    ninfo: &NeedleInfo,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let (rare1i, rare2i) = ninfo.rarebytes.as_rare_usize();
    let (rare1, rare2) = ninfo.rarebytes.as_rare_bytes(needle);
    // Offset into the haystack from which the next memchr scan starts.
    let mut pos: usize = 0;
    loop {
        if !prestate.is_effective() {
            // The skipping heuristic is no longer considered worthwhile.
            // False positives are allowed, so report how far we got,
            // shifted back to where the needle would begin (clamped to the
            // start of the haystack).
            return Some(pos.saturating_sub(rare1i));
        }

        // Jump to the next occurrence of the (heuristically chosen) rarest
        // needle byte. If there is none, no match is possible at all.
        let skipped = crate::memchr(rare1, &haystack[pos..])?;
        prestate.update(skipped);
        pos += skipped;

        // `checked_sub` fails when the needle would have to begin before
        // the start of the haystack, which rules this occurrence out.
        if let Some(start) = pos.checked_sub(rare1i) {
            // Guard: the second rare byte must sit at its expected offset
            // relative to the candidate start.
            if haystack.get(start + rare2i) == Some(&rare2) {
                // Nothing more can be ruled out cheaply; hand the candidate
                // back for full verification.
                return Some(start);
            }
        }

        // This occurrence cannot begin a match; resume just past it.
        pos += 1;
    }
}

#[cfg(all(test, feature = "std"))]
mod tests {
    use super::*;

    /// Run the fallback prefilter once over `haystack`, using needle info
    /// and prefilter state built fresh for `needle`.
    fn freqy_find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
        let info = NeedleInfo::new(needle);
        let mut state = PrefilterState::new();
        find(&mut state, &info, haystack, needle)
    }

    #[test]
    fn freqy_forward() {
        // Each entry is (expected candidate, haystack, needle).
        let cases: [(Option<usize>, &[u8], &[u8]); 8] = [
            (Some(0), b"BARFOO", b"BAR"),
            (Some(3), b"FOOBAR", b"BAR"),
            (Some(0), b"zyzz", b"zyzy"),
            (Some(2), b"zzzy", b"zyzy"),
            (None, b"zazb", b"zyzy"),
            (Some(0), b"yzyy", b"yzyz"),
            (Some(2), b"yyyz", b"yzyz"),
            (None, b"yayb", b"yzyz"),
        ];
        for &(expected, haystack, needle) in cases.iter() {
            assert_eq!(expected, freqy_find(haystack, needle));
        }
    }

    #[test]
    #[cfg(not(miri))]
    fn prefilter_permutations() {
        use crate::memmem::prefilter::tests::PrefilterTest;

        // SAFETY: `find` is an ordinary safe function, so it may be called
        // with any inputs and on any platform.
        unsafe { PrefilterTest::run_all_tests(find) };
    }
}