diff --git a/.travis.yml b/.travis.yml index c49ba699fa..f6e7e4f5c9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ dist: trusty sudo: required language: rust rust: - - 1.20.0 + - 1.24.1 - stable - beta - nightly diff --git a/README.md b/README.md index f9908caeda..d90c00fc1d 100644 --- a/README.md +++ b/README.md @@ -210,7 +210,7 @@ recommended for general use. ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.20.0`. +This crate's minimum supported `rustc` version is `1.24.1`. The current **tentative** policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 6cb56db850..ac5c3d8502 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -44,9 +44,9 @@ pub use ffi::re2::Regex; #[cfg(feature = "re-dphobos")] pub use ffi::d_phobos::Regex; #[cfg(feature = "re-rust")] -pub use regex::Regex; +pub use regex::{Regex, RegexSet}; #[cfg(feature = "re-rust-bytes")] -pub use regex::bytes::Regex; +pub use regex::bytes::{Regex, RegexSet}; #[cfg(feature = "re-tcl")] pub use ffi::tcl::Regex; @@ -272,6 +272,58 @@ macro_rules! bench_captures { } } +// USAGE: bench_is_match_set!(name, is_match, regex, haystack) +macro_rules! bench_is_match_set { + ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + lazy_static! { + static ref RE: Mutex = Mutex::new($re); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + if re.is_match(&text) != $is_match { + if $is_match { + panic!("expected match, got not match"); + } else { + panic!("expected no match, got match"); + } + } + }); + } + } +} + +// USAGE: bench_matches_set!(name, is_match, regex, haystack) +macro_rules! bench_matches_set { + ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + lazy_static! { + static ref RE: Mutex = Mutex::new($re); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + if re.matches(&text).matched_any() != $is_match { + if $is_match { + panic!("expected match, got not match"); + } else { + panic!("expected no match, got match"); + } + } + }); + } + } +} + mod ffi; mod misc; mod regexdna; diff --git a/bench/src/misc.rs b/bench/src/misc.rs index ad516e23e4..b5b25b737c 100644 --- a/bench/src/misc.rs +++ b/bench/src/misc.rs @@ -14,6 +14,8 @@ use std::iter::repeat; use test::Bencher; +#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))] +use RegexSet; use {Regex, Text}; #[cfg(not(feature = "re-onig"))] @@ -278,3 +280,25 @@ bench_captures!(short_haystack_1000000x, repeat("aaaa").take(1000000).collect::(), repeat("dddd").take(1000000).collect::(), )); + +#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))] +bench_is_match_set!(is_match_set, + true, + RegexSet::new(vec![ + "aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23.", + ]).unwrap(), + format!("{}a482c{}", + repeat('a').take(10).collect::(), + repeat('b').take(10).collect::()) + ); + +#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))] +bench_matches_set!(matches_set, + true, + RegexSet::new(vec![ + "aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23.", + ]).unwrap(), + format!("{}a482c{}", + repeat('a').take(10).collect::(), + repeat('b').take(10).collect::()) + ); diff --git a/ci/script.sh b/ci/script.sh index eea7d39d2a..ba66be192b 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -3,18 +3,7 @@ # This is the main CI script for testing the regex crate and its sub-crates. set -ex -MSRV="1.20.0" - -# If we're building on 1.20, then lazy_static 1.2 will fail to build since it -# updated its MSRV to 1.24.1. In this case, we force the use of lazy_static 1.1 -# to build on Rust 1.20.0. -if [ "$TRAVIS_RUST_VERSION" = "$MSRV" ]; then - cargo update -p lazy_static --precise 1.1.0 - # On older versions of Cargo, this apparently needs to be run twice - # if Cargo.lock didn't previously exist. Since this command should be - # idempotent, we run it again unconditionally. - cargo update -p lazy_static --precise 1.1.0 -fi +MSRV="1.24.1" # Builds the regex crate and runs tests. cargo build --verbose diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000000..c7ad93bafe --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +disable_all_formatting = true diff --git a/src/dfa.rs b/src/dfa.rs index 992a45cc62..e662b73380 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -50,6 +50,7 @@ use std::collections::HashMap; use std::fmt; use std::iter::repeat; use std::mem; +use std::sync::Arc; use exec::ProgramCache; use prog::{Inst, Program}; @@ -88,7 +89,7 @@ pub fn can_exec(insts: &Program) -> bool { /// This cache is reused between multiple invocations of the same regex /// program. (It is not shared simultaneously between threads. If there is /// contention, then new caches are created.) -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct Cache { /// Group persistent DFA related cache state together. The sparse sets /// listed below are used as scratch space while computing uncached states. @@ -107,7 +108,7 @@ pub struct Cache { /// `CacheInner` is logically just a part of Cache, but groups together fields /// that aren't passed as function parameters throughout search. (This split /// is mostly an artifact of the borrow checker. It is happily paid.) -#[derive(Clone, Debug)] +#[derive(Debug)] struct CacheInner { /// A cache of pre-compiled DFA states, keyed by the set of NFA states /// and the set of empty-width flags set at the byte in the input when the @@ -117,7 +118,7 @@ struct CacheInner { /// things, we just pass indexes around manually. The performance impact of /// this is probably an instruction or two in the inner loop. However, on /// 64 bit, each StatePtr is half the size of a *State. - compiled: HashMap, + compiled: StateMap, /// The transition table. /// /// The transition table is laid out in row-major order, where states are @@ -134,9 +135,6 @@ struct CacheInner { /// bytes that never discriminate a distinct path through the DFA from each /// other. trans: Transitions, - /// Our set of states. Note that `StatePtr / num_byte_classes` indexes - /// this Vec rather than just a `StatePtr`. - states: Vec, /// A set of cached start states, which are limited to the number of /// permutations of flags set just before the initial byte of input. (The /// index into this vec is a `EmptyFlags`.) @@ -155,6 +153,9 @@ struct CacheInner { /// The total heap size of the DFA's cache. We use this to determine when /// we should flush the cache. size: usize, + /// Scratch space used when building instruction pointer lists for new + /// states. This helps amortize allocation. + insts_scratch_space: Vec, } /// The transition table. @@ -267,8 +268,8 @@ impl Result { /// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8) /// is coded as a flag, followed by 4 bytes encoding the delta. #[derive(Clone, Eq, Hash, PartialEq)] -struct State{ - data: Box<[u8]>, +struct State { + data: Arc<[u8]>, } /// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes @@ -428,13 +429,13 @@ impl Cache { let starts = vec![STATE_UNKNOWN; 256]; let mut cache = Cache { inner: CacheInner { - compiled: HashMap::new(), + compiled: StateMap::new(num_byte_classes), trans: Transitions::new(num_byte_classes), - states: vec![], start_states: starts, stack: vec![], flush_count: 0, size: 0, + insts_scratch_space: vec![], }, qcur: SparseSet::new(prog.insts.len()), qnext: SparseSet::new(prog.insts.len()), @@ -1179,7 +1180,11 @@ impl<'a> Fsm<'a> { Some(v) => v, }; // In the cache? Cool. Done. - if let Some(&si) = self.cache.compiled.get(&key) { + if let Some(si) = self + .cache + .compiled + .get_ptr(&key) + { return Some(si); } // If the cache has gotten too big, wipe it. @@ -1219,8 +1224,14 @@ impl<'a> Fsm<'a> { // are conditional, we need to make them part of a state's key in the // cache. + let mut insts = mem::replace( + &mut self.cache.insts_scratch_space, + vec![], + ); + insts.clear(); // Reserve 1 byte for flags. - let mut insts = vec![0]; + insts.push(0); + let mut prev = 0; for &ip in q { let ip = usize_to_u32(ip); @@ -1244,13 +1255,16 @@ impl<'a> Fsm<'a> { // see a match when expanding NFA states previously, then this is a // dead state and no amount of additional input can transition out // of this state. - if insts.len() == 1 && !state_flags.is_match() { - None - } else { - let StateFlags(f) = *state_flags; - insts[0] = f; - Some(State { data: insts.into_boxed_slice() }) - } + let opt_state = + if insts.len() == 1 && !state_flags.is_match() { + None + } else { + let StateFlags(f) = *state_flags; + insts[0] = f; + Some(State { data: Arc::from(&*insts) }) + }; + self.cache.insts_scratch_space = insts; + opt_state } /// Clears the cache, but saves and restores current_state if it is not @@ -1265,7 +1279,7 @@ impl<'a> Fsm<'a> { &mut self, current_state: Option<&mut StatePtr>, ) -> bool { - if self.cache.states.is_empty() { + if self.cache.compiled.is_empty() { // Nothing to clear... return true; } @@ -1295,7 +1309,7 @@ impl<'a> Fsm<'a> { // 10 or fewer bytes per state. // Additionally, we permit the cache to be flushed a few times before // caling it quits. - let nstates = self.cache.states.len(); + let nstates = self.cache.compiled.len(); if self.cache.flush_count >= 3 && self.at >= self.last_cache_flush && (self.at - self.last_cache_flush) <= 10 * nstates { @@ -1314,7 +1328,6 @@ impl<'a> Fsm<'a> { }; self.cache.reset_size(); self.cache.trans.clear(); - self.cache.states.clear(); self.cache.compiled.clear(); for s in &mut self.cache.start_states { *s = STATE_UNKNOWN; @@ -1334,7 +1347,7 @@ impl<'a> Fsm<'a> { fn restore_state(&mut self, state: State) -> Option { // If we've already stored this state, just return a pointer to it. // None will be the wiser. - if let Some(&si) = self.cache.compiled.get(&state) { + if let Some(si) = self.cache.compiled.get_ptr(&state) { return Some(si); } self.add_state(state) @@ -1475,7 +1488,7 @@ impl<'a> Fsm<'a> { /// Returns a reference to a State given a pointer to it. fn state(&self, si: StatePtr) -> &State { - &self.cache.states[si as usize / self.num_byte_classes()] + self.cache.compiled.get_state(si).unwrap() } /// Adds the given state to the DFA. @@ -1508,16 +1521,13 @@ impl<'a> Fsm<'a> { // so we can find it later. self.cache.size += self.cache.trans.state_heap_size() - + (2 * state.data.len()) + + state.data.len() + (2 * mem::size_of::()) + mem::size_of::(); - self.cache.states.push(state.clone()); self.cache.compiled.insert(state, si); // Transition table and set of states and map should all be in sync. - debug_assert!(self.cache.states.len() + debug_assert!(self.cache.compiled.len() == self.cache.trans.num_states()); - debug_assert!(self.cache.states.len() - == self.cache.compiled.len()); Some(si) } @@ -1598,6 +1608,66 @@ impl<'a> Fsm<'a> { } } +/// An abstraction for representing a map of states. The map supports two +/// different ways of state lookup. One is fast constant time access via a +/// state pointer. The other is a hashmap lookup based on the DFA's +/// constituent NFA states. +/// +/// A DFA state internally uses an Arc such that we only need to store the +/// set of NFA states on the heap once, even though we support looking up +/// states by two different means. A more natural way to express this might +/// use raw pointers, but an Arc is safe and effectively achieves the same +/// thing. +#[derive(Debug)] +struct StateMap { + /// The keys are not actually static but rely on always pointing to a + /// buffer in `states` which will never be moved except when clearing + /// the map or on drop, in which case the keys of this map will be + /// removed before + map: HashMap, + /// Our set of states. Note that `StatePtr / num_byte_classes` indexes + /// this Vec rather than just a `StatePtr`. + states: Vec, + /// The number of byte classes in the DFA. Used to index `states`. + num_byte_classes: usize, +} + +impl StateMap { + fn new(num_byte_classes: usize) -> StateMap { + StateMap { + map: HashMap::new(), + states: vec![], + num_byte_classes: num_byte_classes, + } + } + + fn len(&self) -> usize { + self.states.len() + } + + fn is_empty(&self) -> bool { + self.states.is_empty() + } + + fn get_ptr(&self, state: &State) -> Option { + self.map.get(state).cloned() + } + + fn get_state(&self, si: StatePtr) -> Option<&State> { + self.states.get(si as usize / self.num_byte_classes) + } + + fn insert(&mut self, state: State, si: StatePtr) { + self.map.insert(state.clone(), si); + self.states.push(state); + } + + fn clear(&mut self) { + self.map.clear(); + self.states.clear(); + } +} + impl Transitions { /// Create a new transition table. /// @@ -1844,6 +1914,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) { mod tests { extern crate rand; + use std::sync::Arc; use quickcheck::{QuickCheck, StdGen, quickcheck}; use super::{ StateFlags, State, push_inst_ptr, @@ -1858,7 +1929,7 @@ mod tests { for &ip in ips.iter() { push_inst_ptr(&mut data, &mut prev, ip); } - let state = State { data: data.into_boxed_slice() }; + let state = State { data: Arc::from(&data[..]) }; let expected: Vec = ips.into_iter().map(|ip| ip as usize).collect(); diff --git a/src/exec.rs b/src/exec.rs index 578289aa5c..52d970f801 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -1271,7 +1271,7 @@ enum MatchNfaType { /// available to a particular program. pub type ProgramCache = RefCell; -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct ProgramCacheInner { pub pikevm: pikevm::Cache, pub backtrack: backtrack::Cache, diff --git a/src/prog.rs b/src/prog.rs index 198e1f45c4..b62802f2b7 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -161,6 +161,7 @@ impl Program { impl Deref for Program { type Target = [Inst]; + #[inline(always)] fn deref(&self) -> &Self::Target { &*self.insts } diff --git a/src/sparse.rs b/src/sparse.rs index 3cb8978ee4..ae4cae4b9b 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -14,52 +14,49 @@ use std::slice; #[derive(Clone, Debug)] pub struct SparseSet { /// Dense contains the instruction pointers in the order in which they - /// were inserted. Accessing elements >= self.size is illegal. + /// were inserted. dense: Vec, /// Sparse maps instruction pointers to their location in dense. /// /// An instruction pointer is in the set if and only if - /// sparse[ip] < size && ip == dense[sparse[ip]]. - sparse: Vec, - /// The number of elements in the set. - size: usize, + /// sparse[ip] < dense.len() && ip == dense[sparse[ip]]. + sparse: Box<[usize]>, } impl SparseSet { pub fn new(size: usize) -> SparseSet { SparseSet { - dense: vec![0; size], - sparse: vec![0; size], - size: 0, + dense: Vec::with_capacity(size), + sparse: vec![0; size].into_boxed_slice(), } } pub fn len(&self) -> usize { - self.size + self.dense.len() } pub fn is_empty(&self) -> bool { - self.size == 0 + self.dense.is_empty() } pub fn capacity(&self) -> usize { - self.dense.len() + self.dense.capacity() } pub fn insert(&mut self, value: usize) { - let i = self.size; - self.dense[i] = value; + let i = self.len(); + assert!(i < self.capacity()); + self.dense.push(value); self.sparse[value] = i; - self.size += 1; } pub fn contains(&self, value: usize) -> bool { let i = self.sparse[value]; - i < self.size && self.dense[i] == value + self.dense.get(i) == Some(&value) } pub fn clear(&mut self) { - self.size = 0; + self.dense.clear(); } } @@ -67,7 +64,7 @@ impl Deref for SparseSet { type Target = [usize]; fn deref(&self) -> &Self::Target { - &self.dense[0..self.size] + &self.dense } }