From 2e0a4884bd8021d39af5d9531c9a8ecc9ca51078 Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Wed, 22 Aug 2018 14:49:03 +0200 Subject: [PATCH 1/8] bench: add RegexSet benchmarks --- bench/src/bench.rs | 56 ++++++++++++++++++++++++++++++++++++++++++++-- bench/src/misc.rs | 24 ++++++++++++++++++++ 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 6cb56db850..ac5c3d8502 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -44,9 +44,9 @@ pub use ffi::re2::Regex; #[cfg(feature = "re-dphobos")] pub use ffi::d_phobos::Regex; #[cfg(feature = "re-rust")] -pub use regex::Regex; +pub use regex::{Regex, RegexSet}; #[cfg(feature = "re-rust-bytes")] -pub use regex::bytes::Regex; +pub use regex::bytes::{Regex, RegexSet}; #[cfg(feature = "re-tcl")] pub use ffi::tcl::Regex; @@ -272,6 +272,58 @@ macro_rules! bench_captures { } } +// USAGE: bench_is_match_set!(name, is_match, regex, haystack) +macro_rules! bench_is_match_set { + ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + lazy_static! { + static ref RE: Mutex = Mutex::new($re); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + if re.is_match(&text) != $is_match { + if $is_match { + panic!("expected match, got not match"); + } else { + panic!("expected no match, got match"); + } + } + }); + } + } +} + +// USAGE: bench_matches_set!(name, is_match, regex, haystack) +macro_rules! bench_matches_set { + ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + lazy_static! { + static ref RE: Mutex = Mutex::new($re); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + if re.matches(&text).matched_any() != $is_match { + if $is_match { + panic!("expected match, got not match"); + } else { + panic!("expected no match, got match"); + } + } + }); + } + } +} + mod ffi; mod misc; mod regexdna; diff --git a/bench/src/misc.rs b/bench/src/misc.rs index ad516e23e4..b5b25b737c 100644 --- a/bench/src/misc.rs +++ b/bench/src/misc.rs @@ -14,6 +14,8 @@ use std::iter::repeat; use test::Bencher; +#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))] +use RegexSet; use {Regex, Text}; #[cfg(not(feature = "re-onig"))] @@ -278,3 +280,25 @@ bench_captures!(short_haystack_1000000x, repeat("aaaa").take(1000000).collect::(), repeat("dddd").take(1000000).collect::(), )); + +#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))] +bench_is_match_set!(is_match_set, + true, + RegexSet::new(vec![ + "aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23.", + ]).unwrap(), + format!("{}a482c{}", + repeat('a').take(10).collect::(), + repeat('b').take(10).collect::()) + ); + +#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))] +bench_matches_set!(matches_set, + true, + RegexSet::new(vec![ + "aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23.", + ]).unwrap(), + format!("{}a482c{}", + repeat('a').take(10).collect::(), + repeat('b').take(10).collect::()) + ); From 40ca5d960a278ce592c980a636078d45ac8e30f4 Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Thu, 23 Aug 2018 09:04:52 +0200 Subject: [PATCH 2/8] refactor: simplify SparseSet slightly --- src/sparse.rs | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/sparse.rs b/src/sparse.rs index 3cb8978ee4..ae4cae4b9b 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -14,52 +14,49 @@ use std::slice; #[derive(Clone, Debug)] pub struct SparseSet { /// Dense contains the instruction pointers in the order in which they - /// were inserted. Accessing elements >= self.size is illegal. + /// were inserted. dense: Vec, /// Sparse maps instruction pointers to their location in dense. /// /// An instruction pointer is in the set if and only if - /// sparse[ip] < size && ip == dense[sparse[ip]]. - sparse: Vec, - /// The number of elements in the set. - size: usize, + /// sparse[ip] < dense.len() && ip == dense[sparse[ip]]. + sparse: Box<[usize]>, } impl SparseSet { pub fn new(size: usize) -> SparseSet { SparseSet { - dense: vec![0; size], - sparse: vec![0; size], - size: 0, + dense: Vec::with_capacity(size), + sparse: vec![0; size].into_boxed_slice(), } } pub fn len(&self) -> usize { - self.size + self.dense.len() } pub fn is_empty(&self) -> bool { - self.size == 0 + self.dense.is_empty() } pub fn capacity(&self) -> usize { - self.dense.len() + self.dense.capacity() } pub fn insert(&mut self, value: usize) { - let i = self.size; - self.dense[i] = value; + let i = self.len(); + assert!(i < self.capacity()); + self.dense.push(value); self.sparse[value] = i; - self.size += 1; } pub fn contains(&self, value: usize) -> bool { let i = self.sparse[value]; - i < self.size && self.dense[i] == value + self.dense.get(i) == Some(&value) } pub fn clear(&mut self) { - self.size = 0; + self.dense.clear(); } } @@ -67,7 +64,7 @@ impl Deref for SparseSet { type Target = [usize]; fn deref(&self) -> &Self::Target { - &self.dense[0..self.size] + &self.dense } } From 2d6a27366bcd90489eae66e543131d44dcfffc6a Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Thu, 23 Aug 2018 09:07:23 +0200 Subject: [PATCH 3/8] perf: always inline Program's Deref Shaves of a few percent on dfa heavy regexes such as `RegexSet` use. --- src/prog.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/prog.rs b/src/prog.rs index 198e1f45c4..b62802f2b7 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -161,6 +161,7 @@ impl Program { impl Deref for Program { type Target = [Inst]; + #[inline(always)] fn deref(&self) -> &Self::Target { &*self.insts } From 80180b5587a896b41abbecb8be701c3eb0b2277b Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Fri, 24 Aug 2018 14:54:04 +0200 Subject: [PATCH 4/8] perf: cache the Vec used to build state keys Avoids frequent re-allocations while the instructions are pushed to it. --- src/dfa.rs | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index 992a45cc62..ca379da380 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -155,6 +155,9 @@ struct CacheInner { /// The total heap size of the DFA's cache. We use this to determine when /// we should flush the cache. size: usize, + /// Scratch space used when building instruction pointer lists for new + /// states. This helps amortize allocation. + insts_scratch_space: Vec, } /// The transition table. @@ -435,6 +438,7 @@ impl Cache { stack: vec![], flush_count: 0, size: 0, + insts_scratch_space: vec![], }, qcur: SparseSet::new(prog.insts.len()), qnext: SparseSet::new(prog.insts.len()), @@ -1219,8 +1223,14 @@ impl<'a> Fsm<'a> { // are conditional, we need to make them part of a state's key in the // cache. + let mut insts = mem::replace( + &mut self.cache.insts_scratch_space, + vec![], + ); + insts.clear(); // Reserve 1 byte for flags. - let mut insts = vec![0]; + insts.push(0); + let mut prev = 0; for &ip in q { let ip = usize_to_u32(ip); @@ -1244,13 +1254,16 @@ impl<'a> Fsm<'a> { // see a match when expanding NFA states previously, then this is a // dead state and no amount of additional input can transition out // of this state. - if insts.len() == 1 && !state_flags.is_match() { - None - } else { - let StateFlags(f) = *state_flags; - insts[0] = f; - Some(State { data: insts.into_boxed_slice() }) - } + let opt_state = + if insts.len() == 1 && !state_flags.is_match() { + None + } else { + let StateFlags(f) = *state_flags; + insts[0] = f; + Some(State { data: insts.clone().into_boxed_slice() }) + }; + self.cache.insts_scratch_space = insts; + opt_state } /// Clears the cache, but saves and restores current_state if it is not From 0179cab4b1aeef04ccffd4efc33c6964271f3262 Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Fri, 7 Sep 2018 17:41:21 +0200 Subject: [PATCH 5/8] regex: remove Clone from Cache --- src/dfa.rs | 4 ++-- src/exec.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index ca379da380..6d93aae731 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -88,7 +88,7 @@ pub fn can_exec(insts: &Program) -> bool { /// This cache is reused between multiple invocations of the same regex /// program. (It is not shared simultaneously between threads. If there is /// contention, then new caches are created.) -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct Cache { /// Group persistent DFA related cache state together. The sparse sets /// listed below are used as scratch space while computing uncached states. @@ -107,7 +107,7 @@ pub struct Cache { /// `CacheInner` is logically just a part of Cache, but groups together fields /// that aren't passed as function parameters throughout search. (This split /// is mostly an artifact of the borrow checker. It is happily paid.) -#[derive(Clone, Debug)] +#[derive(Debug)] struct CacheInner { /// A cache of pre-compiled DFA states, keyed by the set of NFA states /// and the set of empty-width flags set at the byte in the input when the diff --git a/src/exec.rs b/src/exec.rs index 578289aa5c..52d970f801 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -1271,7 +1271,7 @@ enum MatchNfaType { /// available to a particular program. pub type ProgramCache = RefCell; -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct ProgramCacheInner { pub pikevm: pikevm::Cache, pub backtrack: backtrack::Cache, From afe668352060d207ef82d24360c315e24aef0f7e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 30 Nov 2018 21:02:11 -0500 Subject: [PATCH 6/8] perf: avoid storing compiled dfa states twice --- src/dfa.rs | 100 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 21 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index 6d93aae731..e662b73380 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -50,6 +50,7 @@ use std::collections::HashMap; use std::fmt; use std::iter::repeat; use std::mem; +use std::sync::Arc; use exec::ProgramCache; use prog::{Inst, Program}; @@ -117,7 +118,7 @@ struct CacheInner { /// things, we just pass indexes around manually. The performance impact of /// this is probably an instruction or two in the inner loop. However, on /// 64 bit, each StatePtr is half the size of a *State. - compiled: HashMap, + compiled: StateMap, /// The transition table. /// /// The transition table is laid out in row-major order, where states are @@ -134,9 +135,6 @@ struct CacheInner { /// bytes that never discriminate a distinct path through the DFA from each /// other. trans: Transitions, - /// Our set of states. Note that `StatePtr / num_byte_classes` indexes - /// this Vec rather than just a `StatePtr`. - states: Vec, /// A set of cached start states, which are limited to the number of /// permutations of flags set just before the initial byte of input. (The /// index into this vec is a `EmptyFlags`.) @@ -270,8 +268,8 @@ impl Result { /// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8) /// is coded as a flag, followed by 4 bytes encoding the delta. #[derive(Clone, Eq, Hash, PartialEq)] -struct State{ - data: Box<[u8]>, +struct State { + data: Arc<[u8]>, } /// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes @@ -431,9 +429,8 @@ impl Cache { let starts = vec![STATE_UNKNOWN; 256]; let mut cache = Cache { inner: CacheInner { - compiled: HashMap::new(), + compiled: StateMap::new(num_byte_classes), trans: Transitions::new(num_byte_classes), - states: vec![], start_states: starts, stack: vec![], flush_count: 0, @@ -1183,7 +1180,11 @@ impl<'a> Fsm<'a> { Some(v) => v, }; // In the cache? Cool. Done. - if let Some(&si) = self.cache.compiled.get(&key) { + if let Some(si) = self + .cache + .compiled + .get_ptr(&key) + { return Some(si); } // If the cache has gotten too big, wipe it. @@ -1260,7 +1261,7 @@ impl<'a> Fsm<'a> { } else { let StateFlags(f) = *state_flags; insts[0] = f; - Some(State { data: insts.clone().into_boxed_slice() }) + Some(State { data: Arc::from(&*insts) }) }; self.cache.insts_scratch_space = insts; opt_state @@ -1278,7 +1279,7 @@ impl<'a> Fsm<'a> { &mut self, current_state: Option<&mut StatePtr>, ) -> bool { - if self.cache.states.is_empty() { + if self.cache.compiled.is_empty() { // Nothing to clear... return true; } @@ -1308,7 +1309,7 @@ impl<'a> Fsm<'a> { // 10 or fewer bytes per state. // Additionally, we permit the cache to be flushed a few times before // caling it quits. - let nstates = self.cache.states.len(); + let nstates = self.cache.compiled.len(); if self.cache.flush_count >= 3 && self.at >= self.last_cache_flush && (self.at - self.last_cache_flush) <= 10 * nstates { @@ -1327,7 +1328,6 @@ impl<'a> Fsm<'a> { }; self.cache.reset_size(); self.cache.trans.clear(); - self.cache.states.clear(); self.cache.compiled.clear(); for s in &mut self.cache.start_states { *s = STATE_UNKNOWN; @@ -1347,7 +1347,7 @@ impl<'a> Fsm<'a> { fn restore_state(&mut self, state: State) -> Option { // If we've already stored this state, just return a pointer to it. // None will be the wiser. - if let Some(&si) = self.cache.compiled.get(&state) { + if let Some(si) = self.cache.compiled.get_ptr(&state) { return Some(si); } self.add_state(state) @@ -1488,7 +1488,7 @@ impl<'a> Fsm<'a> { /// Returns a reference to a State given a pointer to it. fn state(&self, si: StatePtr) -> &State { - &self.cache.states[si as usize / self.num_byte_classes()] + self.cache.compiled.get_state(si).unwrap() } /// Adds the given state to the DFA. @@ -1521,16 +1521,13 @@ impl<'a> Fsm<'a> { // so we can find it later. self.cache.size += self.cache.trans.state_heap_size() - + (2 * state.data.len()) + + state.data.len() + (2 * mem::size_of::()) + mem::size_of::(); - self.cache.states.push(state.clone()); self.cache.compiled.insert(state, si); // Transition table and set of states and map should all be in sync. - debug_assert!(self.cache.states.len() + debug_assert!(self.cache.compiled.len() == self.cache.trans.num_states()); - debug_assert!(self.cache.states.len() - == self.cache.compiled.len()); Some(si) } @@ -1611,6 +1608,66 @@ impl<'a> Fsm<'a> { } } +/// An abstraction for representing a map of states. The map supports two +/// different ways of state lookup. One is fast constant time access via a +/// state pointer. The other is a hashmap lookup based on the DFA's +/// constituent NFA states. +/// +/// A DFA state internally uses an Arc such that we only need to store the +/// set of NFA states on the heap once, even though we support looking up +/// states by two different means. A more natural way to express this might +/// use raw pointers, but an Arc is safe and effectively achieves the same +/// thing. +#[derive(Debug)] +struct StateMap { + /// The keys are not actually static but rely on always pointing to a + /// buffer in `states` which will never be moved except when clearing + /// the map or on drop, in which case the keys of this map will be + /// removed before + map: HashMap, + /// Our set of states. Note that `StatePtr / num_byte_classes` indexes + /// this Vec rather than just a `StatePtr`. + states: Vec, + /// The number of byte classes in the DFA. Used to index `states`. + num_byte_classes: usize, +} + +impl StateMap { + fn new(num_byte_classes: usize) -> StateMap { + StateMap { + map: HashMap::new(), + states: vec![], + num_byte_classes: num_byte_classes, + } + } + + fn len(&self) -> usize { + self.states.len() + } + + fn is_empty(&self) -> bool { + self.states.is_empty() + } + + fn get_ptr(&self, state: &State) -> Option { + self.map.get(state).cloned() + } + + fn get_state(&self, si: StatePtr) -> Option<&State> { + self.states.get(si as usize / self.num_byte_classes) + } + + fn insert(&mut self, state: State, si: StatePtr) { + self.map.insert(state.clone(), si); + self.states.push(state); + } + + fn clear(&mut self) { + self.map.clear(); + self.states.clear(); + } +} + impl Transitions { /// Create a new transition table. /// @@ -1857,6 +1914,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) { mod tests { extern crate rand; + use std::sync::Arc; use quickcheck::{QuickCheck, StdGen, quickcheck}; use super::{ StateFlags, State, push_inst_ptr, @@ -1871,7 +1929,7 @@ mod tests { for &ip in ips.iter() { push_inst_ptr(&mut data, &mut prev, ip); } - let state = State { data: data.into_boxed_slice() }; + let state = State { data: Arc::from(&data[..]) }; let expected: Vec = ips.into_iter().map(|ip| ip as usize).collect(); From 554e4c89f703ae69ab336b5a8672063c592c4e0b Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Mon, 8 Oct 2018 16:19:57 +0200 Subject: [PATCH 7/8] ci: bump minimum rust version to 1.24.1 This was motivated in part for using `From<&[T]> for Arc`, which technically only requires 1.21.0. However, lazy_static 1.2, which is a dependency of regex, now requires 1.24.1, so we might as well ride the wave. --- .travis.yml | 2 +- README.md | 2 +- ci/script.sh | 13 +------------ 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index c49ba699fa..f6e7e4f5c9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ dist: trusty sudo: required language: rust rust: - - 1.20.0 + - 1.24.1 - stable - beta - nightly diff --git a/README.md b/README.md index f9908caeda..d90c00fc1d 100644 --- a/README.md +++ b/README.md @@ -210,7 +210,7 @@ recommended for general use. ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.20.0`. +This crate's minimum supported `rustc` version is `1.24.1`. The current **tentative** policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if diff --git a/ci/script.sh b/ci/script.sh index eea7d39d2a..ba66be192b 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -3,18 +3,7 @@ # This is the main CI script for testing the regex crate and its sub-crates. set -ex -MSRV="1.20.0" - -# If we're building on 1.20, then lazy_static 1.2 will fail to build since it -# updated its MSRV to 1.24.1. In this case, we force the use of lazy_static 1.1 -# to build on Rust 1.20.0. -if [ "$TRAVIS_RUST_VERSION" = "$MSRV" ]; then - cargo update -p lazy_static --precise 1.1.0 - # On older versions of Cargo, this apparently needs to be run twice - # if Cargo.lock didn't previously exist. Since this command should be - # idempotent, we run it again unconditionally. - cargo update -p lazy_static --precise 1.1.0 -fi +MSRV="1.24.1" # Builds the regex crate and runs tests. cargo build --verbose From ae6edce4d5dc8d70912a799c1621a20598479b11 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 30 Nov 2018 21:24:19 -0500 Subject: [PATCH 8/8] infra: disable rustfmt rustfmt is not something I'm willing to adopt at this time, but contributors are still trying to use it anyway. This is irksome, so we attempt to prevent this from happening by creating a config that explicitly disables rustfmt. --- rustfmt.toml | 1 + 1 file changed, 1 insertion(+) create mode 100644 rustfmt.toml diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000000..c7ad93bafe --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +disable_all_formatting = true