From 73d0b999b86f27f32f891031ada83e7e376bc3bf Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Wed, 22 Aug 2018 14:49:03 +0200
Subject: [PATCH 01/10] Add RegexSet benchmarks

---
 bench/src/bench.rs | 54 ++++++++++++++++++++++++++++++++++++++++++++--
 bench/src/misc.rs  | 20 ++++++++++++++++-
 2 files changed, 71 insertions(+), 3 deletions(-)
diff --git a/bench/src/bench.rs b/bench/src/bench.rs
index 6cb56db850..8c2d082201 100644
--- a/bench/src/bench.rs
+++ b/bench/src/bench.rs
@@ -44,9 +44,9 @@ pub use ffi::re2::Regex;
 #[cfg(feature = "re-dphobos")]
 pub use ffi::d_phobos::Regex;
 #[cfg(feature = "re-rust")]
-pub use regex::Regex;
+pub use regex::{Regex, RegexSet};
 #[cfg(feature = "re-rust-bytes")]
-pub use regex::bytes::Regex;
+pub use regex::bytes::{Regex, RegexSet};
 #[cfg(feature = "re-tcl")]
 pub use ffi::tcl::Regex;
 
@@ -272,6 +272,56 @@ macro_rules! bench_captures {
     }
 }
 
+macro_rules! bench_is_match_set {
+    ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            use std::sync::Mutex;
+            lazy_static! {
+                static ref RE: Mutex<RegexSet> = Mutex::new($re);
+                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
+            };
+            let re = RE.lock().unwrap();
+            let text = TEXT.lock().unwrap();
+            b.bytes = text.len() as u64;
+            b.iter(|| {
+                if re.is_match(&text) != $is_match {
+                    if $is_match {
+                        panic!("expected match, got not match");
+                    } else {
+                        panic!("expected no match, got match");
+                    }
+                }
+            });
+        }
+    }
+}
+
+macro_rules! bench_matches_set {
+    ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            use std::sync::Mutex;
+            lazy_static! {
+                static ref RE: Mutex<RegexSet> = Mutex::new($re);
+                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
+            };
+            let re = RE.lock().unwrap();
+            let text = TEXT.lock().unwrap();
+            b.bytes = text.len() as u64;
+            b.iter(|| {
+                if re.matches(&text).matched_any() != $is_match {
+                    if $is_match {
+                        panic!("expected match, got not match");
+                    } else {
+                        panic!("expected no match, got match");
+                    }
+                }
+            });
+        }
+    }
+}
+
 mod ffi;
 mod misc;
 mod regexdna;
diff --git a/bench/src/misc.rs b/bench/src/misc.rs
index ad516e23e4..69ea21f842 100644
--- a/bench/src/misc.rs
+++ b/bench/src/misc.rs
@@ -14,7 +14,7 @@ use std::iter::repeat;
 
 use test::Bencher;
 
-use {Regex, Text};
+use {Regex, RegexSet, Text};
 
 #[cfg(not(feature = "re-onig"))]
 #[cfg(not(feature = "re-pcre1"))]
@@ -278,3 +278,21 @@ bench_captures!(short_haystack_1000000x,
             repeat("aaaa").take(1000000).collect::<String>(),
             repeat("dddd").take(1000000).collect::<String>(),
             ));
+
+#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
+bench_is_match_set!(is_match_set,
+    true,
+    RegexSet::new(vec!["aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23."]).unwrap(),
+    format!("{}a482c{}",
+            repeat('a').take(10).collect::<String>(),
+            repeat('b').take(10).collect::<String>())
+    );
+
+#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
+bench_matches_set!(matches_set,
+    true,
+    RegexSet::new(vec!["aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23."]).unwrap(),
+    format!("{}a482c{}",
+            repeat('a').take(10).collect::<String>(),
+            repeat('b').take(10).collect::<String>())
+    );

From 939111439d03d8cedeb7905ff132bac73765b5f3 Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Thu, 23 Aug 2018 09:04:52 +0200
Subject: [PATCH 02/10] refactor: Simplify SparseSet slightly

---
 src/sparse.rs | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/sparse.rs b/src/sparse.rs
index 34c05e7157..0629dea793 100644
--- a/src/sparse.rs
+++ b/src/sparse.rs
@@ -14,52 +14,49 @@ use std::slice;
 #[derive(Clone, Debug)]
 pub struct SparseSet {
     /// Dense contains the instruction pointers in the order in which they
-    /// were inserted. Accessing elements >= self.size is illegal.
+    /// were inserted.
     dense: Vec<usize>,
     /// Sparse maps instruction pointers to their location in dense.
     ///
     /// An instruction pointer is in the set if and only if
-    /// sparse[ip] < size && ip == dense[sparse[ip]].
-    sparse: Vec<usize>,
-    /// The number of elements in the set.
-    size: usize,
+    /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
+    sparse: Box<[usize]>,
 }
 
 impl SparseSet {
     pub fn new(size: usize) -> SparseSet {
         SparseSet {
-            dense: vec![0; size],
-            sparse: vec![0; size],
-            size: 0,
+            dense: Vec::with_capacity(size),
+            sparse: vec![0; size].into_boxed_slice(),
         }
     }
 
     pub fn len(&self) -> usize {
-        self.size
+        self.dense.len()
     }
 
     pub fn is_empty(&self) -> bool {
-        self.size == 0
+        self.dense.is_empty()
     }
 
     pub fn capacity(&self) -> usize {
-        self.dense.len()
+        self.dense.capacity()
     }
 
     pub fn insert(&mut self, value: usize) {
-        let i = self.size;
-        self.dense[i] = value;
+        let i = self.len();
+        assert!(i < self.capacity());
+        self.dense.push(value);
         self.sparse[value] = i;
-        self.size += 1;
     }
 
     pub fn contains(&self, value: usize) -> bool {
         let i = self.sparse[value];
-        i < self.size && self.dense[i] == value
+        self.dense.get(i) == Some(&value)
     }
 
     pub fn clear(&mut self) {
-        self.size = 0;
+        self.dense.clear();
     }
 }
 
@@ -67,7 +64,7 @@ impl Deref for SparseSet {
     type Target = [usize];
 
     fn deref(&self) -> &Self::Target {
-        &self.dense[0..self.size]
+        &self.dense
     }
 }
 

From 1a0f55cfba4a9be0f9c90db239269fe66612b743 Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Thu, 23 Aug 2018 09:07:23 +0200
Subject: [PATCH 03/10] perf: Always inline Program's Deref

Shaves of a few percent on dfa heavy regexes such as `RegexSet` use.
---
 src/prog.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/prog.rs b/src/prog.rs
index 198e1f45c4..b62802f2b7 100644
--- a/src/prog.rs
+++ b/src/prog.rs
@@ -161,6 +161,7 @@ impl Program {
 impl Deref for Program {
     type Target = [Inst];
 
+    #[inline(always)]
     fn deref(&self) -> &Self::Target {
         &*self.insts
     }

From 5209bf10815346b496117fff57a8e782d5ef760c Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Fri, 24 Aug 2018 14:54:04 +0200
Subject: [PATCH 04/10] perf: Cache the Vec used to build state keys

Avoids frequent re-allocations while the instructions are pushed to it.
---
 src/dfa.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/dfa.rs b/src/dfa.rs
index 992a45cc62..7d77e38283 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -155,6 +155,8 @@ struct CacheInner {
     /// The total heap size of the DFA's cache. We use this to determine when
     /// we should flush the cache.
     size: usize,
+
+    insts_scratch_space: Vec<u8>,   
 }
 
 /// The transition table.
@@ -435,6 +437,7 @@ impl Cache {
                 stack: vec![],
                 flush_count: 0,
                 size: 0,
+                insts_scratch_space: vec![],
             },
             qcur: SparseSet::new(prog.insts.len()),
             qnext: SparseSet::new(prog.insts.len()),
@@ -1220,7 +1223,10 @@ impl<'a> Fsm<'a> {
         // cache.
 
         // Reserve 1 byte for flags.
-        let mut insts = vec![0];
+        let mut insts = mem::replace(&mut self.cache.insts_scratch_space, Vec::new());
+        insts.clear();
+        insts.push(0);
+
         let mut prev = 0;
         for &ip in q {
             let ip = usize_to_u32(ip);
@@ -1244,13 +1250,15 @@ impl<'a> Fsm<'a> {
         // see a match when expanding NFA states previously, then this is a
         // dead state and no amount of additional input can transition out
         // of this state.
-        if insts.len() == 1 && !state_flags.is_match() {
+        let opt_state = if insts.len() == 1 && !state_flags.is_match() {
             None
         } else {
             let StateFlags(f) = *state_flags;
             insts[0] = f;
-            Some(State { data: insts.into_boxed_slice() })
-        }
+            Some(State { data: insts.clone().into_boxed_slice() })
+        };
+        self.cache.insts_scratch_space = insts;
+        opt_state
     }
 
     /// Clears the cache, but saves and restores current_state if it is not

From 3e50be90fd181c8785cf89698e66b85c13620d0e Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Fri, 7 Sep 2018 18:25:37 +0200
Subject: [PATCH 05/10] perf: Avoid cloning the State data if the state exists

---
 src/dfa.rs | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/dfa.rs b/src/dfa.rs
index 7d77e38283..8b9ee6d46b 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -117,7 +117,7 @@ struct CacheInner {
     /// things, we just pass indexes around manually. The performance impact of
     /// this is probably an instruction or two in the inner loop. However, on
     /// 64 bit, each StatePtr is half the size of a *State.
-    compiled: HashMap<State, StatePtr>,
+    compiled: HashMap<Box<[u8]>, StatePtr>,
     /// The transition table.
     ///
     /// The transition table is laid out in row-major order, where states are
@@ -1177,14 +1177,15 @@ impl<'a> Fsm<'a> {
         // in q. We use this as an optimization in exec_byte to determine when
         // we should follow epsilon transitions at the empty string preceding
         // the current byte.
-        let key = match self.cached_state_key(q, &mut state_flags) {
+        match self.cached_state_key(q, &mut state_flags) {
             None => return Some(STATE_DEAD),
             Some(v) => v,
-        };
+        }
         // In the cache? Cool. Done.
-        if let Some(&si) = self.cache.compiled.get(&key) {
+        if let Some(&si) = self.cache.compiled.get(&self.cache.insts_scratch_space[..]) {
             return Some(si);
         }
+
         // If the cache has gotten too big, wipe it.
         if self.approximate_size() > self.prog.dfa_size_limit
             && !self.clear_cache_and_save(current_state)
@@ -1192,6 +1193,10 @@ impl<'a> Fsm<'a> {
                 // Ooops. DFA is giving up.
                 return None;
             }
+
+        let key = State {
+            data: self.cache.insts_scratch_space.clone().into_boxed_slice()
+        };
         // Allocate room for our state and add it.
         self.add_state(key)
     }
@@ -1210,7 +1215,7 @@ impl<'a> Fsm<'a> {
         &mut self,
         q: &SparseSet,
         state_flags: &mut StateFlags,
-    ) -> Option<State> {
+    ) -> Option<()> {
         use prog::Inst::*;
 
         // We need to build up enough information to recognize pre-built states
@@ -1255,7 +1260,7 @@ impl<'a> Fsm<'a> {
         } else {
             let StateFlags(f) = *state_flags;
             insts[0] = f;
-            Some(State { data: insts.clone().into_boxed_slice() })
+            Some(())
         };
         self.cache.insts_scratch_space = insts;
         opt_state
@@ -1342,7 +1347,7 @@ impl<'a> Fsm<'a> {
     fn restore_state(&mut self, state: State) -> Option<StatePtr> {
         // If we've already stored this state, just return a pointer to it.
         // None will be the wiser.
-        if let Some(&si) = self.cache.compiled.get(&state) {
+        if let Some(&si) = self.cache.compiled.get(&state.data) {
             return Some(si);
         }
         self.add_state(state)
@@ -1520,7 +1525,7 @@ impl<'a> Fsm<'a> {
             + (2 * mem::size_of::<State>())
             + mem::size_of::<StatePtr>();
         self.cache.states.push(state.clone());
-        self.cache.compiled.insert(state, si);
+        self.cache.compiled.insert(state.data, si);
         // Transition table and set of states and map should all be in sync.
         debug_assert!(self.cache.states.len()
                       == self.cache.trans.num_states());

From 59b3b36b1d8290f08062e6d907d14ac049c75662 Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Fri, 7 Sep 2018 17:41:21 +0200
Subject: [PATCH 06/10] Remove Clone from Cache

---
 src/dfa.rs  | 4 ++--
 src/exec.rs | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dfa.rs b/src/dfa.rs
index 8b9ee6d46b..1f381fe618 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -88,7 +88,7 @@ pub fn can_exec(insts: &Program) -> bool {
 /// This cache is reused between multiple invocations of the same regex
 /// program. (It is not shared simultaneously between threads. If there is
 /// contention, then new caches are created.)
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct Cache {
     /// Group persistent DFA related cache state together. The sparse sets
     /// listed below are used as scratch space while computing uncached states.
@@ -107,7 +107,7 @@ pub struct Cache {
 /// `CacheInner` is logically just a part of Cache, but groups together fields
 /// that aren't passed as function parameters throughout search. (This split
 /// is mostly an artifact of the borrow checker. It is happily paid.)
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 struct CacheInner {
     /// A cache of pre-compiled DFA states, keyed by the set of NFA states
     /// and the set of empty-width flags set at the byte in the input when the
diff --git a/src/exec.rs b/src/exec.rs
index 578289aa5c..52d970f801 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -1271,7 +1271,7 @@ enum MatchNfaType {
 /// available to a particular program.
 pub type ProgramCache = RefCell<ProgramCacheInner>;
 
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct ProgramCacheInner {
     pub pikevm: pikevm::Cache,
     pub backtrack: backtrack::Cache,

From c4c14ce62c48a2b0545fbf7b6cc43a8742ce219f Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Wed, 26 Sep 2018 13:14:08 +0200
Subject: [PATCH 07/10] refactor: Implement Borrow<[u8]> for State so it can be
 used as a map key

---
 src/dfa.rs | 213 ++++++++++++++++++++++-------------------------------
 1 file changed, 89 insertions(+), 124 deletions(-)

diff --git a/src/dfa.rs b/src/dfa.rs
index 1f381fe618..1d9cdc51a6 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -46,6 +46,7 @@ articles on regexes is strongly recommended: https://swtch.com/~rsc/regexp/
 implementation.)
 */
 
+use std::borrow::Borrow;
 use std::collections::HashMap;
 use std::fmt;
 use std::iter::repeat;
@@ -117,7 +118,7 @@ struct CacheInner {
     /// things, we just pass indexes around manually. The performance impact of
     /// this is probably an instruction or two in the inner loop. However, on
     /// 64 bit, each StatePtr is half the size of a *State.
-    compiled: HashMap<Box<[u8]>, StatePtr>,
+    compiled: HashMap<State, StatePtr>,
     /// The transition table.
     ///
     /// The transition table is laid out in row-major order, where states are
@@ -156,7 +157,7 @@ struct CacheInner {
     /// we should flush the cache.
     size: usize,
 
-    insts_scratch_space: Vec<u8>,   
+    insts_scratch_space: Vec<u8>,
 }
 
 /// The transition table.
@@ -269,10 +270,16 @@ impl<T> Result<T> {
 /// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
 /// is coded as a flag, followed by 4 bytes encoding the delta.
 #[derive(Clone, Eq, Hash, PartialEq)]
-struct State{
+struct State {
     data: Box<[u8]>,
 }
 
+impl Borrow<[u8]> for State {
+    fn borrow(&self) -> &[u8] {
+        &self.data
+    }
+}
+
 /// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
 /// an NFA state).
 ///
@@ -294,7 +301,7 @@ struct InstPtrs<'a> {
     data: &'a [u8],
 }
 
-impl <'a>Iterator for InstPtrs<'a> {
+impl<'a> Iterator for InstPtrs<'a> {
     type Item = usize;
 
     fn next(&mut self) -> Option<usize> {
@@ -372,7 +379,7 @@ type StatePtr = u32;
 
 /// An unknown state means that the state has not been computed yet, and that
 /// the only way to progress is to compute it.
-const STATE_UNKNOWN: StatePtr = 1<<31;
+const STATE_UNKNOWN: StatePtr = 1 << 31;
 
 /// A dead state means that the state has been computed and it is known that
 /// once it is entered, no future match can ever occur.
@@ -386,12 +393,12 @@ const STATE_QUIT: StatePtr = STATE_DEAD + 1;
 /// A start state is a state that the DFA can start in.
 ///
 /// Note that start states have their lower bits set to a state pointer.
-const STATE_START: StatePtr = 1<<30;
+const STATE_START: StatePtr = 1 << 30;
 
 /// A match state means that the regex has successfully matched.
 ///
 /// Note that match states have their lower bits set to a state pointer.
-const STATE_MATCH: StatePtr = 1<<29;
+const STATE_MATCH: StatePtr = 1 << 29;
 
 /// The maximum state pointer. This is useful to mask out the "valid" state
 /// pointer from a state with the "start" or "match" bits set.
@@ -451,8 +458,7 @@ impl CacheInner {
     /// Resets the cache size to account for fixed costs, such as the program
     /// and stack sizes.
     fn reset_size(&mut self) {
-        self.size =
-            (self.start_states.len() * mem::size_of::<StatePtr>())
+        self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
             + (self.stack.len() * mem::size_of::<InstPtr>());
     }
 }
@@ -478,11 +484,7 @@ impl<'a> Fsm<'a> {
             cache: &mut cache.inner,
         };
         let (empty_flags, state_flags) = dfa.start_flags(text, at);
-        dfa.start = match dfa.start_state(
-            &mut cache.qcur,
-            empty_flags,
-            state_flags,
-        ) {
+        dfa.start = match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
             None => return Result::Quit,
             Some(STATE_DEAD) => return Result::NoMatch(at),
             Some(si) => si,
@@ -511,11 +513,7 @@ impl<'a> Fsm<'a> {
             cache: &mut cache.inner,
         };
         let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
-        dfa.start = match dfa.start_state(
-            &mut cache.qcur,
-            empty_flags,
-            state_flags,
-        ) {
+        dfa.start = match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
             None => return Result::Quit,
             Some(STATE_DEAD) => return Result::NoMatch(at),
             Some(si) => si,
@@ -545,11 +543,7 @@ impl<'a> Fsm<'a> {
             cache: &mut cache.inner,
         };
         let (empty_flags, state_flags) = dfa.start_flags(text, at);
-        dfa.start = match dfa.start_state(
-            &mut cache.qcur,
-            empty_flags,
-            state_flags,
-        ) {
+        dfa.start = match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
             None => return Result::Quit,
             Some(STATE_DEAD) => return Result::NoMatch(at),
             Some(si) => si,
@@ -677,8 +671,7 @@ impl<'a> Fsm<'a> {
                 // match states are final. Therefore, we can quit.
                 if self.prog.matches.len() > 1 {
                     let state = self.state(next_si);
-                    let just_matches = state.inst_ptrs()
-                         .all(|ip| self.prog[ip].is_match());
+                    let just_matches = state.inst_ptrs().all(|ip| self.prog[ip].is_match());
                     if just_matches {
                         return result;
                     }
@@ -689,12 +682,9 @@ impl<'a> Fsm<'a> {
                 // very quickly, and only recording the match location once
                 // we've left this particular state.
                 let cur = at;
-                while (next_si & !STATE_MATCH) == prev_si
-                    && at + 2 < text.len() {
+                while (next_si & !STATE_MATCH) == prev_si && at + 2 < text.len() {
                     // Argument for safety is in the definition of next_si.
-                    next_si = unsafe {
-                        self.next_si(next_si & !STATE_MATCH, text, at)
-                    };
+                    next_si = unsafe { self.next_si(next_si & !STATE_MATCH, text, at) };
                     at += 1;
                 }
                 if at > cur {
@@ -811,7 +801,7 @@ impl<'a> Fsm<'a> {
                 next_si &= !STATE_MATCH;
                 result = Result::Match(at + 1);
                 if self.quit_after_match {
-                    return result
+                    return result;
                 }
                 self.last_match_si = next_si;
                 prev_si = next_si;
@@ -819,9 +809,7 @@ impl<'a> Fsm<'a> {
                 while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
                     // Argument for safety is in the definition of next_si.
                     at -= 1;
-                    next_si = unsafe {
-                        self.next_si(next_si & !STATE_MATCH, text, at)
-                    };
+                    next_si = unsafe { self.next_si(next_si & !STATE_MATCH, text, at) };
                 }
                 if at < cur {
                     result = Result::Match(at + 2);
@@ -988,8 +976,7 @@ impl<'a> Fsm<'a> {
                     state_flags.set_match();
                     if !self.continue_past_first_match() {
                         break;
-                    } else if self.prog.matches.len() > 1
-                            && !qnext.contains(ip as usize) {
+                    } else if self.prog.matches.len() > 1 && !qnext.contains(ip as usize) {
                         // If we are continuing on to find other matches,
                         // then keep a record of the match states we've seen.
                         qnext.insert(ip);
@@ -997,25 +984,23 @@ impl<'a> Fsm<'a> {
                 }
                 Bytes(ref inst) => {
                     if b.as_byte().map_or(false, |b| inst.matches(b)) {
-                        self.follow_epsilons(
-                            inst.goto as InstPtr, qnext, empty_flags);
+                        self.follow_epsilons(inst.goto as InstPtr, qnext, empty_flags);
                     }
                 }
             }
         }
 
-        let cache =
-            if b.is_eof() && self.prog.matches.len() > 1 {
-                // If we're processing the last byte of the input and we're
-                // matching a regex set, then make the next state contain the
-                // previous states transitions. We do this so that the main
-                // matching loop can extract all of the match instructions.
-                mem::swap(qcur, qnext);
-                // And don't cache this state because it's totally bunk.
-                false
-            } else {
-                true
-            };
+        let cache = if b.is_eof() && self.prog.matches.len() > 1 {
+            // If we're processing the last byte of the input and we're
+            // matching a regex set, then make the next state contain the
+            // previous states transitions. We do this so that the main
+            // matching loop can extract all of the match instructions.
+            mem::swap(qcur, qnext);
+            // And don't cache this state because it's totally bunk.
+            false
+        } else {
+            true
+        };
 
         // We've now built up the set of NFA states that ought to comprise the
         // next DFA state, so try to find it in the cache, and if it doesn't
@@ -1024,11 +1009,7 @@ impl<'a> Fsm<'a> {
         // N.B. We pass `&mut si` here because the cache may clear itself if
         // it has gotten too full. When that happens, the location of the
         // current state may change.
-        let mut next = match self.cached_state(
-            qnext,
-            state_flags,
-            Some(&mut si),
-        ) {
+        let mut next = match self.cached_state(qnext, state_flags, Some(&mut si)) {
             None => return None,
             Some(next) => next,
         };
@@ -1073,14 +1054,9 @@ impl<'a> Fsm<'a> {
     /// line should be set if the preceding byte is `\n`. End line should never
     /// be set in this case. (Even if the proceding byte is a `\n`, it will
     /// be handled in a subsequent DFA state.)
-    fn follow_epsilons(
-        &mut self,
-        ip: InstPtr,
-        q: &mut SparseSet,
-        flags: EmptyFlags,
-    ) {
-        use prog::Inst::*;
+    fn follow_epsilons(&mut self, ip: InstPtr, q: &mut SparseSet, flags: EmptyFlags) {
         use prog::EmptyLook::*;
+        use prog::Inst::*;
 
         // We need to traverse the NFA to follow epsilon transitions, so avoid
         // recursion with an explicit stack.
@@ -1127,9 +1103,8 @@ impl<'a> Fsm<'a> {
                             NotWordBoundary if flags.not_word_boundary => {
                                 ip = inst.goto as InstPtr;
                             }
-                            StartLine | EndLine | StartText | EndText
-                            | WordBoundaryAscii | NotWordBoundaryAscii
-                            | WordBoundary | NotWordBoundary => {
+                            StartLine | EndLine | StartText | EndText | WordBoundaryAscii
+                            | NotWordBoundaryAscii | WordBoundary | NotWordBoundary => {
                                 break;
                             }
                         }
@@ -1189,13 +1164,13 @@ impl<'a> Fsm<'a> {
         // If the cache has gotten too big, wipe it.
         if self.approximate_size() > self.prog.dfa_size_limit
             && !self.clear_cache_and_save(current_state)
-            {
-                // Ooops. DFA is giving up.
-                return None;
-            }
+        {
+            // Ooops. DFA is giving up.
+            return None;
+        }
 
         let key = State {
-            data: self.cache.insts_scratch_space.clone().into_boxed_slice()
+            data: self.cache.insts_scratch_space.clone().into_boxed_slice(),
         };
         // Allocate room for our state and add it.
         self.add_state(key)
@@ -1211,11 +1186,7 @@ impl<'a> Fsm<'a> {
     /// Specifically, q should be an ordered set of NFA states and is_match
     /// should be true if and only if the previous NFA states contained a match
     /// state.
-    fn cached_state_key(
-        &mut self,
-        q: &SparseSet,
-        state_flags: &mut StateFlags,
-    ) -> Option<()> {
+    fn cached_state_key(&mut self, q: &SparseSet, state_flags: &mut StateFlags) -> Option<()> {
         use prog::Inst::*;
 
         // We need to build up enough information to recognize pre-built states
@@ -1274,10 +1245,7 @@ impl<'a> Fsm<'a> {
     ///
     /// This returns false if the cache is not cleared and the DFA should
     /// give up.
-    fn clear_cache_and_save(
-        &mut self,
-        current_state: Option<&mut StatePtr>,
-    ) -> bool {
+    fn clear_cache_and_save(&mut self, current_state: Option<&mut StatePtr>) -> bool {
         if self.cache.states.is_empty() {
             // Nothing to clear...
             return true;
@@ -1311,7 +1279,8 @@ impl<'a> Fsm<'a> {
         let nstates = self.cache.states.len();
         if self.cache.flush_count >= 3
             && self.at >= self.last_cache_flush
-            && (self.at - self.last_cache_flush) <= 10 * nstates {
+            && (self.at - self.last_cache_flush) <= 10 * nstates
+        {
             return false;
         }
         // Update statistics tracking cache flushes.
@@ -1347,7 +1316,7 @@ impl<'a> Fsm<'a> {
     fn restore_state(&mut self, state: State) -> Option<StatePtr> {
         // If we've already stored this state, just return a pointer to it.
         // None will be the wiser.
-        if let Some(&si) = self.cache.compiled.get(&state.data) {
+        if let Some(&si) = self.cache.compiled.get(&state) {
             return Some(si);
         }
         self.add_state(state)
@@ -1402,14 +1371,13 @@ impl<'a> Fsm<'a> {
         // matches are delayed by one byte, start states can never be match
         // states.
         let flagi = {
-            (((empty_flags.start as u8) << 0) |
-             ((empty_flags.end as u8) << 1) |
-             ((empty_flags.start_line as u8) << 2) |
-             ((empty_flags.end_line as u8) << 3) |
-             ((empty_flags.word_boundary as u8) << 4) |
-             ((empty_flags.not_word_boundary as u8) << 5) |
-             ((state_flags.is_word() as u8) << 6))
-            as usize
+            (((empty_flags.start as u8) << 0)
+                | ((empty_flags.end as u8) << 1)
+                | ((empty_flags.start_line as u8) << 2)
+                | ((empty_flags.end_line as u8) << 3)
+                | ((empty_flags.word_boundary as u8) << 4)
+                | ((empty_flags.not_word_boundary as u8) << 5)
+                | ((state_flags.is_word() as u8) << 6)) as usize
         };
         match self.cache.start_states[flagi] {
             STATE_UNKNOWN => {}
@@ -1460,11 +1428,7 @@ impl<'a> Fsm<'a> {
     ///
     /// This should only be used when executing the DFA in reverse over the
     /// input.
-    fn start_flags_reverse(
-        &self,
-        text: &[u8],
-        at: usize,
-    ) -> (EmptyFlags, StateFlags) {
+    fn start_flags_reverse(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
         let mut empty_flags = EmptyFlags::default();
         let mut state_flags = StateFlags::default();
         empty_flags.start = at == text.len();
@@ -1472,8 +1436,7 @@ impl<'a> Fsm<'a> {
         empty_flags.start_line = at == text.len() || text[at] == b'\n';
         empty_flags.end_line = text.is_empty();
 
-        let is_word_last =
-            at < text.len() && Byte::byte(text[at]).is_ascii_word();
+        let is_word_last = at < text.len() && Byte::byte(text[at]).is_ascii_word();
         let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
         if is_word_last {
             state_flags.set_word();
@@ -1519,18 +1482,15 @@ impl<'a> Fsm<'a> {
         }
         // Finally, put our actual state on to our heap of states and index it
         // so we can find it later.
-        self.cache.size +=
-            self.cache.trans.state_heap_size()
+        self.cache.size += self.cache.trans.state_heap_size()
             + (2 * state.data.len())
             + (2 * mem::size_of::<State>())
             + mem::size_of::<StatePtr>();
         self.cache.states.push(state.clone());
-        self.cache.compiled.insert(state.data, si);
+        self.cache.compiled.insert(state, si);
         // Transition table and set of states and map should all be in sync.
-        debug_assert!(self.cache.states.len()
-                      == self.cache.trans.num_states());
-        debug_assert!(self.cache.states.len()
-                      == self.cache.compiled.len());
+        debug_assert!(self.cache.states.len() == self.cache.trans.num_states());
+        debug_assert!(self.cache.states.len() == self.cache.compiled.len());
         Some(si)
     }
 
@@ -1583,9 +1543,7 @@ impl<'a> Fsm<'a> {
 
     /// Returns true if there is a prefix we can quickly search for.
     fn has_prefix(&self) -> bool {
-        !self.prog.is_reverse
-        && !self.prog.prefixes.is_empty()
-        && !self.prog.is_anchored_start
+        !self.prog.is_reverse && !self.prog.prefixes.is_empty() && !self.prog.is_anchored_start
     }
 
     /// Sets the STATE_START bit in the given state pointer if and only if
@@ -1636,7 +1594,8 @@ impl Transitions {
         if si > STATE_MAX as usize {
             return None;
         }
-        self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
+        self.table
+            .extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
         Some(usize_to_u32(si))
     }
 
@@ -1695,9 +1654,15 @@ impl StateFlags {
 }
 
 impl Byte {
-    fn byte(b: u8) -> Self { Byte(b as u16) }
-    fn eof() -> Self { Byte(256) }
-    fn is_eof(&self) -> bool { self.0 == 256 }
+    fn byte(b: u8) -> Self {
+        Byte(b as u16)
+    }
+    fn eof() -> Self {
+        Byte(256)
+    }
+    fn is_eof(&self) -> bool {
+        self.0 == 256
+    }
 
     fn is_ascii_word(&self) -> bool {
         let b = match self.as_byte() {
@@ -1723,9 +1688,9 @@ impl fmt::Debug for State {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         let ips: Vec<usize> = self.inst_ptrs().collect();
         f.debug_struct("State")
-         .field("flags", &self.flags())
-         .field("insts", &ips)
-         .finish()
+            .field("flags", &self.flags())
+            .field("insts", &ips)
+            .finish()
     }
 }
 
@@ -1764,10 +1729,10 @@ impl<'a> fmt::Debug for TransitionsRow<'a> {
 impl fmt::Debug for StateFlags {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         f.debug_struct("StateFlags")
-         .field("is_match", &self.is_match())
-         .field("is_word", &self.is_word())
-         .field("has_empty", &self.has_empty())
-         .finish()
+            .field("is_match", &self.is_match())
+            .field("is_word", &self.is_word())
+            .field("has_empty", &self.has_empty())
+            .finish()
     }
 }
 
@@ -1857,11 +1822,10 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
 mod tests {
     extern crate rand;
 
-    use quickcheck::{QuickCheck, StdGen, quickcheck};
     use super::{
-        StateFlags, State, push_inst_ptr,
-        write_varu32, read_varu32, write_vari32, read_vari32,
+        push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32, State, StateFlags,
     };
+    use quickcheck::{quickcheck, QuickCheck, StdGen};
 
     #[test]
     fn prop_state_encode_decode() {
@@ -1871,10 +1835,11 @@ mod tests {
             for &ip in ips.iter() {
                 push_inst_ptr(&mut data, &mut prev, ip);
             }
-            let state = State { data: data.into_boxed_slice() };
+            let state = State {
+                data: data.into_boxed_slice(),
+            };
 
-            let expected: Vec<usize> =
-                ips.into_iter().map(|ip| ip as usize).collect();
+            let expected: Vec<usize> = ips.into_iter().map(|ip| ip as usize).collect();
             let got: Vec<usize> = state.inst_ptrs().collect();
             expected == got && state.flags() == StateFlags(flags)
         }

From 7e655b7c3e699357727541add21c8fb6f5a64bf6 Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Wed, 26 Sep 2018 13:59:13 +0200
Subject: [PATCH 08/10] chore: Fix unused_import in benchmarks

---
 bench/src/misc.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bench/src/misc.rs b/bench/src/misc.rs
index 69ea21f842..2786408248 100644
--- a/bench/src/misc.rs
+++ b/bench/src/misc.rs
@@ -14,7 +14,9 @@ use std::iter::repeat;
 
 use test::Bencher;
 
-use {Regex, RegexSet, Text};
+#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
+use RegexSet;
+use {Regex, Text};
 
 #[cfg(not(feature = "re-onig"))]
 #[cfg(not(feature = "re-pcre1"))]

From 6882158d920aa77ce6ae733f1f365d4d656ccdd6 Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Fri, 7 Sep 2018 18:36:04 +0200
Subject: [PATCH 09/10] perf: Avoid storing compiled dfa states twice

---
 src/dfa.rs | 105 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 21 deletions(-)

diff --git a/src/dfa.rs b/src/dfa.rs
index 1d9cdc51a6..5b1a750cc3 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -47,15 +47,17 @@ implementation.)
 */
 
 use std::borrow::Borrow;
-use std::collections::HashMap;
 use std::fmt;
 use std::iter::repeat;
 use std::mem;
+use std::sync::Arc;
 
 use exec::ProgramCache;
 use prog::{Inst, Program};
 use sparse::SparseSet;
 
+use self::state_map::StateMap;
+
 /// Return true if and only if the given program can be executed by a DFA.
 ///
 /// Generally, a DFA is always possible. A pathological case where it is not
@@ -118,7 +120,7 @@ struct CacheInner {
     /// things, we just pass indexes around manually. The performance impact of
     /// this is probably an instruction or two in the inner loop. However, on
     /// 64 bit, each StatePtr is half the size of a *State.
-    compiled: HashMap<State, StatePtr>,
+    compiled: StateMap,
     /// The transition table.
     ///
     /// The transition table is laid out in row-major order, where states are
@@ -135,9 +137,6 @@ struct CacheInner {
     /// bytes that never discriminate a distinct path through the DFA from each
     /// other.
     trans: Transitions,
-    /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
-    /// this Vec rather than just a `StatePtr`.
-    states: Vec<State>,
     /// A set of cached start states, which are limited to the number of
     /// permutations of flags set just before the initial byte of input. (The
     /// index into this vec is a `EmptyFlags`.)
@@ -270,8 +269,8 @@ impl<T> Result<T> {
 /// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
 /// is coded as a flag, followed by 4 bytes encoding the delta.
 #[derive(Clone, Eq, Hash, PartialEq)]
-struct State {
-    data: Box<[u8]>,
+pub struct State {
+    data: Arc<[u8]>,
 }
 
 impl Borrow<[u8]> for State {
@@ -280,6 +279,13 @@ impl Borrow<[u8]> for State {
     }
 }
 
+impl State {
+    fn heap_size(&self) -> usize {
+        // 2 * Reference counters
+        2 * mem::size_of::<usize>() + self.data.len()
+    }
+}
+
 /// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
 /// an NFA state).
 ///
@@ -437,9 +443,8 @@ impl Cache {
         let starts = vec![STATE_UNKNOWN; 256];
         let mut cache = Cache {
             inner: CacheInner {
-                compiled: HashMap::new(),
+                compiled: StateMap::new(),
                 trans: Transitions::new(num_byte_classes),
-                states: vec![],
                 start_states: starts,
                 stack: vec![],
                 flush_count: 0,
@@ -1157,7 +1162,11 @@ impl<'a> Fsm<'a> {
             Some(v) => v,
         }
         // In the cache? Cool. Done.
-        if let Some(&si) = self.cache.compiled.get(&self.cache.insts_scratch_space[..]) {
+        if let Some(si) = self
+            .cache
+            .compiled
+            .get_ptr(&self.cache.insts_scratch_space[..])
+        {
             return Some(si);
         }
 
@@ -1170,7 +1179,7 @@ impl<'a> Fsm<'a> {
         }
 
         let key = State {
-            data: self.cache.insts_scratch_space.clone().into_boxed_slice(),
+            data: Arc::from(&self.cache.insts_scratch_space[..]),
         };
         // Allocate room for our state and add it.
         self.add_state(key)
@@ -1246,7 +1255,7 @@ impl<'a> Fsm<'a> {
     /// This returns false if the cache is not cleared and the DFA should
     /// give up.
     fn clear_cache_and_save(&mut self, current_state: Option<&mut StatePtr>) -> bool {
-        if self.cache.states.is_empty() {
+        if self.cache.compiled.is_empty() {
             // Nothing to clear...
             return true;
         }
@@ -1276,7 +1285,7 @@ impl<'a> Fsm<'a> {
         // 10 or fewer bytes per state.
         // Additionally, we permit the cache to be flushed a few times before
         // caling it quits.
-        let nstates = self.cache.states.len();
+        let nstates = self.cache.compiled.len();
         if self.cache.flush_count >= 3
             && self.at >= self.last_cache_flush
             && (self.at - self.last_cache_flush) <= 10 * nstates
@@ -1296,7 +1305,6 @@ impl<'a> Fsm<'a> {
         };
         self.cache.reset_size();
         self.cache.trans.clear();
-        self.cache.states.clear();
         self.cache.compiled.clear();
         for s in &mut self.cache.start_states {
             *s = STATE_UNKNOWN;
@@ -1316,7 +1324,7 @@ impl<'a> Fsm<'a> {
     fn restore_state(&mut self, state: State) -> Option<StatePtr> {
         // If we've already stored this state, just return a pointer to it.
         // None will be the wiser.
-        if let Some(&si) = self.cache.compiled.get(&state) {
+        if let Some(si) = self.cache.compiled.get_ptr(&state.data) {
             return Some(si);
         }
         self.add_state(state)
@@ -1451,7 +1459,10 @@ impl<'a> Fsm<'a> {
 
     /// Returns a reference to a State given a pointer to it.
     fn state(&self, si: StatePtr) -> &State {
-        &self.cache.states[si as usize / self.num_byte_classes()]
+        self.cache
+            .compiled
+            .get_state(si as usize / self.num_byte_classes())
+            .unwrap()
     }
 
     /// Adds the given state to the DFA.
@@ -1483,14 +1494,12 @@ impl<'a> Fsm<'a> {
         // Finally, put our actual state on to our heap of states and index it
         // so we can find it later.
         self.cache.size += self.cache.trans.state_heap_size()
-            + (2 * state.data.len())
+            + state.heap_size()
             + (2 * mem::size_of::<State>())
             + mem::size_of::<StatePtr>();
-        self.cache.states.push(state.clone());
         self.cache.compiled.insert(state, si);
         // Transition table and set of states and map should all be in sync.
-        debug_assert!(self.cache.states.len() == self.cache.trans.num_states());
-        debug_assert!(self.cache.states.len() == self.cache.compiled.len());
+        debug_assert!(self.cache.compiled.len() == self.cache.trans.num_states());
         Some(si)
     }
 
@@ -1818,10 +1827,64 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
     (0, 0)
 }
 
+mod state_map {
+    use std::collections::HashMap;
+
+    use super::{State, StatePtr};
+
+    #[derive(Debug)]
+    pub struct StateMap {
+        /// The keys are not actually static but rely on always pointing to a buffer in `states`
+        /// which will never be moved except when clearing the map or on drop, in which case the
+        /// keys of this map will be removed before
+        map: HashMap<State, StatePtr>,
+        /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
+        /// this Vec rather than just a `StatePtr`.
+        states: Vec<State>,
+    }
+
+    impl StateMap {
+        pub fn new() -> StateMap {
+            StateMap {
+                map: HashMap::new(),
+                states: Vec::new(),
+            }
+        }
+
+        pub fn len(&self) -> usize {
+            self.states.len()
+        }
+
+        pub fn is_empty(&self) -> bool {
+            self.states.is_empty()
+        }
+
+        pub fn get_ptr(&self, index: &[u8]) -> Option<StatePtr> {
+            self.map.get(index).cloned()
+        }
+
+        pub fn get_state(&self, index: usize) -> Option<&State> {
+            self.states.get(index)
+        }
+
+        pub fn insert(&mut self, state: State, si: StatePtr) {
+            self.map.insert(state.clone(), si);
+            self.states.push(state);
+        }
+
+        pub fn clear(&mut self) {
+            self.map.clear();
+            self.states.clear();
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     extern crate rand;
 
+    use std::sync::Arc;
+
     use super::{
         push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32, State, StateFlags,
     };
@@ -1836,7 +1899,7 @@ mod tests {
                 push_inst_ptr(&mut data, &mut prev, ip);
             }
             let state = State {
-                data: data.into_boxed_slice(),
+                data: Arc::from(&data[..]),
             };
 
             let expected: Vec<usize> = ips.into_iter().map(|ip| ip as usize).collect();

From f7a5b10fd20a3abe353f964da53f265e7b449cd3 Mon Sep 17 00:00:00 2001
From: Markus Westerlind <markus.westerlind@distilnetworks.com>
Date: Mon, 8 Oct 2018 16:19:57 +0200
Subject: [PATCH 10/10] Bump minimum rust version to 1.21

Necessary for `From<&[T]> for Arc<T>`
---
 .travis.yml  | 2 +-
 README.md    | 2 +-
 ci/script.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c49ba699fa..05db45d490 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,7 @@ dist: trusty
 sudo: required
 language: rust
 rust:
-  - 1.20.0
+  - 1.21.0
   - stable
   - beta
   - nightly
diff --git a/README.md b/README.md
index 82420bd377..11c1d8c50f 100644
--- a/README.md
+++ b/README.md
@@ -210,7 +210,7 @@ recommended for general use.
 
 ### Minimum Rust version policy
 
-This crate's minimum supported `rustc` version is `1.20.0`.
+This crate's minimum supported `rustc` version is `1.21.0`.
 
 The current **tentative** policy is that the minimum Rust version required
 to use this crate can be increased in minor version updates. For example, if
diff --git a/ci/script.sh b/ci/script.sh
index eea7d39d2a..ef440336dc 100755
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -3,7 +3,7 @@
 # This is the main CI script for testing the regex crate and its sub-crates.
 
 set -ex
-MSRV="1.20.0"
+MSRV="1.21.0"
 
 # If we're building on 1.20, then lazy_static 1.2 will fail to build since it
 # updated its MSRV to 1.24.1. In this case, we force the use of lazy_static 1.1