diff --git a/.travis.yml b/.travis.yml
index c49ba699fa..f6e7e4f5c9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,7 @@ dist: trusty
 sudo: required
 language: rust
 rust:
-  - 1.20.0
+  - 1.24.1
   - stable
   - beta
   - nightly
diff --git a/README.md b/README.md
index f9908caeda..d90c00fc1d 100644
--- a/README.md
+++ b/README.md
@@ -210,7 +210,7 @@ recommended for general use.
 
 ### Minimum Rust version policy
 
-This crate's minimum supported `rustc` version is `1.20.0`.
+This crate's minimum supported `rustc` version is `1.24.1`.
 
 The current **tentative** policy is that the minimum Rust version required
 to use this crate can be increased in minor version updates. For example, if
diff --git a/bench/src/bench.rs b/bench/src/bench.rs
index 6cb56db850..ac5c3d8502 100644
--- a/bench/src/bench.rs
+++ b/bench/src/bench.rs
@@ -44,9 +44,9 @@ pub use ffi::re2::Regex;
 #[cfg(feature = "re-dphobos")]
 pub use ffi::d_phobos::Regex;
 #[cfg(feature = "re-rust")]
-pub use regex::Regex;
+pub use regex::{Regex, RegexSet};
 #[cfg(feature = "re-rust-bytes")]
-pub use regex::bytes::Regex;
+pub use regex::bytes::{Regex, RegexSet};
 #[cfg(feature = "re-tcl")]
 pub use ffi::tcl::Regex;
 
@@ -272,6 +272,58 @@ macro_rules! bench_captures {
     }
 }
 
+// USAGE: bench_is_match_set!(name, is_match, regex, haystack)
+macro_rules! bench_is_match_set {
+    ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            use std::sync::Mutex;
+            lazy_static! {
+                static ref RE: Mutex<RegexSet> = Mutex::new($re);
+                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
+            };
+            let re = RE.lock().unwrap();
+            let text = TEXT.lock().unwrap();
+            b.bytes = text.len() as u64;
+            b.iter(|| {
+                if re.is_match(&text) != $is_match {
+                    if $is_match {
+                        panic!("expected match, got not match");
+                    } else {
+                        panic!("expected no match, got match");
+                    }
+                }
+            });
+        }
+    }
+}
+
+// USAGE: bench_matches_set!(name, is_match, regex, haystack)
+macro_rules! bench_matches_set {
+    ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            use std::sync::Mutex;
+            lazy_static! {
+                static ref RE: Mutex<RegexSet> = Mutex::new($re);
+                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
+            };
+            let re = RE.lock().unwrap();
+            let text = TEXT.lock().unwrap();
+            b.bytes = text.len() as u64;
+            b.iter(|| {
+                if re.matches(&text).matched_any() != $is_match {
+                    if $is_match {
+                        panic!("expected match, got not match");
+                    } else {
+                        panic!("expected no match, got match");
+                    }
+                }
+            });
+        }
+    }
+}
+
 mod ffi;
 mod misc;
 mod regexdna;
diff --git a/bench/src/misc.rs b/bench/src/misc.rs
index ad516e23e4..b5b25b737c 100644
--- a/bench/src/misc.rs
+++ b/bench/src/misc.rs
@@ -14,6 +14,8 @@ use std::iter::repeat;
 
 use test::Bencher;
 
+#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
+use RegexSet;
 use {Regex, Text};
 
 #[cfg(not(feature = "re-onig"))]
@@ -278,3 +280,25 @@ bench_captures!(short_haystack_1000000x,
             repeat("aaaa").take(1000000).collect::<String>(),
             repeat("dddd").take(1000000).collect::<String>(),
             ));
+
+#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
+bench_is_match_set!(is_match_set,
+    true,
+    RegexSet::new(vec![
+        "aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23.",
+    ]).unwrap(),
+    format!("{}a482c{}",
+            repeat('a').take(10).collect::<String>(),
+            repeat('b').take(10).collect::<String>())
+    );
+
+#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
+bench_matches_set!(matches_set,
+    true,
+    RegexSet::new(vec![
+        "aaaaaaaaaaaaaaaaaaa", "abc579", "def.+", "e24fg", "a.*2c", "23.",
+    ]).unwrap(),
+    format!("{}a482c{}",
+            repeat('a').take(10).collect::<String>(),
+            repeat('b').take(10).collect::<String>())
+    );
diff --git a/ci/script.sh b/ci/script.sh
index eea7d39d2a..ba66be192b 100755
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -3,18 +3,7 @@
 # This is the main CI script for testing the regex crate and its sub-crates.
 
 set -ex
-MSRV="1.20.0"
-
-# If we're building on 1.20, then lazy_static 1.2 will fail to build since it
-# updated its MSRV to 1.24.1. In this case, we force the use of lazy_static 1.1
-# to build on Rust 1.20.0.
-if [ "$TRAVIS_RUST_VERSION" = "$MSRV" ]; then
-    cargo update -p lazy_static --precise 1.1.0
-    # On older versions of Cargo, this apparently needs to be run twice
-    # if Cargo.lock didn't previously exist. Since this command should be
-    # idempotent, we run it again unconditionally.
-    cargo update -p lazy_static --precise 1.1.0
-fi
+MSRV="1.24.1"
 
 # Builds the regex crate and runs tests.
 cargo build --verbose
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000000..c7ad93bafe
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1 @@
+disable_all_formatting = true
diff --git a/src/dfa.rs b/src/dfa.rs
index 992a45cc62..e662b73380 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -50,6 +50,7 @@ use std::collections::HashMap;
 use std::fmt;
 use std::iter::repeat;
 use std::mem;
+use std::sync::Arc;
 
 use exec::ProgramCache;
 use prog::{Inst, Program};
@@ -88,7 +89,7 @@ pub fn can_exec(insts: &Program) -> bool {
 /// This cache is reused between multiple invocations of the same regex
 /// program. (It is not shared simultaneously between threads. If there is
 /// contention, then new caches are created.)
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct Cache {
     /// Group persistent DFA related cache state together. The sparse sets
     /// listed below are used as scratch space while computing uncached states.
@@ -107,7 +108,7 @@ pub struct Cache {
 /// `CacheInner` is logically just a part of Cache, but groups together fields
 /// that aren't passed as function parameters throughout search. (This split
 /// is mostly an artifact of the borrow checker. It is happily paid.)
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 struct CacheInner {
     /// A cache of pre-compiled DFA states, keyed by the set of NFA states
     /// and the set of empty-width flags set at the byte in the input when the
@@ -117,7 +118,7 @@ struct CacheInner {
     /// things, we just pass indexes around manually. The performance impact of
     /// this is probably an instruction or two in the inner loop. However, on
     /// 64 bit, each StatePtr is half the size of a *State.
-    compiled: HashMap<State, StatePtr>,
+    compiled: StateMap,
     /// The transition table.
     ///
     /// The transition table is laid out in row-major order, where states are
@@ -134,9 +135,6 @@ struct CacheInner {
     /// bytes that never discriminate a distinct path through the DFA from each
     /// other.
     trans: Transitions,
-    /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
-    /// this Vec rather than just a `StatePtr`.
-    states: Vec<State>,
     /// A set of cached start states, which are limited to the number of
     /// permutations of flags set just before the initial byte of input. (The
     /// index into this vec is a `EmptyFlags`.)
@@ -155,6 +153,9 @@ struct CacheInner {
     /// The total heap size of the DFA's cache. We use this to determine when
     /// we should flush the cache.
     size: usize,
+    /// Scratch space used when building instruction pointer lists for new
+    /// states. This helps amortize allocation.
+    insts_scratch_space: Vec<u8>,
 }
 
 /// The transition table.
@@ -267,8 +268,8 @@ impl<T> Result<T> {
 /// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
 /// is coded as a flag, followed by 4 bytes encoding the delta.
 #[derive(Clone, Eq, Hash, PartialEq)]
-struct State{
-    data: Box<[u8]>,
+struct State {
+    data: Arc<[u8]>,
 }
 
 /// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
@@ -428,13 +429,13 @@ impl Cache {
         let starts = vec![STATE_UNKNOWN; 256];
         let mut cache = Cache {
             inner: CacheInner {
-                compiled: HashMap::new(),
+                compiled: StateMap::new(num_byte_classes),
                 trans: Transitions::new(num_byte_classes),
-                states: vec![],
                 start_states: starts,
                 stack: vec![],
                 flush_count: 0,
                 size: 0,
+                insts_scratch_space: vec![],
             },
             qcur: SparseSet::new(prog.insts.len()),
             qnext: SparseSet::new(prog.insts.len()),
@@ -1179,7 +1180,11 @@ impl<'a> Fsm<'a> {
             Some(v) => v,
         };
         // In the cache? Cool. Done.
-        if let Some(&si) = self.cache.compiled.get(&key) {
+        if let Some(si) = self
+            .cache
+            .compiled
+            .get_ptr(&key)
+        {
             return Some(si);
         }
         // If the cache has gotten too big, wipe it.
@@ -1219,8 +1224,14 @@ impl<'a> Fsm<'a> {
         // are conditional, we need to make them part of a state's key in the
         // cache.
 
+        let mut insts = mem::replace(
+            &mut self.cache.insts_scratch_space,
+            vec![],
+        );
+        insts.clear();
         // Reserve 1 byte for flags.
-        let mut insts = vec![0];
+        insts.push(0);
+
         let mut prev = 0;
         for &ip in q {
             let ip = usize_to_u32(ip);
@@ -1244,13 +1255,16 @@ impl<'a> Fsm<'a> {
         // see a match when expanding NFA states previously, then this is a
         // dead state and no amount of additional input can transition out
         // of this state.
-        if insts.len() == 1 && !state_flags.is_match() {
-            None
-        } else {
-            let StateFlags(f) = *state_flags;
-            insts[0] = f;
-            Some(State { data: insts.into_boxed_slice() })
-        }
+        let opt_state =
+            if insts.len() == 1 && !state_flags.is_match() {
+                None
+            } else {
+                let StateFlags(f) = *state_flags;
+                insts[0] = f;
+                Some(State { data: Arc::from(&*insts) })
+            };
+        self.cache.insts_scratch_space = insts;
+        opt_state
     }
 
     /// Clears the cache, but saves and restores current_state if it is not
@@ -1265,7 +1279,7 @@ impl<'a> Fsm<'a> {
         &mut self,
         current_state: Option<&mut StatePtr>,
     ) -> bool {
-        if self.cache.states.is_empty() {
+        if self.cache.compiled.is_empty() {
             // Nothing to clear...
             return true;
         }
@@ -1295,7 +1309,7 @@ impl<'a> Fsm<'a> {
         // 10 or fewer bytes per state.
         // Additionally, we permit the cache to be flushed a few times before
         // caling it quits.
-        let nstates = self.cache.states.len();
+        let nstates = self.cache.compiled.len();
         if self.cache.flush_count >= 3
             && self.at >= self.last_cache_flush
             && (self.at - self.last_cache_flush) <= 10 * nstates {
@@ -1314,7 +1328,6 @@ impl<'a> Fsm<'a> {
         };
         self.cache.reset_size();
         self.cache.trans.clear();
-        self.cache.states.clear();
         self.cache.compiled.clear();
         for s in &mut self.cache.start_states {
             *s = STATE_UNKNOWN;
@@ -1334,7 +1347,7 @@ impl<'a> Fsm<'a> {
     fn restore_state(&mut self, state: State) -> Option<StatePtr> {
         // If we've already stored this state, just return a pointer to it.
         // None will be the wiser.
-        if let Some(&si) = self.cache.compiled.get(&state) {
+        if let Some(si) = self.cache.compiled.get_ptr(&state) {
             return Some(si);
         }
         self.add_state(state)
@@ -1475,7 +1488,7 @@ impl<'a> Fsm<'a> {
 
     /// Returns a reference to a State given a pointer to it.
     fn state(&self, si: StatePtr) -> &State {
-        &self.cache.states[si as usize / self.num_byte_classes()]
+        self.cache.compiled.get_state(si).unwrap()
     }
 
     /// Adds the given state to the DFA.
@@ -1508,16 +1521,13 @@ impl<'a> Fsm<'a> {
         // so we can find it later.
         self.cache.size +=
             self.cache.trans.state_heap_size()
-            + (2 * state.data.len())
+            + state.data.len()
             + (2 * mem::size_of::<State>())
             + mem::size_of::<StatePtr>();
-        self.cache.states.push(state.clone());
         self.cache.compiled.insert(state, si);
         // Transition table and set of states and map should all be in sync.
-        debug_assert!(self.cache.states.len()
+        debug_assert!(self.cache.compiled.len()
                       == self.cache.trans.num_states());
-        debug_assert!(self.cache.states.len()
-                      == self.cache.compiled.len());
         Some(si)
     }
 
@@ -1598,6 +1608,66 @@ impl<'a> Fsm<'a> {
     }
 }
 
+/// An abstraction for representing a map of states. The map supports two
+/// different ways of state lookup. One is fast constant time access via a
+/// state pointer. The other is a hashmap lookup based on the DFA's
+/// constituent NFA states.
+///
+/// A DFA state internally uses an Arc such that we only need to store the
+/// set of NFA states on the heap once, even though we support looking up
+/// states by two different means. A more natural way to express this might
+/// use raw pointers, but an Arc is safe and effectively achieves the same
+/// thing.
+#[derive(Debug)]
+struct StateMap {
+    /// The keys are not actually static but rely on always pointing to a
+    /// buffer in `states` which will never be moved except when clearing
+    /// the map or on drop, in which case the keys of this map will be
+    /// removed before
+    map: HashMap<State, StatePtr>,
+    /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
+    /// this Vec rather than just a `StatePtr`.
+    states: Vec<State>,
+    /// The number of byte classes in the DFA. Used to index `states`.
+    num_byte_classes: usize,
+}
+
+impl StateMap {
+    fn new(num_byte_classes: usize) -> StateMap {
+        StateMap {
+            map: HashMap::new(),
+            states: vec![],
+            num_byte_classes: num_byte_classes,
+        }
+    }
+
+    fn len(&self) -> usize {
+        self.states.len()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.states.is_empty()
+    }
+
+    fn get_ptr(&self, state: &State) -> Option<StatePtr> {
+        self.map.get(state).cloned()
+    }
+
+    fn get_state(&self, si: StatePtr) -> Option<&State> {
+        self.states.get(si as usize / self.num_byte_classes)
+    }
+
+    fn insert(&mut self, state: State, si: StatePtr) {
+        self.map.insert(state.clone(), si);
+        self.states.push(state);
+    }
+
+    fn clear(&mut self) {
+        self.map.clear();
+        self.states.clear();
+    }
+}
+
 impl Transitions {
     /// Create a new transition table.
     ///
@@ -1844,6 +1914,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
 mod tests {
     extern crate rand;
 
+    use std::sync::Arc;
     use quickcheck::{QuickCheck, StdGen, quickcheck};
     use super::{
         StateFlags, State, push_inst_ptr,
@@ -1858,7 +1929,7 @@ mod tests {
             for &ip in ips.iter() {
                 push_inst_ptr(&mut data, &mut prev, ip);
             }
-            let state = State { data: data.into_boxed_slice() };
+            let state = State { data: Arc::from(&data[..]) };
 
             let expected: Vec<usize> =
                 ips.into_iter().map(|ip| ip as usize).collect();
diff --git a/src/exec.rs b/src/exec.rs
index 578289aa5c..52d970f801 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -1271,7 +1271,7 @@ enum MatchNfaType {
 /// available to a particular program.
 pub type ProgramCache = RefCell<ProgramCacheInner>;
 
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct ProgramCacheInner {
     pub pikevm: pikevm::Cache,
     pub backtrack: backtrack::Cache,
diff --git a/src/prog.rs b/src/prog.rs
index 198e1f45c4..b62802f2b7 100644
--- a/src/prog.rs
+++ b/src/prog.rs
@@ -161,6 +161,7 @@ impl Program {
 impl Deref for Program {
     type Target = [Inst];
 
+    #[inline(always)]
     fn deref(&self) -> &Self::Target {
         &*self.insts
     }
diff --git a/src/sparse.rs b/src/sparse.rs
index 3cb8978ee4..ae4cae4b9b 100644
--- a/src/sparse.rs
+++ b/src/sparse.rs
@@ -14,52 +14,49 @@ use std::slice;
 #[derive(Clone, Debug)]
 pub struct SparseSet {
     /// Dense contains the instruction pointers in the order in which they
-    /// were inserted. Accessing elements >= self.size is illegal.
+    /// were inserted.
     dense: Vec<usize>,
     /// Sparse maps instruction pointers to their location in dense.
     ///
     /// An instruction pointer is in the set if and only if
-    /// sparse[ip] < size && ip == dense[sparse[ip]].
-    sparse: Vec<usize>,
-    /// The number of elements in the set.
-    size: usize,
+    /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
+    sparse: Box<[usize]>,
 }
 
 impl SparseSet {
     pub fn new(size: usize) -> SparseSet {
         SparseSet {
-            dense: vec![0; size],
-            sparse: vec![0; size],
-            size: 0,
+            dense: Vec::with_capacity(size),
+            sparse: vec![0; size].into_boxed_slice(),
         }
     }
 
     pub fn len(&self) -> usize {
-        self.size
+        self.dense.len()
     }
 
     pub fn is_empty(&self) -> bool {
-        self.size == 0
+        self.dense.is_empty()
     }
 
     pub fn capacity(&self) -> usize {
-        self.dense.len()
+        self.dense.capacity()
     }
 
     pub fn insert(&mut self, value: usize) {
-        let i = self.size;
-        self.dense[i] = value;
+        let i = self.len();
+        assert!(i < self.capacity());
+        self.dense.push(value);
         self.sparse[value] = i;
-        self.size += 1;
     }
 
     pub fn contains(&self, value: usize) -> bool {
         let i = self.sparse[value];
-        i < self.size && self.dense[i] == value
+        self.dense.get(i) == Some(&value)
     }
 
     pub fn clear(&mut self) {
-        self.size = 0;
+        self.dense.clear();
     }
 }
 
@@ -67,7 +64,7 @@ impl Deref for SparseSet {
     type Target = [usize];
 
     fn deref(&self) -> &Self::Target {
-        &self.dense[0..self.size]
+        &self.dense
     }
 }