Disallow empty character class ranges.

The compiler, in particular, assumes that it never gets an empty character class. The current parser is pretty paranoid about rejecting empty classes, but a few tricky cases made it through. In particular, one can write `[^\d\D]` to correspond to "match nothing." With this commit, the parser looks for empty classes explicitly and returns an error if one is found. Interestingly, other regex engines allow this particular idiosyncrasy and interpret it as "never match." Even more interestingly, expressions like `a{0}` are also allowed (including by this regex library) and are interpreted as "always match the empty string." Both seem semantically the same. In any case, we forbid empty character classes, primarily because that seems like the sensible thing to do and secondarily because it's the conservative choice. It seems plausible that such a construct could occasionally be useful if one were machine-generating regexes, because it could be used to indicate "never match." If we do want to support that use case, we'll need to add a new opcode to the regex matching engines. One can still achieve that today using something like `(a|[^a])`. Fixes #257, where using such a form caused an assert to trip in the compiler. A new, more explicit assert has been added.
rust-lang · BurntSushi · Jul 10, 2016 · Jul 9, 2016 · Jul 9, 2016 · Jul 9, 2016
commit 9062f38eff7b8030bc5dba7aa03bf7bb144c82b2
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
@@ -1336,6 +1336,9 @@ pub enum ErrorKind {
     /// This never returned if the parser is permitted to allow expressions
     /// that match arbitrary bytes.
     InvalidUtf8,
+    /// A character class was constructed such that it is empty.
+    /// e.g., `[^\d\D]`.
+    EmptyClass,
     /// Hints that destructuring should not be exhaustive.
     ///
     /// This enum may grow additional variants, so this makes sure clients
@@ -1398,6 +1401,7 @@ impl ErrorKind {
             FlagNotAllowed(_) => "flag not allowed",
             UnicodeNotAllowed => "Unicode features not allowed",
             InvalidUtf8 => "matching arbitrary bytes is not allowed",
+            EmptyClass => "empty character class",
             __Nonexhaustive => unreachable!(),
         }
     }
@@ -1507,6 +1511,8 @@ impl fmt::Display for ErrorKind {
                            (u) flag is not set."),
             InvalidUtf8 =>
                 write!(f, "Matching arbitrary bytes is not allowed."),
+            EmptyClass =>
+                write!(f, "Empty character classes are not allowed."),
             __Nonexhaustive => unreachable!(),
         }
     }

diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
@@ -587,6 +587,9 @@ impl Parser {
             }
         }
         class = self.class_transform(negated, class).canonicalize();
+        if class.is_empty() {
+            return Err(self.err(ErrorKind::EmptyClass));
+        }
         Ok(Build::Expr(if self.flags.unicode {
             Expr::Class(class)
         } else {
@@ -1277,7 +1280,7 @@ mod tests {
         ErrorKind,
     };
     use unicode::regex::{PERLD, PERLS, PERLW};
-    use super::{LOWER, UPPER, Flags, Parser, ascii_class};
+    use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};
 
     static YI: &'static [(char, char)] = &[
         ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
@@ -2127,10 +2130,10 @@ mod tests {
 
     #[test]
     fn class_multiple_class_negate_negate() {
-        let nperld = class(PERLD).negate();
+        let nperlw = class(PERLW).negate();
         let nyi = class(YI).negate();
-        let cls = CharClass::empty().merge(nperld).merge(nyi);
-        assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate()));
+        let cls = CharClass::empty().merge(nperlw).merge(nyi);
+        assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
     }
 
     #[test]
@@ -2149,10 +2152,10 @@ mod tests {
 
     #[test]
     fn class_multiple_class_negate_negate_casei() {
-        let nperld = class(PERLD).negate();
+        let nperlw = class(PERLW).negate();
         let nyi = class(YI).negate();
-        let class = CharClass::empty().merge(nperld).merge(nyi);
-        assert_eq!(p(r"(?i)[^\D\P{Yi}]"),
+        let class = CharClass::empty().merge(nperlw).merge(nyi);
+        assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
                    Expr::Class(class.case_fold().negate()));
     }
 
@@ -2236,10 +2239,10 @@ mod tests {
 
     #[test]
     fn ascii_classes_negate_multiple() {
-        let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate());
-        let cls = CharClass::empty().merge(nlower).merge(nupper);
-        assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone()));
-        assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate()));
+        let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
+        let cls = CharClass::empty().merge(nlower).merge(nword);
+        assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
+        assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
     }
 
     #[test]
@@ -2725,6 +2728,7 @@ mod tests {
     fn error_class_empty_range() {
         test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
         test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
+        test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
     }
 
     #[test]

diff --git a/src/compile.rs b/src/compile.rs
@@ -372,6 +372,7 @@ impl Compiler {
     }
 
     fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
+        assert!(!ranges.is_empty());
         if self.compiled.uses_bytes() {
             CompileClass {
                 c: self,