Add support for C++ std::regex to benchmarks

* bench/Cargo.toml: add `re-stdcpp` feature
* bench/build.rs: add `cstdcpp` library to bench build
* bench/compile: add `re-stdcpp` feature to bench compile script
* bench/run: add `re-stdcpp` feature to bench run script
* bench/src/bench.rs: use `ffi::stdcpp::Regex`, define its `text!` macro, and `Text` type
* bench/src/ffi/mod.rs: declare `stdcpp` module
* bench/src/ffi/stdcpp.cpp: implement C API using C++ `std::regex`
* bench/src/ffi/stdcpp.rs: Rust `Regex` API implementation using C++ `std::regex` C API wrapper
* bench/src/main.rs: add stdcpp to bench main
* bench/src/misc.rs:
  - do not run `no_exponential` benchmark for `re-stdcpp` feature because `libstdc++` `std::regex` implementation currently seems to have exponential behavior here
  - do not run `match_class_unicode` benchmark for `re-stdcpp` feature because `std::regex` ECMAScript grammar does not support Unicode character classes
* bench/src/sherlock.rs:
  - do not run `name_sherlock_nocase`, `name_holmes_nocase`, `name_sherlock_holmes_nocase`, `name_alt3_nocase`, `name_alt4_nocase`, `name_alt5_nocase`, `the_nocase`, `everything_greedy_nl`, and `line_boundary_sherlock_holmes` benchmarks for `re-stdcpp` feature because `std::regex` ECMAScript grammar does not support inline modifier syntax
  - do not run `letters`, `letters_upper`, and `letters_lower` benchmarks for `re-stdcpp` feature because `std::regex` ECMAScript grammar does not support Unicode character classes
  - use a different regex for `everything_greedy` benchmark because `std::regex` '.' does not match '\r'
  - `words` benchmark for `std::regex` matches RE2 test result, so use that test for `re-stdcpp` feature as well
  - do not run `holmes_coword_watson` benchmark for `re-stdcpp` feature because `libstdc++` `std::regex` implementation currently seems to have exponential behavior here
rust-lang · mkrupcale · Mar 15, 2018 · Mar 16, 2018 · Mar 24, 2018 · Mar 15, 2018
commit abc1ce29746d2cfb3a68ab1f49add191eb6f1ce0
diff --git a/bench/Cargo.toml b/bench/Cargo.toml
@@ -46,6 +46,7 @@ bench = false
 re-pcre1 = ["libpcre-sys"]
 re-pcre2 = []
 re-onig = ["onig"]
+re-stdcpp = []
 re-re2 = []
 re-dphobos = []
 re-dphobos-dmd = ["re-dphobos"]

diff --git a/bench/build.rs b/bench/build.rs
@@ -18,6 +18,13 @@ fn main() {
     if env::var("CARGO_FEATURE_RE_PCRE2").is_ok() {
         pkg_config::probe_library("libpcre2-8").unwrap();
     }
+    if env::var("CARGO_FEATURE_RE_STDCPP").is_ok() {
+        // stdcpp is a C++ library, so we need to compile our shim layer.
+        cc::Build::new()
+            .cpp(true)
+            .file("src/ffi/stdcpp.cpp")
+            .compile("libcstdcpp.a");
+    }
     if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
         // RE2 is a C++ library, so we need to compile our shim layer.
         cc::Build::new()

diff --git a/bench/compile b/bench/compile
@@ -2,5 +2,5 @@
 
 exec cargo build \
   --release \
-  --features 're-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
+  --features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
   "$@"
diff --git a/bench/run b/bench/run
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 usage() {
-  echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | re2 | onig | tcl ]" >&2
+  echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | re2 | onig | tcl ]" >&2
   exit 1
 }
 
@@ -30,6 +30,9 @@ case $which in
   rust-bytes)
     exec cargo bench --bench bench --features re-rust-bytes "$@"
     ;;
+  stdcpp)
+    exec cargo bench --bench bench --features re-stdcpp "$@"
+    ;;
   re2)
     exec cargo bench --bench bench --features re-re2 "$@"
     ;;

diff --git a/bench/src/bench.rs b/bench/src/bench.rs
@@ -35,6 +35,8 @@ pub use ffi::onig::Regex;
 pub use ffi::pcre1::Regex;
 #[cfg(feature = "re-pcre2")]
 pub use ffi::pcre2::Regex;
+#[cfg(feature = "re-stdcpp")]
+pub use ffi::stdcpp::Regex;
 #[cfg(feature = "re-re2")]
 pub use ffi::re2::Regex;
 #[cfg(feature = "re-dphobos")]
@@ -90,6 +92,7 @@ macro_rules! text {
     feature = "re-onig",
     feature = "re-pcre1",
     feature = "re-pcre2",
+    feature = "re-stdcpp",
     feature = "re-re2",
     feature = "re-dphobos",
     feature = "re-rust",
@@ -107,6 +110,7 @@ type Text = Vec<u8>;
     feature = "re-onig",
     feature = "re-pcre1",
     feature = "re-pcre2",
+    feature = "re-stdcpp",
     feature = "re-re2",
     feature = "re-dphobos",
     feature = "re-rust",

diff --git a/bench/src/ffi/mod.rs b/bench/src/ffi/mod.rs
@@ -20,6 +20,8 @@ pub mod onig;
 pub mod pcre1;
 #[cfg(feature = "re-pcre2")]
 pub mod pcre2;
+#[cfg(feature = "re-stdcpp")]
+pub mod stdcpp;
 #[cfg(feature = "re-re2")]
 pub mod re2;
 #[cfg(feature = "re-tcl")]

diff --git a/bench/src/ffi/stdcpp.cpp b/bench/src/ffi/stdcpp.cpp
@@ -0,0 +1,42 @@
+#include <regex>
+
+namespace {
+    // Flags for a search that begins at byte offset `startpos` of a larger
+    // haystack. When the search resumes inside the haystack the preceding
+    // character exists, so the engine is told about it; otherwise `^` and
+    // `\b` would treat every resumed position as the beginning of the input.
+    std::regex_constants::match_flag_type search_flags(int startpos) {
+        return startpos > 0 ? std::regex_constants::match_prev_avail
+                            : std::regex_constants::match_default;
+    }
+}
+
+// Every entry point below is `noexcept`: std::regex reports failures (invalid
+// pattern, complexity or stack limits) by throwing, and an exception must not
+// unwind through a C ABI into a non-C++ caller. With `noexcept` such a failure
+// calls std::terminate instead.
+extern "C" {
+    // Opaque handle; always points at a heap-allocated std::regex created by
+    // stdcpp_regexp_new.
+    typedef void stdcpp_regexp;
+
+    // Borrowed byte string: `len` bytes starting at `text`. It does not have
+    // to be NUL-terminated.
+    typedef struct stdcpp_string {
+        const char *text;
+        int len;
+    } stdcpp_string;
+
+    // Compiles the first `pat.len` bytes of `pat.text` with the default
+    // (ECMAScript) grammar. The caller owns the returned handle and must
+    // release it with stdcpp_regexp_free.
+    stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) noexcept {
+        return static_cast<stdcpp_regexp*>(
+            new std::regex(pat.text, pat.len, std::regex::optimize));
+    }
+
+    // Destroys a regex created by stdcpp_regexp_new. NULL is accepted.
+    void stdcpp_regexp_free(stdcpp_regexp *re) noexcept {
+        delete static_cast<std::regex*>(re);
+    }
+
+    // Returns true if `re` matches anywhere in bytes [startpos, endpos) of
+    // `text`.
+    bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
+                             int startpos, int endpos) noexcept {
+        // Bind by reference: copying the regex on every call would add
+        // per-search overhead that has nothing to do with matching.
+        const std::regex &cpp_re = *static_cast<const std::regex*>(re);
+        return std::regex_search(text.text + startpos, text.text + endpos,
+                                 cpp_re, search_flags(startpos));
+    }
+
+    // Searches bytes [startpos, endpos) of `text` for the first match of
+    // `re`. On success returns true and stores the match bounds, as byte
+    // offsets from the start of `text`, in `*match_start` and `*match_end`.
+    // On failure returns false and leaves both outputs untouched.
+    bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
+                            int startpos, int endpos,
+                            int *match_start, int *match_end) noexcept {
+        const std::regex &cpp_re = *static_cast<const std::regex*>(re);
+        std::cmatch result;
+        const bool matched = std::regex_search(
+            text.text + startpos, text.text + endpos, result, cpp_re,
+            search_flags(startpos));
+        if (matched) {
+            *match_start = static_cast<int>(result[0].first - text.text);
+            *match_end = *match_start + static_cast<int>(result.length(0));
+        }
+        return matched;
+    }
+}
diff --git a/bench/src/ffi/stdcpp.rs b/bench/src/ffi/stdcpp.rs
@@ -0,0 +1,163 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![allow(non_camel_case_types)]
+
+use libc::{c_uchar, c_int, c_void};
+
+/// Regex owns a C++ `std::regex` created by `stdcpp_regexp_new` (stdcpp.cpp).
+///
+/// It is `Send` but not `Sync`: one thread at a time may use a given value.
+pub struct Regex {
+    re: *mut stdcpp_regexp, // handle released by `stdcpp_regexp_free` in `Drop`
+}
+
+unsafe impl Send for Regex {} // NOTE(review): presumes std::regex has no thread affinity
+
+impl Drop for Regex {
+    /// Releases the compiled regex through the C++ shim.
+    fn drop(&mut self) {
+        let handle = self.re;
+        // `handle` came from `stdcpp_regexp_new` and is freed only here.
+        unsafe { stdcpp_regexp_free(handle) }
+    }
+}
+
+#[derive(Debug)]
+pub struct Error(()); // opaque; `Regex::new` in this file never returns it
+
+impl Regex {
+    /// Compiles `pattern` through the C++ shim.
+    ///
+    /// Always returns `Ok`: the shim hands back nothing but a handle, so no
+    /// compile error is ever reported to Rust.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `pattern.len()` does not fit in a `c_int`.
+    pub fn new(pattern: &str) -> Result<Regex, Error> {
+        Regex::offset(pattern.len());
+        unsafe { Ok(Regex { re: stdcpp_regexp_new(pattern.into()) }) }
+    }
+
+    /// Returns true if the regex matches anywhere in `text`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `text.len()` does not fit in a `c_int`.
+    pub fn is_match(&self, text: &str) -> bool {
+        let end = Regex::offset(text.len());
+        unsafe { stdcpp_regexp_match(self.re, text.into(), 0, end) }
+    }
+
+    /// Returns an iterator over the successive matches in `text`, each given
+    /// as a `(start, end)` pair of byte offsets.
+    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
+        FindMatches {
+            re: self,
+            text: text,
+            last_end: 0,
+            last_match: None,
+        }
+    }
+
+    /// Finds the first match in `text[start..]`.
+    ///
+    /// The returned `(start, end)` offsets are relative to the beginning of
+    /// `text`, not to `start`.
+    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
+        let begin = Regex::offset(start);
+        let end = Regex::offset(text.len());
+        let (mut s, mut e): (c_int, c_int) = (0, 0);
+        let matched = unsafe {
+            stdcpp_regexp_find(self.re, text.into(), begin, end, &mut s, &mut e)
+        };
+        if matched {
+            Some((s as usize, e as usize))
+        } else {
+            None
+        }
+    }
+
+    /// Converts a byte offset to the `c_int` the shim expects.
+    ///
+    /// A plain `as` cast would silently wrap for offsets of 2 GiB or more
+    /// and hand the shim a bogus (possibly negative) bound, so the
+    /// conversion is checked instead.
+    fn offset(n: usize) -> c_int {
+        assert!(
+            n <= c_int::max_value() as usize,
+            "offset {} does not fit in a C int",
+            n
+        );
+        n as c_int
+    }
+}
+
+pub struct FindMatches<'r, 't> {
+    re: &'r Regex, // regex being searched with
+    text: &'t str, // haystack
+    last_end: usize, // byte offset at which the next search starts
+    last_match: Option<usize>, // offset recorded for the previously yielded match
+}
+
+// This implementation mirrors the match iterator of Rust's regex engine, on
+// the assumption that std::regex reports empty matches in the same way.
+impl<'r, 't> Iterator for FindMatches<'r, 't> {
+    type Item = (usize, usize);
+
+    /// Returns the next match as `(start, end)` byte offsets into the
+    /// haystack, or `None` when no further match exists.
+    fn next(&mut self) -> Option<(usize, usize)> {
+        /// Returns the offset just past the UTF-8 sequence whose lead byte is
+        /// at `i`. When `i` is at or past the end of `text` it returns
+        /// `text.len() + 1`, which the bounds check below treats as "done".
+        fn next_after_empty(text: &str, i: usize) -> usize {
+            let b = match text.as_bytes().get(i) {
+                None => return text.len() + 1,
+                Some(&b) => b,
+            };
+            let inc = if b <= 0x7F {
+                1
+            } else if b <= 0b110_11111 {
+                2
+            } else if b <= 0b1110_1111 {
+                3
+            } else {
+                4
+            };
+            i + inc
+        }
+
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let (s, e) = match self.re.find_at(self.text, self.last_end) {
+            None => return None,
+            Some(m) => m,
+        };
+        assert!(s >= self.last_end);
+        if s == e {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = next_after_empty(self.text, e);
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(e) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = e;
+        }
+        // Record where this match ended, not where the next search starts.
+        // After an empty match the two differ, and recording the advanced
+        // offset would make the check above discard a legitimate empty match
+        // at the very next position.
+        self.last_match = Some(e);
+        Some((s, e))
+    }
+}
+
+// stdcpp FFI is below. It binds the hand-rolled C API that stdcpp.cpp defines
+// on top of C++ std::regex.
+
+type stdcpp_regexp = c_void; // opaque: only used behind `*mut` in this file
+
+#[repr(C)]
+struct stdcpp_string {
+    text: *const c_uchar, // first byte; stdcpp.cpp declares this as `const char *`
+    len: c_int, // length in bytes
+}
+
+impl<'a> From<&'a str> for stdcpp_string {
+    /// Borrows `s` as a pointer/length pair. No bytes are copied, so the
+    /// result is only meaningful while `s` is alive.
+    fn from(s: &'a str) -> stdcpp_string {
+        let bytes = s.as_bytes();
+        stdcpp_string {
+            text: bytes.as_ptr(),
+            len: bytes.len() as c_int,
+        }
+    }
+}
+
+extern "C" {
+    /// Compiles `pat` and returns a handle to the new regex.
+    fn stdcpp_regexp_new(pat: stdcpp_string) -> *mut stdcpp_regexp;
+
+    /// Destroys a regex returned by `stdcpp_regexp_new`.
+    fn stdcpp_regexp_free(re: *mut stdcpp_regexp);
+
+    /// Reports whether `re` matches anywhere in bytes `startpos..endpos` of
+    /// `text`.
+    fn stdcpp_regexp_match(
+        re: *mut stdcpp_regexp,
+        text: stdcpp_string,
+        startpos: c_int,
+        endpos: c_int,
+    ) -> bool;
+
+    /// Searches bytes `startpos..endpos` of `text`. On success returns true
+    /// and stores the match bounds, as byte offsets from the start of `text`,
+    /// in `match_start` and `match_end`.
+    fn stdcpp_regexp_find(
+        re: *mut stdcpp_regexp,
+        text: stdcpp_string,
+        startpos: c_int,
+        endpos: c_int,
+        match_start: *mut c_int,
+        match_end: *mut c_int,
+    ) -> bool;
+}
diff --git a/bench/src/main.rs b/bench/src/main.rs
@@ -45,7 +45,7 @@ Since this tool includes compilation of the <pattern>, sufficiently large
 haystacks should be used to amortize the cost of compilation. (e.g., >1MB.)
 
 Usage:
-    regex-run-one [options] [onig | pcre1 | pcre2 | re2 | rust | rust-bytes | tcl] <file> <pattern>
+    regex-run-one [options] [onig | pcre1 | pcre2 | stdcpp | re2 | rust | rust-bytes | tcl] <file> <pattern>
     regex-run-one [options] (-h | --help)
 
 Options:
@@ -59,6 +59,7 @@ struct Args {
     cmd_onig: bool,
     cmd_pcre1: bool,
     cmd_pcre2: bool,
+    cmd_stdcpp: bool,
     cmd_re2: bool,
     cmd_rust: bool,
     cmd_rust_bytes: bool,
@@ -87,6 +88,8 @@ impl Args {
             count_pcre1(pat, haystack)
         } else if self.cmd_pcre2 {
             count_pcre2(pat, haystack)
+        } else if self.cmd_stdcpp {
+            count_stdcpp(pat, haystack)
         } else if self.cmd_re2 {
             count_re2(pat, haystack)
         } else if self.cmd_rust {
@@ -132,6 +135,13 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
     Regex::new(pat).unwrap().find_iter(haystack).count()
 }
 
+nada!("re-stdcpp", count_stdcpp);
+#[cfg(feature = "re-stdcpp")]
+fn count_stdcpp(pat: &str, haystack: &str) -> usize {
+    use ffi::stdcpp::Regex;
+    // Compile once, then count every match the iterator yields.
+    let re = Regex::new(pat).unwrap();
+    let matches = re.find_iter(haystack);
+    matches.count()
+}
+
 nada!("re-re2", count_re2);
 #[cfg(feature = "re-re2")]
 fn count_re2(pat: &str, haystack: &str) -> usize {

diff --git a/bench/src/misc.rs b/bench/src/misc.rs
@@ -19,6 +19,7 @@ use {Regex, Text};
 #[cfg(not(feature = "re-onig"))]
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
+#[cfg(not(feature = "re-stdcpp"))]
 #[cfg(not(feature = "re-dphobos-dmd-ct"))]
 #[cfg(not(feature = "re-dphobos-ldc-ct"))]
 bench_match!(no_exponential, {
@@ -45,6 +46,7 @@ bench_match!(match_class_in_range, "[ac]", {
 });
 
 #[cfg(not(feature = "re-rust-bytes"))]
+#[cfg(not(feature = "re-stdcpp"))]
 #[cfg(not(feature = "re-tcl"))]
 bench_match!(match_class_unicode, r"\p{L}", {
     format!("{}a", repeat("☃5☃5").take(20).collect::<String>())