Add support for C++ boost::regex to benchmarks

This is very similar to the `std::regex` benchmark implementation since Boost.Regex and `std::regex` have very similar APIs and regex grammar support. As such, it uses the `stdcpp` Rust and C FFIs to reduce code duplication.

* bench/Cargo.toml: add `re-boost` feature
* bench/build.rs: add `cboost` library to bench build. This uses a compiler preprocessor definition to indicate whether or not to use Boost when compiling the `stdcpp` FFI.
* bench/compile: add `re-boost` feature to bench compile script
* bench/run: add `re-boost` feature to bench run script
* bench/src/bench.rs: use `ffi::stdcpp::Regex`, define its `text!` macro, and `Text` type for feature `re-boost`
* bench/src/ffi/mod.rs: declare `stdcpp` module for `re-boost` feature
* bench/src/ffi/stdcpp.cpp: implement C API using C++ `boost::regex`. The Boost.Regex API is very similar to the `std::regex` API and therefore only uses a different namespace.
* bench/src/main.rs: add boost to bench main
* bench/src/misc.rs:
  - do not run `match_class_unicode` benchmark for `re-boost` feature because `boost::regex` ECMAScript grammar does not support Unicode character classes
* bench/src/sherlock.rs:
  - do not run `letters`, `letters_upper`, and `letters_lower` benchmarks for `re-boost` feature because `boost::regex` ECMAScript grammar does not support Unicode character classes
  - use a different regex for `everything_greedy` benchmark because `boost::regex` '.' does not match '\r'
  - `words` benchmark for `boost::regex` matches RE2 test result, so use that test for `re-boost` feature as well. Also fixes conditional compilation issue for `re-stdcpp`.
  - do not run `holmes_coword_watson` benchmark for `re-boost` feature because Boost.Regex implementation currently seems to have exponential behavior here
rust-lang · mkrupcale · Mar 15, 2018 · Mar 16, 2018 · Mar 24, 2018 · Mar 24, 2018
commit 6c5158e300062e78a276ee2c3767801c3156b745
diff --git a/bench/Cargo.toml b/bench/Cargo.toml
@@ -47,8 +47,9 @@ re-pcre1 = ["libpcre-sys"]
 re-pcre2 = []
 re-onig = ["onig"]
 re-stdcpp = []
-re-re2 = []
 libcxx = []
+re-boost = []
+re-re2 = []
 re-dphobos = []
 re-dphobos-dmd = ["re-dphobos"]
 re-dphobos-dmd-ct = ["re-dphobos-dmd"]

diff --git a/bench/build.rs b/bench/build.rs
@@ -35,6 +35,15 @@ fn main() {
                 .compile("libcstdcpp.a");
         }
     }
+    if env::var("CARGO_FEATURE_RE_BOOST").is_ok() {
+        // Boost.Regex is a C++ library, so compile the stdcpp shim with USE_BOOST.
+        cc::Build::new()
+            .cpp(true)
+            .file("src/ffi/stdcpp.cpp")
+            .define("USE_BOOST", None)
+            .compile("libcboost.a");
+        println!("cargo:rustc-link-lib=boost_regex");
+    }
     if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
         // RE2 is a C++ library, so we need to compile our shim layer.
         cc::Build::new()

diff --git a/bench/compile b/bench/compile
@@ -2,5 +2,5 @@
 
 exec cargo build \
   --release \
-  --features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
+  --features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
   "$@"
diff --git a/bench/run b/bench/run
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 usage() {
-  echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | re2 | onig | tcl ]" >&2
+  echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2
   exit 1
 }
 
@@ -36,6 +36,9 @@ case $which in
   stdcpp-libcxx)
     exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@"
     ;;
+  boost)
+    exec cargo bench --bench bench --features re-boost "$@"
+    ;;
   re2)
     exec cargo bench --bench bench --features re-re2 "$@"
     ;;

diff --git a/bench/src/bench.rs b/bench/src/bench.rs
@@ -28,14 +28,16 @@ extern crate regex;
 extern crate regex_syntax;
 extern crate test;
 
-
 #[cfg(feature = "re-onig")]
 pub use ffi::onig::Regex;
 #[cfg(feature = "re-pcre1")]
 pub use ffi::pcre1::Regex;
 #[cfg(feature = "re-pcre2")]
 pub use ffi::pcre2::Regex;
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 pub use ffi::stdcpp::Regex;
 #[cfg(feature = "re-re2")]
 pub use ffi::re2::Regex;
@@ -93,6 +95,7 @@ macro_rules! text {
     feature = "re-pcre1",
     feature = "re-pcre2",
     feature = "re-stdcpp",
+    feature = "re-boost",
     feature = "re-re2",
     feature = "re-dphobos",
     feature = "re-rust",
@@ -111,6 +114,7 @@ type Text = Vec<u8>;
     feature = "re-pcre1",
     feature = "re-pcre2",
     feature = "re-stdcpp",
+    feature = "re-boost",
     feature = "re-re2",
     feature = "re-dphobos",
     feature = "re-rust",

diff --git a/bench/src/ffi/mod.rs b/bench/src/ffi/mod.rs
@@ -20,7 +20,10 @@ pub mod onig;
 pub mod pcre1;
 #[cfg(feature = "re-pcre2")]
 pub mod pcre2;
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 pub mod stdcpp;
 #[cfg(feature = "re-re2")]
 pub mod re2;

diff --git a/bench/src/ffi/stdcpp.cpp b/bench/src/ffi/stdcpp.cpp
@@ -1,6 +1,17 @@
+#ifdef USE_BOOST
+#include <boost/regex.hpp>
+#else
 #include <regex>
+#endif
 
 extern "C" {
+
+#ifdef USE_BOOST
+    namespace regex_ns = boost;
+#else
+    namespace regex_ns = std;
+#endif
+
     typedef void stdcpp_regexp;
 
     typedef struct stdcpp_string {
@@ -9,34 +20,34 @@ extern "C" {
     } stdcpp_string;
 
     stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
-        return reinterpret_cast<stdcpp_regexp*>(new std::regex(pat.text,
-							       pat.len,
-							       std::regex::optimize));
+	return reinterpret_cast<stdcpp_regexp*>(new regex_ns::regex(pat.text,
+								    pat.len,
+								    regex_ns::regex::optimize));
     }
 
     void stdcpp_regexp_free(stdcpp_regexp *re) {
-        delete reinterpret_cast<std::regex*>(re);
+	delete reinterpret_cast<regex_ns::regex*>(re);
     }
 
     bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
 			     int startpos, int endpos) {
-	std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
-        return std::regex_search(text.text + startpos, text.text + endpos,
-				 cpp_re);
+	regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
+	return regex_ns::regex_search(text.text + startpos, text.text + endpos,
+				      cpp_re);
     }
 
     bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
 			    int startpos, int endpos,
 			    int *match_start, int *match_end) {
-	std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
-	std::cmatch result;
-        bool matched;
-        matched = std::regex_search(text.text + startpos, text.text + endpos,
-				    result, cpp_re);
-        if (matched) {
+	regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
+	regex_ns::cmatch result;
+	bool matched;
+	matched = regex_ns::regex_search(text.text + startpos, text.text + endpos,
+					 result, cpp_re);
+	if (matched) {
 	    *match_start = result[0].first - text.text;
 	    *match_end = *match_start + result.length(0);
-        }
-        return matched;
+	}
+	return matched;
     }
 }
diff --git a/bench/src/main.rs b/bench/src/main.rs
@@ -135,8 +135,15 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
     Regex::new(pat).unwrap().find_iter(haystack).count()
 }
 
+#[cfg(not(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  )))]
 nada!("re-stdcpp", count_stdcpp);
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 fn count_stdcpp(pat: &str, haystack: &str) -> usize {
     use ffi::stdcpp::Regex;
     Regex::new(pat).unwrap().find_iter(haystack).count()

diff --git a/bench/src/misc.rs b/bench/src/misc.rs
@@ -46,7 +46,9 @@ bench_match!(match_class_in_range, "[ac]", {
 });
 
 #[cfg(not(feature = "re-rust-bytes"))]
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 bench_match!(match_class_unicode, r"\p{L}", {
     format!("{}a", repeat("☃5☃5").take(20).collect::<String>())

diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs
@@ -106,10 +106,14 @@ sherlock!(the_whitespace, r"the\s+\w+", 5410);
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(everything_greedy, r".*", 13053);
 // std::regex . does not match \r
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 sherlock!(everything_greedy, r"[^\n]*", 13053);
 #[cfg(not(feature = "re-dphobos"))]
 #[cfg(not(feature = "re-onig"))]
@@ -122,24 +126,34 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1);
 
 // How fast can we match every letter? This also defeats any clever prefix
 // tricks.
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters, r"\p{L}", 447160);
 
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters_upper, r"\p{Lu}", 14180);
 
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters_lower, r"\p{Ll}", 432980);
 
 // Similarly, for words.
-#[cfg(not(feature = "re-re2"))]
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
+#[cfg(not(feature = "re-re2"))]
 sherlock!(words, r"\w+", 109214);
-#[cfg(feature = "re-re2")]
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+    feature = "re-re2",
+  ))]
 sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
 
 // Find complete words before Holmes. The `\w` defeats any prefix
@@ -162,6 +176,7 @@ sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(
     holmes_coword_watson,
@@ -178,13 +193,17 @@ sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
 // lazy DFA the entire way.
 // std C++ does not support multiline until C++17 nor the inline modifier syntax
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-dphobos"))]
 sherlock!(
     line_boundary_sherlock_holmes,
     r"(?m)^Sherlock Holmes|Sherlock Holmes$",
     34);
 // D matches both \r\n and \n as EOL
-#[cfg(feature = "re-dphobos")]
+#[cfg(any(
+    feature = "re-boost",
+    feature = "re-dphobos",
+  ))]
 sherlock!(
     line_boundary_sherlock_holmes,
     r"(?m)^Sherlock Holmes|Sherlock Holmes$",