Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Add support for C++ boost::regex to benchmarks
This is very similar to the `std::regex` benchmark implementation since Boost.Regex and `std::regex` have very similar APIs and regex grammar support. As such, it uses the `stdcpp` Rust and C FFIs to reduce code duplication.

 * bench/Cargo.toml: add `re-boost` feature
 * bench/build.rs: add `cboost` library to bench build. This uses a compiler preprocessor definition to indicate whether or not to use Boost when compiling the `stdcpp` FFI.
 * bench/compile: add `re-boost` feature to bench compile script
 * bench/run: add `re-boost` feature to bench run script
 * bench/src/bench.rs: use `ffi::stdcpp::Regex`, define its `text!` macro, and `Text` type for feature `re-boost`
 * bench/src/ffi/mod.rs: declare `stdcpp` module for `re-boost` feature
 * bench/src/ffi/stdcpp.cpp: implement C API using C++ `boost::regex`. The Boost.Regex API is very similar to the `std::regex` API, so the implementation differs only in the namespace it uses.
 * bench/src/main.rs: add boost to bench main
 * bench/src/misc.rs:
    - do not run `match_class_unicode` benchmark for `re-boost` feature because `boost::regex` ECMAScript grammar does not support unicode character classes
 * bench/src/sherlock.rs:
    - do not run `letters`, `letters_upper`, and `letters_lower` benchmarks for `re-boost` feature because `boost::regex` ECMAScript grammar does not support unicode character classes
    - use a different regex for `everything_greedy` benchmark because `boost::regex` '.' does not match '\r'
    - `words` benchmark for `boost::regex` matches RE2 test result, so use that test for `re-boost` feature as well. Also fixes conditional compilation issue for `re-stdcpp`.
    - do not run `holmes_coword_watson` benchmark for `re-boost` feature because Boost.Regex implementation currently seems to have exponential behavior here
  • Loading branch information
mkrupcale committed Mar 24, 2018
commit 6c5158e300062e78a276ee2c3767801c3156b745
3 changes: 2 additions & 1 deletion bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ re-pcre1 = ["libpcre-sys"]
re-pcre2 = []
re-onig = ["onig"]
re-stdcpp = []
re-re2 = []
libcxx = []
re-boost = []
re-re2 = []
re-dphobos = []
re-dphobos-dmd = ["re-dphobos"]
re-dphobos-dmd-ct = ["re-dphobos-dmd"]
Expand Down
9 changes: 9 additions & 0 deletions bench/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ fn main() {
.compile("libcstdcpp.a");
}
}
if env::var("CARGO_FEATURE_RE_BOOST").is_ok() {
// Boost.Regex is a C++ library, so we need to compile our shim layer (the shared stdcpp shim, built with USE_BOOST defined).
cc::Build::new()
.cpp(true)
.file("src/ffi/stdcpp.cpp")
.define("USE_BOOST", None)
.compile("libcboost.a");
println!("cargo:rustc-link-lib=boost_regex");
}
if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
// RE2 is a C++ library, so we need to compile our shim layer.
cc::Build::new()
Expand Down
2 changes: 1 addition & 1 deletion bench/compile
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

exec cargo build \
--release \
--features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
--features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
"$@"
5 changes: 4 additions & 1 deletion bench/run
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

usage() {
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | re2 | onig | tcl ]" >&2
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2
exit 1
}

Expand Down Expand Up @@ -36,6 +36,9 @@ case $which in
stdcpp-libcxx)
exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@"
;;
boost)
exec cargo bench --bench bench --features re-boost "$@"
;;
re2)
exec cargo bench --bench bench --features re-re2 "$@"
;;
Expand Down
8 changes: 6 additions & 2 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,16 @@ extern crate regex;
extern crate regex_syntax;
extern crate test;


#[cfg(feature = "re-onig")]
pub use ffi::onig::Regex;
#[cfg(feature = "re-pcre1")]
pub use ffi::pcre1::Regex;
#[cfg(feature = "re-pcre2")]
pub use ffi::pcre2::Regex;
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
pub use ffi::stdcpp::Regex;
#[cfg(feature = "re-re2")]
pub use ffi::re2::Regex;
Expand Down Expand Up @@ -93,6 +95,7 @@ macro_rules! text {
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand All @@ -111,6 +114,7 @@ type Text = Vec<u8>;
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand Down
5 changes: 4 additions & 1 deletion bench/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ pub mod onig;
pub mod pcre1;
#[cfg(feature = "re-pcre2")]
pub mod pcre2;
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
pub mod stdcpp;
#[cfg(feature = "re-re2")]
pub mod re2;
Expand Down
41 changes: 26 additions & 15 deletions bench/src/ffi/stdcpp.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
#ifdef USE_BOOST
#include <boost/regex.hpp>
#else
#include <regex>
#endif

extern "C" {

#ifdef USE_BOOST
namespace regex_ns = boost;
#else
namespace regex_ns = std;
#endif

typedef void stdcpp_regexp;

typedef struct stdcpp_string {
Expand All @@ -9,34 +20,34 @@ extern "C" {
} stdcpp_string;

stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
return reinterpret_cast<stdcpp_regexp*>(new std::regex(pat.text,
pat.len,
std::regex::optimize));
return reinterpret_cast<stdcpp_regexp*>(new regex_ns::regex(pat.text,
pat.len,
regex_ns::regex::optimize));
}

void stdcpp_regexp_free(stdcpp_regexp *re) {
delete reinterpret_cast<std::regex*>(re);
delete reinterpret_cast<regex_ns::regex*>(re);
}

bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
int startpos, int endpos) {
std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
return std::regex_search(text.text + startpos, text.text + endpos,
cpp_re);
regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
return regex_ns::regex_search(text.text + startpos, text.text + endpos,
cpp_re);
}

bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
int startpos, int endpos,
int *match_start, int *match_end) {
std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
std::cmatch result;
bool matched;
matched = std::regex_search(text.text + startpos, text.text + endpos,
result, cpp_re);
if (matched) {
regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
regex_ns::cmatch result;
bool matched;
matched = regex_ns::regex_search(text.text + startpos, text.text + endpos,
result, cpp_re);
if (matched) {
*match_start = result[0].first - text.text;
*match_end = *match_start + result.length(0);
}
return matched;
}
return matched;
}
}
9 changes: 8 additions & 1 deletion bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,15 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
Regex::new(pat).unwrap().find_iter(haystack).count()
}

#[cfg(not(any(
feature = "re-stdcpp",
feature = "re-boost",
)))]
nada!("re-stdcpp", count_stdcpp);
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
fn count_stdcpp(pat: &str, haystack: &str) -> usize {
use ffi::stdcpp::Regex;
Regex::new(pat).unwrap().find_iter(haystack).count()
Expand Down
2 changes: 2 additions & 0 deletions bench/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ bench_match!(match_class_in_range, "[ac]", {
});

#[cfg(not(feature = "re-rust-bytes"))]
// std C++ and boost::regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
bench_match!(match_class_unicode, r"\p{L}", {
format!("{}a", repeat("☃5☃5").take(20).collect::<String>())
Expand Down
29 changes: 24 additions & 5 deletions bench/src/sherlock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,14 @@ sherlock!(the_whitespace, r"the\s+\w+", 5410);
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(everything_greedy, r".*", 13053);
// In std::regex and boost::regex, `.` does not match \r
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
sherlock!(everything_greedy, r"[^\n]*", 13053);
#[cfg(not(feature = "re-dphobos"))]
#[cfg(not(feature = "re-onig"))]
Expand All @@ -122,24 +126,34 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1);

// How fast can we match every letter? This also defeats any clever prefix
// tricks.
// std C++ and boost::regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters, r"\p{L}", 447160);

// std C++ and boost::regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters_upper, r"\p{Lu}", 14180);

// std C++ and boost::regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters_lower, r"\p{Ll}", 432980);

// Similarly, for words.
#[cfg(not(feature = "re-re2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-re2"))]
sherlock!(words, r"\w+", 109214);
#[cfg(feature = "re-re2")]
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
))]
sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?

// Find complete words before Holmes. The `\w` defeats any prefix
Expand All @@ -162,6 +176,7 @@ sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(
holmes_coword_watson,
Expand All @@ -178,13 +193,17 @@ sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
// lazy DFA the entire way.
// std C++ does not support multiline until C++17 nor the inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-dphobos"))]
sherlock!(
line_boundary_sherlock_holmes,
r"(?m)^Sherlock Holmes|Sherlock Holmes$",
34);
// D matches both \r\n and \n as EOL
#[cfg(feature = "re-dphobos")]
#[cfg(any(
feature = "re-boost",
feature = "re-dphobos",
))]
sherlock!(
line_boundary_sherlock_holmes,
r"(?m)^Sherlock Holmes|Sherlock Holmes$",
Expand Down