diff --git a/bench/Cargo.toml b/bench/Cargo.toml
index 0423e71c1e..82496ceb66 100644
--- a/bench/Cargo.toml
+++ b/bench/Cargo.toml
@@ -46,6 +46,9 @@ bench = false
 re-pcre1 = ["libpcre-sys"]
 re-pcre2 = []
 re-onig = ["onig"]
+re-stdcpp = []
+libcxx = []
+re-boost = []
 re-re2 = []
 re-dphobos = []
 re-dphobos-dmd = ["re-dphobos"]
diff --git a/bench/build.rs b/bench/build.rs
index 06509bdbc6..6b2d755b6a 100644
--- a/bench/build.rs
+++ b/bench/build.rs
@@ -18,6 +18,27 @@ fn main() {
     if env::var("CARGO_FEATURE_RE_PCRE2").is_ok() {
         pkg_config::probe_library("libpcre2-8").unwrap();
     }
+    if env::var("CARGO_FEATURE_RE_STDCPP").is_ok() {
+        // std::regex is a C++ library, so we need to compile our shim layer.
+        let mut build = cc::Build::new();
+        build.cpp(true).file("src/ffi/stdcpp.cpp");
+        if env::var("CARGO_FEATURE_LIBCXX").is_ok() {
+            // Link against libc++ instead of the default C++ stdlib.
+            build.cpp_link_stdlib("c++");
+        }
+        build.compile("libcstdcpp.a");
+    }
+    if env::var("CARGO_FEATURE_RE_BOOST").is_ok() {
+        // Boost.Regex reuses the std::regex shim; defining USE_BOOST makes
+        // the shim include and alias boost instead of std.
+        cc::Build::new()
+            .cpp(true)
+            .file("src/ffi/stdcpp.cpp")
+            .define("USE_BOOST", None)
+            .compile("libcboost.a");
+        println!("cargo:rustc-link-lib=boost_regex");
+    }
     if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
         // RE2 is a C++ library, so we need to compile our shim layer.
cc::Build::new() diff --git a/bench/compile b/bench/compile index 698dbe3918..3a8d22ffe2 100755 --- a/bench/compile +++ b/bench/compile @@ -2,5 +2,5 @@ exec cargo build \ --release \ - --features 're-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \ + --features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \ "$@" diff --git a/bench/run b/bench/run index 6c960e2f4d..800a4d5aff 100755 --- a/bench/run +++ b/bench/run @@ -1,7 +1,7 @@ #!/bin/bash usage() { - echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | re2 | onig | tcl ]" >&2 + echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2 exit 1 } @@ -30,6 +30,15 @@ case $which in rust-bytes) exec cargo bench --bench bench --features re-rust-bytes "$@" ;; + stdcpp) + exec cargo bench --bench bench --features re-stdcpp "$@" + ;; + stdcpp-libcxx) + exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@" + ;; + boost) + exec cargo bench --bench bench --features re-boost "$@" + ;; re2) exec cargo bench --bench bench --features re-re2 "$@" ;; diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 61d53151c4..6cb56db850 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -28,13 +28,17 @@ extern crate regex; extern crate regex_syntax; extern crate test; - #[cfg(feature = "re-onig")] pub use ffi::onig::Regex; #[cfg(feature = "re-pcre1")] pub use ffi::pcre1::Regex; #[cfg(feature = "re-pcre2")] pub use ffi::pcre2::Regex; +#[cfg(any( + feature = "re-stdcpp", + feature = "re-boost", + ))] +pub use ffi::stdcpp::Regex; #[cfg(feature = "re-re2")] pub use ffi::re2::Regex; #[cfg(feature = "re-dphobos")] @@ -90,6 +94,8 @@ macro_rules! 
text { feature = "re-onig", feature = "re-pcre1", feature = "re-pcre2", + feature = "re-stdcpp", + feature = "re-boost", feature = "re-re2", feature = "re-dphobos", feature = "re-rust", @@ -107,6 +113,8 @@ type Text = Vec; feature = "re-onig", feature = "re-pcre1", feature = "re-pcre2", + feature = "re-stdcpp", + feature = "re-boost", feature = "re-re2", feature = "re-dphobos", feature = "re-rust", diff --git a/bench/src/ffi/mod.rs b/bench/src/ffi/mod.rs index 2f2e657327..e9733715ef 100644 --- a/bench/src/ffi/mod.rs +++ b/bench/src/ffi/mod.rs @@ -20,6 +20,11 @@ pub mod onig; pub mod pcre1; #[cfg(feature = "re-pcre2")] pub mod pcre2; +#[cfg(any( + feature = "re-stdcpp", + feature = "re-boost", + ))] +pub mod stdcpp; #[cfg(feature = "re-re2")] pub mod re2; #[cfg(feature = "re-tcl")] diff --git a/bench/src/ffi/stdcpp.cpp b/bench/src/ffi/stdcpp.cpp new file mode 100644 index 0000000000..d5abc9cdae --- /dev/null +++ b/bench/src/ffi/stdcpp.cpp @@ -0,0 +1,53 @@ +#ifdef USE_BOOST +#include +#else +#include +#endif + +extern "C" { + +#ifdef USE_BOOST + namespace regex_ns = boost; +#else + namespace regex_ns = std; +#endif + + typedef void stdcpp_regexp; + + typedef struct stdcpp_string { + const char *text; + int len; + } stdcpp_string; + + stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) { + return reinterpret_cast(new regex_ns::regex(pat.text, + pat.len, + regex_ns::regex::optimize)); + } + + void stdcpp_regexp_free(stdcpp_regexp *re) { + delete reinterpret_cast(re); + } + + bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text, + int startpos, int endpos) { + regex_ns::regex cpp_re(*reinterpret_cast(re)); + return regex_ns::regex_search(text.text + startpos, text.text + endpos, + cpp_re); + } + + bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text, + int startpos, int endpos, + int *match_start, int *match_end) { + regex_ns::regex cpp_re(*reinterpret_cast(re)); + regex_ns::cmatch result; + bool matched; + matched = regex_ns::regex_search(text.text + 
startpos, text.text + endpos, + result, cpp_re); + if (matched) { + *match_start = result[0].first - text.text; + *match_end = *match_start + result.length(0); + } + return matched; + } +} diff --git a/bench/src/ffi/stdcpp.rs b/bench/src/ffi/stdcpp.rs new file mode 100644 index 0000000000..48ac3d7f8f --- /dev/null +++ b/bench/src/ffi/stdcpp.rs @@ -0,0 +1,163 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![allow(non_camel_case_types)] + +use libc::{c_uchar, c_int, c_void}; + +/// Regex wraps a std::regex regular expression. +/// +/// It cannot be used safely from multiple threads simultaneously. +pub struct Regex { + re: *mut stdcpp_regexp, +} + +unsafe impl Send for Regex {} + +impl Drop for Regex { + fn drop(&mut self) { + unsafe { stdcpp_regexp_free(self.re); } + } +} + +#[derive(Debug)] +pub struct Error(()); + +impl Regex { + pub fn new(pattern: &str) -> Result { + unsafe { Ok(Regex { re: stdcpp_regexp_new(pattern.into()) }) } + } + + pub fn is_match(&self, text: &str) -> bool { + unsafe { + stdcpp_regexp_match(self.re, text.into(), 0, text.len() as c_int) + } + } + + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + FindMatches { + re: self, + text: text, + last_end: 0, + last_match: None, + } + } + + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + let (mut s, mut e): (c_int, c_int) = (0, 0); + let matched = unsafe { + stdcpp_regexp_find( + self.re, + text.into(), + start as c_int, + text.len() as c_int, + &mut s, + &mut e, + ) + }; + if matched { + Some((s as usize, e as usize)) + } else { + None + } + } +} + +pub struct FindMatches<'r, 't> { + re: &'r Regex, + text: &'t str, + last_end: usize, 
+    last_match: Option<usize>,
+}
+
+// This implementation mirrors the match iterator in Rust's regex crate, since
+// Rust's regex engine and std::regex handle empty matches in the same way.
+impl<'r, 't> Iterator for FindMatches<'r, 't> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<(usize, usize)> {
+        fn next_after_empty(text: &str, i: usize) -> usize {
+            let b = match text.as_bytes().get(i) {
+                None => return text.len() + 1,
+                Some(&b) => b,
+            };
+            let inc = if b <= 0x7F {
+                1
+            } else if b <= 0b110_11111 {
+                2
+            } else if b <= 0b1110_1111 {
+                3
+            } else {
+                4
+            };
+            i + inc
+        }
+
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let (s, e) = match self.re.find_at(self.text, self.last_end) {
+            None => return None,
+            Some((s, e)) => (s, e),
+        };
+        assert!(s >= self.last_end);
+        if s == e {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = next_after_empty(&self.text, e);
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(e) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = e;
+        }
+        self.last_match = Some(e); // end of this match, not the next search start
+        Some((s, e))
+    }
+}
+
+// stdcpp FFI is below. Note that this uses a hand-rolled C API that is defined
+// in stdcpp.cpp.
+
+type stdcpp_regexp = c_void;
+
+#[repr(C)]
+struct stdcpp_string {
+    text: *const c_uchar,
+    len: c_int,
+}
+
+impl<'a> From<&'a str> for stdcpp_string {
+    fn from(s: &'a str) -> stdcpp_string {
+        stdcpp_string { text: s.as_ptr(), len: s.len() as c_int }
+    }
+}
+
+extern "C" {
+    fn stdcpp_regexp_new(pat: stdcpp_string) -> *mut stdcpp_regexp;
+    fn stdcpp_regexp_free(re: *mut stdcpp_regexp);
+    fn stdcpp_regexp_match(
+        re: *mut stdcpp_regexp,
+        text: stdcpp_string,
+        startpos: c_int,
+        endpos: c_int,
+    ) -> bool;
+    fn stdcpp_regexp_find(
+        re: *mut stdcpp_regexp,
+        text: stdcpp_string,
+        startpos: c_int,
+        endpos: c_int,
+        match_start: *mut c_int,
+        match_end: *mut c_int,
+    ) -> bool;
+}
diff --git a/bench/src/main.rs b/bench/src/main.rs
index c590dfff7d..e4dc7c933b 100644
--- a/bench/src/main.rs
+++ b/bench/src/main.rs
@@ -45,7 +45,7 @@ Since this tool includes compilation of the <pattern>, sufficiently large
 haystacks should be used to amortize the cost of compilation. (e.g., >1MB.)
 
 Usage:
-    regex-run-one [options] [onig | pcre1 | pcre2 | re2 | rust | rust-bytes | tcl] <file> <pattern>
+    regex-run-one [options] [onig | pcre1 | pcre2 | stdcpp | re2 | rust | rust-bytes | tcl] <file> <pattern>
     regex-run-one [options] (-h | --help)
 
 Options:
@@ -59,6 +59,7 @@ struct Args {
     cmd_onig: bool,
     cmd_pcre1: bool,
     cmd_pcre2: bool,
+    cmd_stdcpp: bool,
     cmd_re2: bool,
     cmd_rust: bool,
     cmd_rust_bytes: bool,
@@ -87,6 +88,8 @@ impl Args {
             count_pcre1(pat, haystack)
         } else if self.cmd_pcre2 {
             count_pcre2(pat, haystack)
+        } else if self.cmd_stdcpp {
+            count_stdcpp(pat, haystack)
         } else if self.cmd_re2 {
             count_re2(pat, haystack)
         } else if self.cmd_rust {
@@ -132,6 +135,20 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
     Regex::new(pat).unwrap().find_iter(haystack).count()
 }
 
+#[cfg(not(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+    )))]
+nada!("re-stdcpp", count_stdcpp);
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+    ))]
+fn count_stdcpp(pat: &str, haystack: &str) -> usize {
+    use ffi::stdcpp::Regex;
+    Regex::new(pat).unwrap().find_iter(haystack).count()
+}
+
 nada!("re-re2", count_re2);
 #[cfg(feature = "re-re2")]
 fn count_re2(pat: &str, haystack: &str) -> usize {
diff --git a/bench/src/misc.rs b/bench/src/misc.rs
index 8d7cc40d42..ad516e23e4 100644
--- a/bench/src/misc.rs
+++ b/bench/src/misc.rs
@@ -19,6 +19,7 @@ use {Regex, Text};
 #[cfg(not(feature = "re-onig"))]
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
+#[cfg(not(feature = "re-stdcpp"))]
 #[cfg(not(feature = "re-dphobos-dmd-ct"))]
 #[cfg(not(feature = "re-dphobos-ldc-ct"))]
 bench_match!(no_exponential, {
@@ -45,6 +46,9 @@ bench_match!(match_class_in_range, "[ac]", {
 });
 
 #[cfg(not(feature = "re-rust-bytes"))]
+// std C++ does not support unicode character classes
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 bench_match!(match_class_unicode, r"\p{L}", {
     format!("{}a", repeat("☃5☃5").take(20).collect::<String>())
diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs
index 760903d09d..c54c9b772e 100644
--- a/bench/src/sherlock.rs
+++ b/bench/src/sherlock.rs
@@ -35,8 +35,14 @@ sherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91);
 // Like the above, except case insensitively. The prefix detector will extract
 // multiple *cut* prefix literals for each of the following before hitting its
 // limit. All of these should be able to use either memchr2 or memchr3.
+// std::regex does not support inline modifier syntax such as `(?i)`
+#[cfg(not(feature = "re-stdcpp"))]
 sherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102);
+// std::regex does not support inline modifier syntax such as `(?i)`
+#[cfg(not(feature = "re-stdcpp"))]
 sherlock!(name_holmes_nocase, r"(?i)Holmes", 467);
+// std::regex does not support inline modifier syntax such as `(?i)`
+#[cfg(not(feature = "re-stdcpp"))]
 sherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96);
 
 // Will quickly find instances of 'Sherlock', but then needs to fall back to
@@ -55,6 +61,8 @@ sherlock!(name_alt2, r"Sherlock|Holmes", 558);
 // also can't use any memchr variant.
 sherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740);
 // Still using Aho-Corasick, but needs the lazy DFA.
+// std::regex does not support inline modifier syntax such as `(?i)`
+#[cfg(not(feature = "re-stdcpp"))]
 sherlock!(
     name_alt3_nocase,
     r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker",
@@ -62,9 +70,13 @@ sherlock!(
 // Should still use Aho-Corasick for the prefixes in each alternate, but
 // we need to use the lazy DFA to complete it.
 sherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582);
+// std::regex does not support inline modifier syntax such as `(?i)`
+#[cfg(not(feature = "re-stdcpp"))]
 sherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697);
 // Uses Aho-Corasick, but can use memchr3 (unlike name_alt3).
 sherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639);
+// std::regex does not support inline modifier syntax such as `(?i)`
+#[cfg(not(feature = "re-stdcpp"))]
 sherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650);
 
 // How long does it take to discover that there's no match? In the first two
@@ -80,6 +92,8 @@ sherlock!(no_match_really_common, r"aei", 0);
 // matching engines.)
 sherlock!(the_lower, r"the", 7218);
 sherlock!(the_upper, r"The", 741);
+// std::regex does not support inline modifier syntax such as `(?i)`
+#[cfg(not(feature = "re-stdcpp"))]
 sherlock!(the_nocase, r"(?i)the", 7987);
 
 // Process whitespace after a very common word.
@@ -91,30 +105,55 @@ sherlock!(the_whitespace, r"the\s+\w+", 5410);
 #[cfg(not(feature = "re-dphobos"))]
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(everything_greedy, r".*", 13053);
+// In std::regex `.` does not match \r, so use an explicit "not \n" class
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+    ))]
+sherlock!(everything_greedy, r"[^\n]*", 13053);
 #[cfg(not(feature = "re-dphobos"))]
 #[cfg(not(feature = "re-onig"))]
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
+// std::regex does not support inline modifier syntax such as `(?s)`
+#[cfg(not(feature = "re-stdcpp"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(everything_greedy_nl, r"(?s).*", 1);
 
 // How fast can we match every letter? This also defeats any clever prefix
 // tricks.
+// std C++ does not support unicode character classes such as `\p{L}`
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters, r"\p{L}", 447160);
 
+// std C++ does not support unicode character classes such as `\p{Lu}`
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters_upper, r"\p{Lu}", 14180);
 
+// std C++ does not support unicode character classes such as `\p{Ll}`
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters_lower, r"\p{Ll}", 432980);
 
 // Similarly, for words.
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-re2"))]
 sherlock!(words, r"\w+", 109214);
-#[cfg(feature = "re-re2")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+    feature = "re-re2",
+    ))]
 sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
 
 // Find complete words before Holmes. The `\w` defeats any prefix
@@ -136,6 +175,8 @@ sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
 #[cfg(not(feature = "re-onig"))]
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(
     holmes_coword_watson,
@@ -150,13 +191,19 @@ sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
 // Finds all occurrences of Sherlock Holmes at the beginning or end of a line.
 // The empty assertions defeat any detection of prefix literals, so it's the
 // lazy DFA the entire way.
+// std C++ lacks multiline (until C++17) and inline modifiers; re-boost uses the variant below
+#[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-dphobos"))]
 sherlock!(
     line_boundary_sherlock_holmes,
     r"(?m)^Sherlock Holmes|Sherlock Holmes$",
     34);
 // D matches both \r\n and \n as EOL
-#[cfg(feature = "re-dphobos")]
+#[cfg(any(
+    feature = "re-boost",
+    feature = "re-dphobos",
+    ))]
 sherlock!(
     line_boundary_sherlock_holmes,
     r"(?m)^Sherlock Holmes|Sherlock Holmes$",