Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add support for C++ std::regex to benchmarks
* bench/Cargo.toml: add `re-stdcpp` feature
* bench/build.rs: add `cstdcpp` library to bench build
* bench/compile: add `re-stdcpp` feature to bench compile script
* bench/run: add `re-stdcpp` feature to bench run script
* bench/src/bench.rs: use `ffi::stdcpp::Regex`, define its `text!` macro, and `Text` type
* bench/src/ffi/mod.rs: declare `stdcpp` module
* bench/src/ffi/stdcpp.cpp: implement C API using C++ `std::regex`
* bench/src/ffi/stdcpp.rs: Rust `Regex` API implementation using C++ `std::regex` C API wrapper
* bench/src/main.rs: add stdcpp to bench main
* bench/src/misc.rs:
   - do not run `no_exponential` benchmark for `re-stdcpp` feature because `libstdc++` `std::regex` implementation currently seems to have exponential behavior here
   - do not run `match_class_unicode` benchmark for `re-stdcpp` feature because `std::regex` ECMAScript grammar does not support unicode character classes
* bench/src/sherlock.rs:
   - do not run `name_sherlock_nocase`, `name_holmes_nocase`, `name_sherlock_holmes_nocase`, `name_alt3_nocase`, `name_alt4_nocase`, `name_alt5_nocase`, `the_nocase`, `everything_greedy_nl`, and `line_boundary_sherlock_holmes` benchmarks for `re-stdcpp` feature because `std::regex` ECMAScript grammar does not support inline modifier syntax
   - do not run `letters`, `letters_upper`, and `letters_lower` benchmarks for `re-stdcpp` feature because `std::regex` ECMAScript grammar does not support unicode character classes
   - use a different regex for `everything_greedy` benchmark because `std::regex` '.' does not match '\r'
   - `words` benchmark for `std::regex` matches RE2 test result, so use that test for `re-stdcpp` feature as well
   - do not run `holmes_coword_watson` benchmark for `re-stdcpp` feature because `libstdc++` `std::regex` implementation currently seems to have exponential behavior here
  • Loading branch information
mkrupcale committed Mar 15, 2018
commit abc1ce29746d2cfb3a68ab1f49add191eb6f1ce0
1 change: 1 addition & 0 deletions bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ bench = false
re-pcre1 = ["libpcre-sys"]
re-pcre2 = []
re-onig = ["onig"]
re-stdcpp = []
re-re2 = []
re-dphobos = []
re-dphobos-dmd = ["re-dphobos"]
Expand Down
7 changes: 7 additions & 0 deletions bench/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ fn main() {
if env::var("CARGO_FEATURE_RE_PCRE2").is_ok() {
pkg_config::probe_library("libpcre2-8").unwrap();
}
if env::var("CARGO_FEATURE_RE_STDCPP").is_ok() {
// std::regex is part of the C++ standard library, so we need to compile our C++ shim layer.
cc::Build::new()
.cpp(true)
.file("src/ffi/stdcpp.cpp")
.compile("libcstdcpp.a");
}
if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
// RE2 is a C++ library, so we need to compile our shim layer.
cc::Build::new()
Expand Down
2 changes: 1 addition & 1 deletion bench/compile
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

exec cargo build \
--release \
--features 're-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
--features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
"$@"
5 changes: 4 additions & 1 deletion bench/run
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

usage() {
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | re2 | onig | tcl ]" >&2
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | re2 | onig | tcl ]" >&2
exit 1
}

Expand Down Expand Up @@ -30,6 +30,9 @@ case $which in
rust-bytes)
exec cargo bench --bench bench --features re-rust-bytes "$@"
;;
stdcpp)
exec cargo bench --bench bench --features re-stdcpp "$@"
;;
re2)
exec cargo bench --bench bench --features re-re2 "$@"
;;
Expand Down
4 changes: 4 additions & 0 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ pub use ffi::onig::Regex;
pub use ffi::pcre1::Regex;
#[cfg(feature = "re-pcre2")]
pub use ffi::pcre2::Regex;
#[cfg(feature = "re-stdcpp")]
pub use ffi::stdcpp::Regex;
#[cfg(feature = "re-re2")]
pub use ffi::re2::Regex;
#[cfg(feature = "re-dphobos")]
Expand Down Expand Up @@ -90,6 +92,7 @@ macro_rules! text {
feature = "re-onig",
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand All @@ -107,6 +110,7 @@ type Text = Vec<u8>;
feature = "re-onig",
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand Down
2 changes: 2 additions & 0 deletions bench/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ pub mod onig;
pub mod pcre1;
#[cfg(feature = "re-pcre2")]
pub mod pcre2;
#[cfg(feature = "re-stdcpp")]
pub mod stdcpp;
#[cfg(feature = "re-re2")]
pub mod re2;
#[cfg(feature = "re-tcl")]
Expand Down
42 changes: 42 additions & 0 deletions bench/src/ffi/stdcpp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include <regex>

extern "C" {
    // Opaque handle passed across the C ABI. Every non-null handle produced by
    // stdcpp_regexp_new points at a heap-allocated std::regex.
    typedef void stdcpp_regexp;

    // A borrowed byte range: `len` bytes starting at `text`. The constructor
    // below is given the length explicitly, so no NUL terminator is required.
    typedef struct stdcpp_string {
        const char *text;
        int len;
    } stdcpp_string;

    // Compiles `pat` (default grammar, with the `optimize` hint) and returns an
    // owning handle that must be released with stdcpp_regexp_free.
    //
    // Returns NULL if construction throws (e.g. std::regex_error for an invalid
    // pattern, or std::bad_alloc). Exceptions must not propagate out of a C ABI
    // function into the non-C++ caller, so they are converted to NULL here.
    stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
        try {
            return static_cast<stdcpp_regexp*>(
                new std::regex(pat.text, pat.len, std::regex::optimize));
        } catch (...) {
            return nullptr;
        }
    }

    // Destroys a handle returned by stdcpp_regexp_new. Passing NULL is a no-op
    // (deleting a null pointer is well-defined).
    void stdcpp_regexp_free(stdcpp_regexp *re) {
        delete static_cast<std::regex*>(re);
    }

    // Returns true if the regex matches anywhere inside
    // [text.text + startpos, text.text + endpos). A NULL handle never matches.
    //
    // The compiled regex is used through a const reference; it is not copied on
    // each call. The function is noexcept so that a throwing search terminates
    // the process instead of unwinding through the foreign caller.
    bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
                             int startpos, int endpos) noexcept {
        if (re == nullptr) {
            return false;
        }
        const std::regex &cpp_re = *static_cast<const std::regex*>(re);
        return std::regex_search(text.text + startpos, text.text + endpos,
                                 cpp_re);
    }

    // Searches [text.text + startpos, text.text + endpos) for the first match.
    // On success returns true and stores the match bounds in *match_start and
    // *match_end as offsets from text.text (not from startpos). On failure, or
    // for a NULL handle, returns false and leaves both outputs untouched.
    bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
                            int startpos, int endpos,
                            int *match_start, int *match_end) noexcept {
        if (re == nullptr) {
            return false;
        }
        const std::regex &cpp_re = *static_cast<const std::regex*>(re);
        std::cmatch result;
        const bool matched = std::regex_search(text.text + startpos,
                                               text.text + endpos,
                                               result, cpp_re);
        if (matched) {
            // result[0].first points into the haystack, so subtracting the
            // haystack base yields an absolute offset.
            *match_start = static_cast<int>(result[0].first - text.text);
            *match_end = *match_start + static_cast<int>(result.length(0));
        }
        return matched;
    }
}
163 changes: 163 additions & 0 deletions bench/src/ffi/stdcpp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![allow(non_camel_case_types)]

use libc::{c_uchar, c_int, c_void};

/// Regex wraps a std::regex regular expression.
///
/// The only field is the raw handle obtained from `stdcpp_regexp_new`; the
/// `Drop` impl for this type passes it to `stdcpp_regexp_free`.
///
/// It cannot be used safely from multiple threads simultaneously.
pub struct Regex {
// Opaque pointer to the C++ object behind the shim's C API.
re: *mut stdcpp_regexp,
}

// Raw pointers are not `Send` by default, so this opts `Regex` in manually to
// allow moving it to another thread.
// NOTE(review): this assumes the underlying std::regex object has no thread
// affinity -- confirm against the C++ standard library in use.
unsafe impl Send for Regex {}

impl Drop for Regex {
    /// Hands the wrapped handle back to the C++ shim so it can be destroyed.
    fn drop(&mut self) {
        let handle = self.re;
        // SAFETY: `handle` is the pointer stored in this `Regex`, and `drop`
        // runs at most once per value, so the shim's free function sees each
        // handle a single time.
        unsafe {
            stdcpp_regexp_free(handle);
        }
    }
}

/// Opaque error type returned by `Regex::new`.
///
/// It wraps a private unit value, so it carries no detail and cannot be
/// constructed outside this module.
#[derive(Debug)]
pub struct Error(());

impl Regex {
pub fn new(pattern: &str) -> Result<Regex, Error> {
unsafe { Ok(Regex { re: stdcpp_regexp_new(pattern.into()) }) }
}

pub fn is_match(&self, text: &str) -> bool {
unsafe {
stdcpp_regexp_match(self.re, text.into(), 0, text.len() as c_int)
}
}

pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
FindMatches {
re: self,
text: text,
last_end: 0,
last_match: None,
}
}

fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
let (mut s, mut e): (c_int, c_int) = (0, 0);
let matched = unsafe {
stdcpp_regexp_find(
self.re,
text.into(),
start as c_int,
text.len() as c_int,
&mut s,
&mut e,
)
};
if matched {
Some((s as usize, e as usize))
} else {
None
}
}
}

/// Iterator state for `Regex::find_iter`, yielding `(start, end)` byte offsets.
pub struct FindMatches<'r, 't> {
// The regex being searched with.
re: &'r Regex,
// The haystack.
text: &'t str,
// Offset at which the next search starts.
last_end: usize,
// Bookkeeping used to reject an empty match directly after a previous match.
last_match: Option<usize>,
}

// This implementation follows the iteration scheme Rust's regex crate uses:
// an empty match is never reported at the position where the previously
// reported match ended, and the search always moves forward by at least one
// encoded character after an empty match.
impl<'r, 't> Iterator for FindMatches<'r, 't> {
    type Item = (usize, usize);

    /// Returns the next match as `(start, end)` byte offsets, or `None` once
    /// the haystack is exhausted or no further match is found.
    fn next(&mut self) -> Option<(usize, usize)> {
        /// Returns the offset just past the character starting at byte `i`,
        /// using the UTF-8 lead byte to decide how many bytes to step over.
        /// When `i` is at or beyond the end of `text`, returns
        /// `text.len() + 1` so that the caller's bounds check ends iteration.
        fn next_after_empty(text: &str, i: usize) -> usize {
            let b = match text.as_bytes().get(i) {
                None => return text.len() + 1,
                Some(&b) => b,
            };
            let inc = if b <= 0x7F {
                1
            } else if b <= 0b110_11111 {
                2
            } else if b <= 0b1110_1111 {
                3
            } else {
                4
            };
            i + inc
        }

        loop {
            if self.last_end > self.text.len() {
                return None;
            }
            let (s, e) = match self.re.find_at(self.text, self.last_end) {
                None => return None,
                Some(span) => span,
            };
            assert!(s >= self.last_end);
            if s == e {
                // This is an empty match. To ensure we make progress, start
                // the next search at the smallest possible starting position
                // of the next match following this one.
                self.last_end = next_after_empty(self.text, e);
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(e) == self.last_match {
                    continue;
                }
            } else {
                self.last_end = e;
            }
            // Record where the reported match ended. Using `last_end` here
            // would be wrong after an empty match: it has already advanced
            // past `e`, which would cause the next adjacent empty match to be
            // dropped.
            self.last_match = Some(e);
            return Some((s, e));
        }
    }
}

// stdcpp FFI is below. Note that this uses a hand-rolled C API that is defined
// in stdcpp.cpp.

// Opaque handle type; mirrors `typedef void stdcpp_regexp;` on the C++ side.
type stdcpp_regexp = c_void;

/// Pointer/length pair passed by value to the shim.
///
/// `repr(C)` with this field order must stay in step with the
/// `stdcpp_string` struct declared in stdcpp.cpp (a `const char *` followed
/// by an `int`).
#[repr(C)]
struct stdcpp_string {
// Start of the bytes; borrowed, not owned.
text: *const c_uchar,
// Number of bytes at `text`.
len: c_int,
}

impl<'a> From<&'a str> for stdcpp_string {
    /// Builds a borrowed pointer/length view of `s`; no bytes are copied, so
    /// the result must not be used after `s` goes away.
    fn from(s: &'a str) -> stdcpp_string {
        let len = s.len() as c_int;
        let text = s.as_ptr();
        stdcpp_string { text: text, len: len }
    }
}

// Declarations of the C API implemented in stdcpp.cpp.
extern {
// Compiles `pat` and returns an owning handle.
fn stdcpp_regexp_new(pat: stdcpp_string) -> *mut stdcpp_regexp;
// Destroys a handle previously returned by `stdcpp_regexp_new`.
fn stdcpp_regexp_free(re: *mut stdcpp_regexp);
// Reports whether the regex matches anywhere within the byte range
// `[startpos, endpos)` of `text`.
fn stdcpp_regexp_match(
re: *mut stdcpp_regexp,
text: stdcpp_string,
startpos: c_int,
endpos: c_int,
) -> bool;
// Like `stdcpp_regexp_match`, but on success also writes the match bounds
// (offsets from the start of `text`) through `match_start` / `match_end`.
fn stdcpp_regexp_find(
re: *mut stdcpp_regexp,
text: stdcpp_string,
startpos: c_int,
endpos: c_int,
match_start: *mut c_int,
match_end: *mut c_int,
) -> bool;
}
12 changes: 11 additions & 1 deletion bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Since this tool includes compilation of the <pattern>, sufficiently large
haystacks should be used to amortize the cost of compilation. (e.g., >1MB.)

Usage:
regex-run-one [options] [onig | pcre1 | pcre2 | re2 | rust | rust-bytes | tcl] <file> <pattern>
regex-run-one [options] [onig | pcre1 | pcre2 | stdcpp | re2 | rust | rust-bytes | tcl] <file> <pattern>
regex-run-one [options] (-h | --help)

Options:
Expand All @@ -59,6 +59,7 @@ struct Args {
cmd_onig: bool,
cmd_pcre1: bool,
cmd_pcre2: bool,
cmd_stdcpp: bool,
cmd_re2: bool,
cmd_rust: bool,
cmd_rust_bytes: bool,
Expand Down Expand Up @@ -87,6 +88,8 @@ impl Args {
count_pcre1(pat, haystack)
} else if self.cmd_pcre2 {
count_pcre2(pat, haystack)
} else if self.cmd_stdcpp {
count_stdcpp(pat, haystack)
} else if self.cmd_re2 {
count_re2(pat, haystack)
} else if self.cmd_rust {
Expand Down Expand Up @@ -132,6 +135,13 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
Regex::new(pat).unwrap().find_iter(haystack).count()
}

nada!("re-stdcpp", count_stdcpp);
/// Counts the matches of `pat` in `haystack` using the std::regex backend.
///
/// Panics if `Regex::new` returns an error.
#[cfg(feature = "re-stdcpp")]
fn count_stdcpp(pat: &str, haystack: &str) -> usize {
    use ffi::stdcpp::Regex;

    let re = Regex::new(pat).unwrap();
    let matches = re.find_iter(haystack);
    matches.count()
}

nada!("re-re2", count_re2);
#[cfg(feature = "re-re2")]
fn count_re2(pat: &str, haystack: &str) -> usize {
Expand Down
2 changes: 2 additions & 0 deletions bench/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use {Regex, Text};
#[cfg(not(feature = "re-onig"))]
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-dphobos-dmd-ct"))]
#[cfg(not(feature = "re-dphobos-ldc-ct"))]
bench_match!(no_exponential, {
Expand All @@ -45,6 +46,7 @@ bench_match!(match_class_in_range, "[ac]", {
});

#[cfg(not(feature = "re-rust-bytes"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-tcl"))]
bench_match!(match_class_unicode, r"\p{L}", {
format!("{}a", repeat("☃5☃5").take(20).collect::<String>())
Expand Down
Loading