Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Added Regex::participating_captures_len.
  • Loading branch information
orlp committed Sep 24, 2022
commit 7874e124c4b66fa6d597953f166b9b39d05043d6
54 changes: 53 additions & 1 deletion regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ impl Hir {
info.set_match_empty(true);
info.set_literal(false);
info.set_alternation_literal(false);
info.static_capture_count = Some(0);
Hir { kind: HirKind::Empty, info }
}

Expand All @@ -268,6 +269,7 @@ impl Hir {
info.set_match_empty(false);
info.set_literal(true);
info.set_alternation_literal(true);
info.static_capture_count = Some(0);
Hir { kind: HirKind::Literal(lit), info }
}

Expand All @@ -285,6 +287,7 @@ impl Hir {
info.set_match_empty(false);
info.set_literal(false);
info.set_alternation_literal(false);
info.static_capture_count = Some(0);
Hir { kind: HirKind::Class(class), info }
}

Expand Down Expand Up @@ -318,6 +321,7 @@ impl Hir {
if let Anchor::EndLine = anchor {
info.set_line_anchored_end(true);
}
info.static_capture_count = Some(0);
Hir { kind: HirKind::Anchor(anchor), info }
}

Expand Down Expand Up @@ -345,6 +349,7 @@ impl Hir {
if let WordBoundary::AsciiNegate = word_boundary {
info.set_always_utf8(false);
}
info.static_capture_count = Some(0);
Hir { kind: HirKind::WordBoundary(word_boundary), info }
}

Expand Down Expand Up @@ -372,6 +377,28 @@ impl Hir {
info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
info.set_literal(false);
info.set_alternation_literal(false);
info.static_capture_count = {
use RepetitionRange::{AtLeast, Bounded, Exactly};
match (&rep.kind, rep.hir.info.static_capture_count) {
// Always zero.
(_, Some(0)) => Some(0),
(RepetitionKind::Range(Exactly(0)), _) => Some(0),
(RepetitionKind::Range(Bounded(_, 0)), _) => Some(0),

// Impossible to know statically.
(_, None) => None,
(RepetitionKind::ZeroOrOne, _) => None,
(RepetitionKind::ZeroOrMore, _) => None,
(RepetitionKind::Range(AtLeast(0)), _) => None,
(RepetitionKind::Range(Bounded(0, _)), _) => None,

// Guaranteed to be static.
(RepetitionKind::OneOrMore, Some(n)) => Some(n),
(RepetitionKind::Range(Exactly(_)), Some(n)) => Some(n),
(RepetitionKind::Range(AtLeast(_)), Some(n)) => Some(n),
(RepetitionKind::Range(Bounded(_, _)), Some(n)) => Some(n),
}
};
Hir { kind: HirKind::Repetition(rep), info }
}

Expand All @@ -389,6 +416,12 @@ impl Hir {
info.set_match_empty(group.hir.is_match_empty());
info.set_literal(false);
info.set_alternation_literal(false);
info.static_capture_count = match group.kind {
GroupKind::NonCapturing { .. } => {
group.hir.info.static_capture_count
}
_ => group.hir.info.static_capture_count.map(|n| n + 1),
};
Hir { kind: HirKind::Group(group), info }
}

Expand Down Expand Up @@ -480,6 +513,10 @@ impl Hir {
})
.any(|e| e.is_line_anchored_end()),
);
info.static_capture_count =
exprs.iter().fold(Some(0), |cnt, e| {
Some(cnt? + e.info.static_capture_count?)
});
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be in the loop above. Otherwise this iterates over the entire concatenation a second time for no good reason as far as I can tell.

Hir { kind: HirKind::Concat(exprs), info }
}
}
Expand Down Expand Up @@ -542,6 +579,9 @@ impl Hir {
let x = info.is_alternation_literal() && e.is_literal();
info.set_alternation_literal(x);
}
let mut capture_counts = exprs.iter().map(|e| e.info.static_capture_count);
let first = capture_counts.next().unwrap_or(Some(0));
info.static_capture_count = capture_counts.fold(first, |a, b| if a == b { a } else { None });
Hir { kind: HirKind::Alternation(exprs), info }
}
}
Expand Down Expand Up @@ -692,6 +732,13 @@ impl Hir {
pub fn is_alternation_literal(&self) -> bool {
self.info.is_alternation_literal()
}

/// Returns the number of captures groups that would participate in a
/// successful match of this expression. If this number can not be
/// statically determined from the regex this function returns `None`.
pub fn participating_captures_len(&self) -> Option<usize> {
self.info.static_capture_count.map(|c| c as usize)
}
}

impl HirKind {
Expand Down Expand Up @@ -1484,6 +1531,11 @@ struct HirInfo {
/// If more attributes need to be added, it is OK to increase the size of
/// this as appropriate.
bools: u16,

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This blank line can be dropped.

/// How many capture groups this HIR expression deterministically fills.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think the word here should be "static" and not "deterministic."

/// If this number could depend on the input (e.g. an Alternation where the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Replace "input" with "haystack" here please. (I'm trying to be better about always using the word "haystack" when referring to the text being searched.)

/// two branches have a different number of capture groups), this is None.
static_capture_count: Option<u32>,
}

// A simple macro for defining bitfield accessors/mutators.
Expand All @@ -1505,7 +1557,7 @@ macro_rules! define_bool {

impl HirInfo {
fn new() -> HirInfo {
HirInfo { bools: 0 }
HirInfo { bools: 0, static_capture_count: None }
}

define_bool!(0, is_always_utf8, set_always_utf8);
Expand Down
1 change: 1 addition & 0 deletions src/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ impl Compiler {
self.compiled.start = dotstar_patch.entry;
}
self.compiled.captures = vec![None];
self.compiled.participating_captures_len = expr.participating_captures_len();
let patch =
self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
if self.compiled.needs_dotstar() {
Expand Down
9 changes: 8 additions & 1 deletion src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ struct ExecReadOnly {
/// A compiled program that is used in the NFA simulation and backtracking.
/// It can be byte-based or Unicode codepoint based.
///
/// N.B. It is not possibly to make this byte-based from the public API.
/// N.B. It is not possible to make this byte-based from the public API.
/// It is only used for testing byte based programs in the NFA simulations.
nfa: Program,
/// A compiled byte based program for DFA execution. This is only used
Expand Down Expand Up @@ -1311,6 +1311,13 @@ impl Exec {
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}

/// Returns the number of participating captures that this regex will
/// return on a successful match. If this number can not be statically
/// determined from the regex this function returns `None`.
pub fn participating_captures_len(&self) -> Option<usize> {
self.ro.nfa.participating_captures_len
}
}

impl Clone for Exec {
Expand Down
4 changes: 4 additions & 0 deletions src/prog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ pub struct Program {
/// The ordered sequence of all capture groups extracted from the AST.
/// Unnamed groups are `None`.
pub captures: Vec<Option<String>>,
/// The number of capture groups that participate in a successful match.
/// None if this can't be determined statically at compile time.
pub participating_captures_len: Option<usize>,
/// Pointers to all named capture groups into `captures`.
pub capture_name_idx: Arc<HashMap<String, usize>>,
/// A pointer to the start instruction. This can vary depending on how
Expand Down Expand Up @@ -82,6 +85,7 @@ impl Program {
insts: vec![],
matches: vec![],
captures: vec![],
participating_captures_len: None,
capture_name_idx: Arc::new(HashMap::new()),
start: 0,
byte_classes: vec![0; 256],
Expand Down
7 changes: 7 additions & 0 deletions src/re_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,13 @@ impl Regex {
pub fn captures_len(&self) -> usize {
self.0.capture_names().len()
}

/// Returns the number of participating captures that this regex will
/// return on a successful match. If this number can not be statically
/// determined from the regex this function returns `None`.
pub fn participating_captures_len(&self) -> Option<usize> {
self.0.participating_captures_len()
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you back out the public API changes to regex please? I'm fine adding this to regex-syntax, but I'm less sure about it being in the regex API.

/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
Expand Down
7 changes: 7 additions & 0 deletions src/re_unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,13 @@ impl Regex {
pub fn captures_len(&self) -> usize {
self.0.capture_names().len()
}

/// Returns the number of participating captures that this regex will
/// return on a successful match. If this number can not be statically
/// determined from the regex this function returns `None`.
pub fn participating_captures_len(&self) -> Option<usize> {
self.0.participating_captures_len()
}

/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
Expand Down
38 changes: 38 additions & 0 deletions tests/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,44 @@ fn capture_index_lifetime() {
assert_eq!(3, inner("123"));
}

#[test]
fn participating_captures_len() {
let tests = [
("", Some(0)),
("foo|bar", Some(0)),
("(foo)|bar", None),
("foo|(bar)", None),
("(foo|bar)", Some(1)),
("(a|b|c|d|e|f)", Some(1)),
("(a)|(b)|(c)|(d)|(e)|(f)", Some(1)),
("(a)(b)|(c)(d)|(e)(f)", Some(2)),
("(a)(b)(c)|(d)(e)(f)", Some(3)),
("(a)(b)(c)(d)(e)(f)", Some(6)),
("(a)(b)(extra)|(a)(b)()", Some(3)),
("(a)(b)((?:extra)?)", Some(3)),
("(a)(b)(extra)?", None),
("(foo)|(bar)", Some(1)),
("(foo)(bar)", Some(2)),
("(foo)+(bar)", Some(2)),
("(foo)*(bar)", None),
("(foo)?{0}", Some(0)),
("(foo)?{1}", None),
("(foo){1}", Some(1)),
("(foo){1,}", Some(1)),
("(foo){1,}?", Some(1)),
("(foo){0,}", None),
("(foo)(?:bar)", Some(1)),
("(foo(?:bar)+)(?:baz(boo))", Some(2)),
("(?P<bar>foo)(?:bar)(bal|loon)", Some(2)),
(r"(?:(\w)(\s))?", None),
(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#, Some(2)),
];
for (test_regex, expected) in tests {
let re = regex!(test_regex);
assert_eq!(re.participating_captures_len(), expected, "for regex {test_regex}");
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you move these tests to regex-syntax/src/hir/translate.rs? That's where all of the existing "analysis" tests are. It also pushes them closer to where they are implemented.


#[test]
fn capture_misc() {
let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)");
Expand Down