@@ -1833,6 +1833,7 @@ struct PropertiesI {
18331833 look_set_suffix : LookSet ,
18341834 utf8 : bool ,
18351835 captures_len : usize ,
1836+ static_captures_len : Option < usize > ,
18361837 literal : bool ,
18371838 alternation_literal : bool ,
18381839}
@@ -1990,6 +1991,44 @@ impl Properties {
19901991 self . 0 . captures_len
19911992 }
19921993
1994+ /// Returns the total number of explicit capturing groups that appear in
1995+ /// every possible match.
1996+ ///
1997+ /// If the number of capture groups can vary depending on the match, then
1998+ /// this returns `None`. That is, a value is only returned when the number
1999+ /// of matching groups is invariant or "static."
2000+ ///
2001+ /// Note that this does not include the implicit capturing group
2002+ /// corresponding to the entire match.
2003+ ///
2004+ /// # Example
2005+ ///
2006+ /// This shows a few cases where a static number of capture groups is
2007+ /// available and a few cases where it is not.
2008+ ///
2009+ /// ```
2010+ /// use regex_syntax::parse;
2011+ ///
2012+ /// let len = |pattern| {
2013+ /// parse(pattern).map(|h| h.properties().static_captures_len())
2014+ /// };
2015+ ///
2016+ /// assert_eq!(Some(0), len("a")?);
2017+ /// assert_eq!(Some(1), len("(a)")?);
2018+ /// assert_eq!(Some(1), len("(a)|(b)")?);
2019+ /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2020+ /// assert_eq!(None, len("(a)|b")?);
2021+ /// assert_eq!(None, len("a|(b)")?);
2022+ /// assert_eq!(None, len("(b)*")?);
2023+ /// assert_eq!(Some(1), len("(b)+")?);
2024+ ///
2025+ /// # Ok::<(), Box<dyn std::error::Error>>(())
2026+ /// ```
2027+ #[ inline]
2028+ pub fn static_captures_len ( & self ) -> Option < usize > {
2029+ self . 0 . static_captures_len
2030+ }
2031+
19932032 /// Return true if and only if this HIR is a simple literal. This is
19942033 /// only true when this HIR expression is either itself a `Literal` or a
19952034 /// concatenation of only `Literal`s.
@@ -2100,6 +2139,13 @@ impl Properties {
21002139 } else {
21012140 LookSet :: full ( )
21022141 } ;
2142+ // And also, an empty alternate means we have 0 static capture groups,
2143+ // but we otherwise start with the number corresponding to the first
2144+ // alternate. If any subsequent alternate has a different number of
2145+ // static capture groups, then we overall have a variation and not a
2146+ // static number of groups.
2147+ let static_captures_len =
2148+ it. peek ( ) . and_then ( |p| p. borrow ( ) . static_captures_len ( ) ) ;
21032149 // The base case is an empty alternation, which matches nothing.
21042150 // Note though that empty alternations aren't possible, because the
21052151 // Hir::alternation smart constructor rewrites those as empty character
@@ -2112,6 +2158,7 @@ impl Properties {
21122158 look_set_suffix : fix,
21132159 utf8 : true ,
21142160 captures_len : 0 ,
2161+ static_captures_len,
21152162 literal : false ,
21162163 alternation_literal : true ,
21172164 } ;
@@ -2125,6 +2172,9 @@ impl Properties {
21252172 props. utf8 = props. utf8 && p. is_utf8 ( ) ;
21262173 props. captures_len =
21272174 props. captures_len . saturating_add ( p. captures_len ( ) ) ;
2175+ if props. static_captures_len != p. static_captures_len ( ) {
2176+ props. static_captures_len = None ;
2177+ }
21282178 props. alternation_literal =
21292179 props. alternation_literal && p. is_alternation_literal ( ) ;
21302180 if !min_poisoned {
@@ -2180,6 +2230,7 @@ impl Properties {
21802230 // since it too can match the empty string.
21812231 utf8 : true ,
21822232 captures_len : 0 ,
2233+ static_captures_len : Some ( 0 ) ,
21832234 literal : false ,
21842235 alternation_literal : false ,
21852236 } ;
@@ -2196,6 +2247,7 @@ impl Properties {
21962247 look_set_suffix : LookSet :: empty ( ) ,
21972248 utf8 : core:: str:: from_utf8 ( & lit. 0 ) . is_ok ( ) ,
21982249 captures_len : 0 ,
2250+ static_captures_len : Some ( 0 ) ,
21992251 literal : true ,
22002252 alternation_literal : true ,
22012253 } ;
@@ -2212,6 +2264,7 @@ impl Properties {
22122264 look_set_suffix : LookSet :: empty ( ) ,
22132265 utf8 : class. is_utf8 ( ) ,
22142266 captures_len : 0 ,
2267+ static_captures_len : Some ( 0 ) ,
22152268 literal : false ,
22162269 alternation_literal : false ,
22172270 } ;
@@ -2241,6 +2294,7 @@ impl Properties {
22412294 // property borderline useless.
22422295 utf8 : true ,
22432296 captures_len : 0 ,
2297+ static_captures_len : Some ( 0 ) ,
22442298 literal : false ,
22452299 alternation_literal : false ,
22462300 } ;
@@ -2268,6 +2322,7 @@ impl Properties {
22682322 look_set_suffix : LookSet :: empty ( ) ,
22692323 utf8 : p. is_utf8 ( ) ,
22702324 captures_len : p. captures_len ( ) ,
2325+ static_captures_len : p. static_captures_len ( ) ,
22712326 literal : false ,
22722327 alternation_literal : false ,
22732328 } ;
@@ -2278,6 +2333,23 @@ impl Properties {
22782333 inner. look_set_prefix = p. look_set_prefix ( ) ;
22792334 inner. look_set_suffix = p. look_set_suffix ( ) ;
22802335 }
2336+ // If the static captures len of the sub-expression is not known or is
2337+ // zero, then it automatically propagates to the repetition, regardless
2338+ // of the repetition. Otherwise, it might change, but only when the
2339+ // repetition can match 0 times.
2340+ if rep. min == 0
2341+ && inner. static_captures_len . map_or ( false , |len| len > 0 )
2342+ {
2343+ // If we require a match 0 times, then our captures len is
2344+ // guaranteed to be zero. Otherwise, if we *can* match the empty
2345+ // string, then it's impossible to know how many captures will be
2346+ // in the resulting match.
2347+ if rep. max == Some ( 0 ) {
2348+ inner. static_captures_len = Some ( 0 ) ;
2349+ } else {
2350+ inner. static_captures_len = None ;
2351+ }
2352+ }
22812353 Properties ( Box :: new ( inner) )
22822354 }
22832355
@@ -2286,6 +2358,9 @@ impl Properties {
22862358 let p = capture. sub . properties ( ) ;
22872359 Properties ( Box :: new ( PropertiesI {
22882360 captures_len : p. captures_len ( ) . saturating_add ( 1 ) ,
2361+ static_captures_len : p
2362+ . static_captures_len ( )
2363+ . map ( |len| len. saturating_add ( 1 ) ) ,
22892364 literal : false ,
22902365 alternation_literal : false ,
22912366 ..* p. 0 . clone ( )
@@ -2306,6 +2381,7 @@ impl Properties {
23062381 look_set_suffix : LookSet :: empty ( ) ,
23072382 utf8 : true ,
23082383 captures_len : 0 ,
2384+ static_captures_len : Some ( 0 ) ,
23092385 literal : true ,
23102386 alternation_literal : true ,
23112387 } ;
@@ -2316,6 +2392,10 @@ impl Properties {
23162392 props. utf8 = props. utf8 && p. is_utf8 ( ) ;
23172393 props. captures_len =
23182394 props. captures_len . saturating_add ( p. captures_len ( ) ) ;
2395+ props. static_captures_len = p
2396+ . static_captures_len ( )
2397+ . and_then ( |len1| Some ( ( len1, props. static_captures_len ?) ) )
2398+ . and_then ( |( len1, len2) | Some ( len1. saturating_add ( len2) ) ) ;
23192399 props. literal = props. literal && p. is_literal ( ) ;
23202400 props. alternation_literal =
23212401 props. alternation_literal && p. is_alternation_literal ( ) ;
0 commit comments