diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index aa8e9ce953cdf1..57ab539a01337f 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -825,6 +825,7 @@ void TransferSliceStaticPosToPos() void EmitAlternation(RegexNode node) { Debug.Assert(node.Type is RegexNode.Alternate, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}"); int childCount = node.ChildCount(); Debug.Assert(childCount >= 2); @@ -1203,6 +1204,7 @@ void EmitWhenHasCapture() void EmitBackreferenceConditional(RegexNode node) { Debug.Assert(node.Type is RegexNode.Testref, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 2, $"Expected 2 children, found {node.ChildCount()}"); // We're branching in a complicated fashion. Make sure sliceStaticPos is 0. TransferSliceStaticPosToPos(); @@ -1210,9 +1212,10 @@ void EmitBackreferenceConditional(RegexNode node) // Get the capture number to test. int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); - // Get the "yes" branch and the optional "no" branch, if it exists. + // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus + // somewhat likely to be Empty. RegexNode yesBranch = node.Child(0); - RegexNode? noBranch = node.ChildCount() > 1 && node.Child(1) is { Type: not RegexNode.Empty } childNo ? childNo : null; + RegexNode? noBranch = node.Child(1) is { Type: not RegexNode.Empty } childNo ? 
childNo : null; string originalDoneLabel = doneLabel; // If the child branches might backtrack, we can't emit the branches inside constructs that @@ -1239,10 +1242,13 @@ void EmitBackreferenceConditional(RegexNode node) } } - doneLabel = originalDoneLabel; + doneLabel = originalDoneLabel; // atomicity return; } + string refNotMatched = ReserveName("ConditionalBackreferenceNotMatched"); + string endConditional = ReserveName("ConditionalBackreferenceEnd"); + // As with alternations, we have potentially multiple branches, each of which may contain // backtracking constructs, but the expression after the conditional needs a single target // to backtrack to. So, we expose a single Backtrack label and track which branch was @@ -1255,7 +1261,6 @@ void EmitBackreferenceConditional(RegexNode node) // inside the scope block for the if or else, that will prevent jumping to them from // elsewhere. So we implement the if/else with labels and gotos manually. // Check to see if the specified capture number was captured. - string refNotMatched = ReserveName("ConditionalBackreferenceNotMatched"); using (EmitBlock(writer, $"if (!base.IsMatched({capnum}))")) { writer.WriteLine($"goto {refNotMatched};"); @@ -1272,10 +1277,11 @@ void EmitBackreferenceConditional(RegexNode node) { writer.WriteLine($"{resumeAt} = 0;"); } - string endRef = ReserveName("ConditionalBackreferenceEnd"); - if (postYesDoneLabel != originalDoneLabel || noBranch is not null) + + bool needsEndConditional = postYesDoneLabel != originalDoneLabel || noBranch is not null; + if (needsEndConditional) { - writer.WriteLine($"goto {endRef};"); + writer.WriteLine($"goto {endConditional};"); writer.WriteLine(); } @@ -1283,7 +1289,6 @@ void EmitBackreferenceConditional(RegexNode node) string postNoDoneLabel = originalDoneLabel; if (noBranch is not null) { - // The earlier base.IsMatched returning false will jump to here. // Output the no branch. 
doneLabel = originalDoneLabel; EmitNode(noBranch); @@ -1308,16 +1313,19 @@ void EmitBackreferenceConditional(RegexNode node) // If either the yes branch or the no branch contained backtracking, subsequent expressions // might try to backtrack to here, so output a backtracking map based on resumeAt. - if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) + bool hasBacktracking = postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel; + if (hasBacktracking) { // Skip the backtracking section. - writer.WriteLine($"goto {endRef};"); + writer.WriteLine($"goto {endConditional};"); writer.WriteLine(); + // Backtrack section string backtrack = ReserveName("ConditionalBackreferenceBacktrack"); doneLabel = backtrack; MarkLabel(backtrack); + // Pop from the stack the branch that was used and jump back to its backtracking location. EmitStackPop(resumeAt); using (EmitBlock(writer, $"switch ({resumeAt})")) { @@ -1335,13 +1343,17 @@ void EmitBackreferenceConditional(RegexNode node) } } - if (postYesDoneLabel != originalDoneLabel || noBranch is not null) + if (needsEndConditional) { - MarkLabel(endRef); - if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) - { - EmitStackPush(resumeAt); - } + MarkLabel(endConditional); + } + + if (hasBacktracking) + { + // We're not atomic and at least one of the yes or no branches contained backtracking constructs, + // so finish outputting our backtracking logic, which involves pushing onto the stack which + // branch to backtrack into. + EmitStackPush(resumeAt); } } @@ -1349,87 +1361,105 @@ void EmitBackreferenceConditional(RegexNode node) void EmitExpressionConditional(RegexNode node) { Debug.Assert(node.Type is RegexNode.Testgroup, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 3, $"Expected 3 children, found {node.ChildCount()}"); bool isAtomic = node.IsAtomicByParent(); // We're branching in a complicated fashion. 
Make sure sliceStaticPos is 0. TransferSliceStaticPosToPos(); - // The first child node is the conditional expression. If this matches, then we branch to the "yes" branch. + // The first child node is the condition expression. If this matches, then we branch to the "yes" branch. // If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes" - // branch, otherwise. The conditional is treated as a positive lookahead if it isn't already one. - RegexNode conditional = node.Child(0); - if (conditional is { Type: RegexNode.Require }) - { - conditional = conditional.Child(0); - } + // branch, otherwise. The condition is treated as a positive lookahead. + RegexNode condition = node.Child(0); - // Get the "yes" branch and the optional "no" branch, if it exists. + // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus + // somewhat likely to be Empty. RegexNode yesBranch = node.Child(1); - RegexNode? noBranch = node.ChildCount() > 2 && node.Child(2) is { Type: not RegexNode.Empty } childNo ? childNo : null; + RegexNode? noBranch = node.Child(2) is { Type: not RegexNode.Empty } childNo ? childNo : null; + string originalDoneLabel = doneLabel; - string end = ReserveName("end"); - string? no = noBranch is not null ? ReserveName("ConditionalExpressionNoBranch") : null; + string expressionNotMatched = ReserveName("ConditionalExpressionNotMatched"); + string endConditional = ReserveName("ConditionalExpressionEnd"); - // If the conditional expression has captures, we'll need to uncapture them in the case of no match. - string? 
startingCapturePos = null; - if ((conditional.Options & RegexNode.HasCapturesFlag) != 0) - { - startingCapturePos = ReserveName("conditionalexpression_starting_capturepos"); - writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();"); - } - - string resumeAt = ReserveName("conditionalexpression_resumeAt"); + // As with alternations, we have potentially multiple branches, each of which may contain + // backtracking constructs, but the expression after the condition needs a single target + // to backtrack to. So, we expose a single Backtrack label and track which branch was + // followed in this resumeAt local. + string resumeAt = ReserveName("conditionalexpression_branch"); if (!isAtomic) { writer.WriteLine($"int {resumeAt} = 0;"); } - // Emit the conditional expression. We need to reroute any match failures to either the "no" branch - // if it exists, or to the end of the node (skipping the "yes" branch) if it doesn't. - string originalDoneLabel = doneLabel; - string tmpDoneLabel = no ?? end; - doneLabel = tmpDoneLabel; - EmitPositiveLookaheadAssertionChild(conditional); - if (doneLabel == tmpDoneLabel) + // If the condition expression has captures, we'll need to uncapture them in the case of no match. + string? startingCapturePos = null; + if ((condition.Options & RegexNode.HasCapturesFlag) != 0) { - doneLabel = originalDoneLabel; + startingCapturePos = ReserveName("conditionalexpression_starting_capturepos"); + writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();"); } - string postConditionalDoneLabel = doneLabel; - // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch. - // Since the "yes" branch may have a different execution path than the "no" branch or the lack of - // any branch, we need to store the current sliceStaticPos and reset it prior to emitting the code - // for what comes after the "yes" branch, so that everyone is on equal footing. + // Emit the condition expression. 
Route any failures to after the yes branch. This code is almost + // the same as for a positive lookahead; however, a positive lookahead only needs to reset the position + // on a successful match, as a failed match fails the whole expression; here, we need to reset the + // position on completion, regardless of whether the match is successful or not. + doneLabel = expressionNotMatched; + + // Save off pos. We'll need to reset this upon successful completion of the lookahead. + string startingPos = ReserveName("conditionalexpression_starting_pos"); + writer.WriteLine($"int {startingPos} = pos;"); + writer.WriteLine(); int startingSliceStaticPos = sliceStaticPos; + + // Emit the child. The condition expression is a zero-width assertion, which is atomic, + // so prevent backtracking into it. + writer.WriteLine("// Condition:"); + EmitNode(condition); + writer.WriteLine(); + doneLabel = originalDoneLabel; + + // After the condition completes successfully, reset the text positions. + // Do not reset captures, which persist beyond the lookahead. + writer.WriteLine("// Condition matched:"); + writer.WriteLine($"pos = {startingPos};"); + SliceInputSpan(writer); + sliceStaticPos = startingSliceStaticPos; + writer.WriteLine(); + + // The expression matched. Run the "yes" branch. If it successfully matches, jump to the end. 
EmitNode(yesBranch); writer.WriteLine(); - TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0 + TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch string postYesDoneLabel = doneLabel; - if (postYesDoneLabel != originalDoneLabel || noBranch is not null) + if (!isAtomic && postYesDoneLabel != originalDoneLabel) { - writer.WriteLine($"goto {end};"); + writer.WriteLine($"{resumeAt} = 0;"); } + writer.WriteLine($"goto {endConditional};"); + writer.WriteLine(); + + // After the condition completes unsuccessfully, reset the text positions + // _and_ reset captures, which should not persist when the whole expression failed. + writer.WriteLine("// Condition did not match:"); + MarkLabel(expressionNotMatched, emitSemicolon: false); + writer.WriteLine($"pos = {startingPos};"); + SliceInputSpan(writer); + sliceStaticPos = startingSliceStaticPos; + if (startingCapturePos is not null) + { + EmitUncaptureUntil(startingCapturePos); + } + writer.WriteLine(); - // If there's a no branch, we need to emit it, but skipping it from a successful "yes" branch match. string postNoDoneLabel = originalDoneLabel; if (noBranch is not null) { - writer.WriteLine(); - - // Emit the no branch, first uncapturing any captures from the expression condition that failed - // to match and emit the branch. - MarkLabel(no, emitSemicolon: startingCapturePos is null); - if (startingCapturePos is not null) - { - EmitUncaptureUntil(startingCapturePos); - } - - doneLabel = postConditionalDoneLabel; - sliceStaticPos = startingSliceStaticPos; + // Output the no branch. 
+ doneLabel = originalDoneLabel; EmitNode(noBranch); writer.WriteLine(); - TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0 + TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch postNoDoneLabel = doneLabel; if (!isAtomic && postNoDoneLabel != originalDoneLabel) { @@ -1447,51 +1477,49 @@ void EmitExpressionConditional(RegexNode node) } } - if (isAtomic) + // If either the yes branch or the no branch contained backtracking, subsequent expressions + // might try to backtrack to here, so output a backtracking map based on resumeAt. + if (isAtomic || (postYesDoneLabel == originalDoneLabel && postNoDoneLabel == originalDoneLabel)) { doneLabel = originalDoneLabel; + MarkLabel(endConditional); } else { - if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel) - { - // Skip the backtracking section. - writer.WriteLine($"goto {end};"); - writer.WriteLine(); + // Skip the backtracking section. 
+ writer.WriteLine($"goto {endConditional};"); + writer.WriteLine(); - string backtrack = ReserveName("ConditionalExpressionBacktrack"); - doneLabel = backtrack; - MarkLabel(backtrack); + string backtrack = ReserveName("ConditionalExpressionBacktrack"); + doneLabel = backtrack; + MarkLabel(backtrack, emitSemicolon: false); - using (EmitBlock(writer, $"switch ({StackPop()})")) + EmitStackPop(resumeAt); + using (EmitBlock(writer, $"switch ({resumeAt})")) + { + if (postYesDoneLabel != originalDoneLabel) { - if (postYesDoneLabel != postConditionalDoneLabel) - { - writer.WriteLine($"case 0: goto {postYesDoneLabel};"); - } - - if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel) - { - writer.WriteLine($"case 1: goto {postNoDoneLabel};"); - } + writer.WriteLine($"case 0: goto {postYesDoneLabel};"); + } - writer.WriteLine($"default: goto {postConditionalDoneLabel};"); + if (postNoDoneLabel != originalDoneLabel) + { + writer.WriteLine($"case 1: goto {postNoDoneLabel};"); } - } - if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) - { - EmitStackPush(resumeAt); + writer.WriteLine($"default: goto {originalDoneLabel};"); } - } - MarkLabel(end); + MarkLabel(endConditional, emitSemicolon: false); + EmitStackPush(resumeAt); + } } // Emits the code for a Capture node. void EmitCapture(RegexNode node, RegexNode? subsequent = null) { Debug.Assert(node.Type is RegexNode.Capture, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); int uncapnum = RegexParser.MapCaptureNumber(node.N, rm.Code.Caps); @@ -1542,7 +1570,7 @@ void EmitCapture(RegexNode node, RegexNode? 
subsequent = null) // Emit a backtracking section that restores the capture's state and then jumps to the previous done label string backtrack = ReserveName($"CaptureBacktrack"); - MarkLabel(backtrack); + MarkLabel(backtrack, emitSemicolon: false); EmitStackPop(startingPos); if (!childBacktracks) { @@ -1565,12 +1593,8 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) void EmitPositiveLookaheadAssertion(RegexNode node) { Debug.Assert(node.Type is RegexNode.Require, $"Unexpected type: {node.Type}"); - EmitPositiveLookaheadAssertionChild(node.Child(0)); - } + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); - // Emits the code to handle a node as if it's wrapped in a positive lookahead assertion. - void EmitPositiveLookaheadAssertionChild(RegexNode child) - { // Lookarounds are implicitly atomic. Store the original done label to reset at the end. string originalDoneLabel = doneLabel; @@ -1581,7 +1605,7 @@ void EmitPositiveLookaheadAssertionChild(RegexNode child) int startingSliceStaticPos = sliceStaticPos; // Emit the child. - EmitNode(child); + EmitNode(node.Child(0)); // After the child completes successfully, reset the text positions. // Do not reset captures, which persist beyond the lookahead. @@ -1597,6 +1621,7 @@ void EmitPositiveLookaheadAssertionChild(RegexNode child) void EmitNegativeLookaheadAssertion(RegexNode node) { Debug.Assert(node.Type is RegexNode.Prevent, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); // Lookarounds are implicitly atomic. Store the original done label to reset at the end. string originalDoneLabel = doneLabel; @@ -1776,6 +1801,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck void EmitAtomic(RegexNode node, RegexNode? 
subsequent) { Debug.Assert(node.Type is RegexNode.Atomic, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); // Atomic simply outputs the code for the child, but it ensures that any done label left // set by the child is reset to what it was prior to the node's processing. That way, @@ -1800,6 +1826,7 @@ void EmitUpdateBumpalong(RegexNode node) void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) { Debug.Assert(node.Type is RegexNode.Concatenate, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}"); // Emit the code for each child one after the other. string? prevDescription = null; @@ -2217,7 +2244,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL SliceInputSpan(writer); writer.WriteLine(); - MarkLabel(endLoop); + MarkLabel(endLoop, emitSemicolon: false); EmitStackPush(expressionHasCaptures ? 
new[] { startingPos, endingPos, "base.Crawlpos()" } : new[] { startingPos, endingPos }); @@ -2348,7 +2375,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true) // Emit a backtracking section that restores the capture's state and then jumps to the previous done label string backtrack = ReserveName("CharLazyBacktrack"); - MarkLabel(backtrack); + MarkLabel(backtrack, emitSemicolon: false); Array.Reverse(toPushPopArray); EmitStackPop(toPushPopArray); @@ -2366,6 +2393,8 @@ void EmitLazy(RegexNode node) Debug.Assert(node.Type is RegexNode.Lazyloop, $"Unexpected type: {node.Type}"); Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}"); Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + int minIterations = node.M; int maxIterations = node.N; string originalDoneLabel = doneLabel; @@ -2515,7 +2544,7 @@ void EmitLazy(RegexNode node) // Emit a backtracking section that restores the capture's state and then jumps to the previous done label string backtrack = ReserveName($"LazyLoopBacktrack"); - MarkLabel(backtrack); + MarkLabel(backtrack, emitSemicolon: false); EmitStackPop(sawEmpty, iterationCount, startingPos); @@ -2782,6 +2811,8 @@ void EmitLoop(RegexNode node) Debug.Assert(node.Type is RegexNode.Loop or RegexNode.Lazyloop, $"Unexpected type: {node.Type}"); Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}"); Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + int minIterations = node.M; int maxIterations = node.N; bool isAtomic = node.IsAtomicByParent(); @@ -2901,7 +2932,7 @@ void EmitLoop(RegexNode node) writer.WriteLine(); string backtrack = ReserveName("LoopBacktrack"); - MarkLabel(backtrack); + MarkLabel(backtrack, emitSemicolon: false); using (EmitBlock(writer, $"if ({iterationCount} == 0)")) { 
writer.WriteLine($"goto {originalDoneLabel};"); @@ -2926,7 +2957,7 @@ void EmitLoop(RegexNode node) // Emit a backtracking section that restores the capture's state and then jumps to the previous done label string backtrack = ReserveName("LoopBacktrack"); - MarkLabel(backtrack); + MarkLabel(backtrack, emitSemicolon: false); EmitStackPop(iterationCount, startingPos); writer.WriteLine($"goto {doneLabel};"); @@ -3395,8 +3426,8 @@ private static string DescribeNode(RegexNode node) => RegexNode.Set => $"Match a character in the set {RegexCharClass.SetDescription(node.Str!)}.", RegexNode.Setloop or RegexNode.Setloopatomic or RegexNode.Setlazy => $"Match a character in the set {RegexCharClass.SetDescription(node.Str!)} {DescribeLoop(node)}.", RegexNode.Start => "Match if at the start position.", - RegexNode.Testgroup => $"Conditionally match {(node.ChildCount() == 2 ? "an expression" : "one of two expressions")} depending on whether an initial expression matches.", - RegexNode.Testref => $"Conditionally match {(node.ChildCount() == 1 ? 
"an expression" : "one of two expressions")} depending on whether the {DescribeNonNegative(node.M)} capture group matched.", + RegexNode.Testgroup => $"Conditionally match one of two expressions depending on whether an initial expression matches.", + RegexNode.Testref => $"Conditionally match one of two expressions depending on whether the {DescribeNonNegative(node.M)} capture group matched.", RegexNode.UpdateBumpalong => $"Advance the next matching position.", _ => $"Unknown node type {node.Type}", }; @@ -3423,20 +3454,29 @@ RegexNode.Atomic when node.Child(0).Type is RegexNode.Loop or RegexNode.Lazyloop if (!skip) { + string tag = node.Next?.Type switch + { + RegexNode.Testgroup when node.Next.Child(0) == node => "Condition: ", + RegexNode.Testgroup when node.Next.Child(1) == node => "Matched: ", + RegexNode.Testgroup when node.Next.Child(2) == node => "Not Matched: ", + + RegexNode.Testref when node.Next.Child(0) == node => "Matched: ", + RegexNode.Testref when node.Next.Child(1) == node => "Not Matched: ", + + _ => "", + }; + // Write out the line for the node. const char BulletPoint = '\u25CB'; - writer.WriteLine($"{prefix}{new string(' ', depth * 4)}{BulletPoint} {DescribeNode(node)}"); + writer.WriteLine($"{prefix}{new string(' ', depth * 4)}{BulletPoint} {tag}{DescribeNode(node)}"); } // Recur into each of its children. int childCount = node.ChildCount(); - if (childCount > 0) + for (int i = 0; i < childCount; i++) { - for (int i = 0; i < childCount; i++) - { - int childDepth = skip ? depth : depth + 1; - DescribeExpression(writer, node.Child(i), prefix, childDepth); - } + int childDepth = skip ? 
depth : depth + 1; + DescribeExpression(writer, node.Child(i), prefix, childDepth); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 5f6fb626e8de0c..29875367928e34 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1075,6 +1075,9 @@ void TransferSliceStaticPosToPos() // Emits the code for an alternation. void EmitAlternation(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Alternate, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}"); + int childCount = node.ChildCount(); Debug.Assert(childCount >= 2); @@ -1169,13 +1172,13 @@ void EmitAlternation(RegexNode node) // base.runstack[stackpos++] = i; // base.runstack[stackpos++] = startingCapturePos; // base.runstack[stackpos++] = startingPos; - EmitRunstackResizeIfNeeded(3); - EmitRunstackPush(() => Ldc(i)); + EmitStackResizeIfNeeded(3); + EmitStackPush(() => Ldc(i)); if (startingCapturePos is not null) { - EmitRunstackPush(() => Ldloc(startingCapturePos)); + EmitStackPush(() => Ldloc(startingCapturePos)); } - EmitRunstackPush(() => Ldloc(startingPos)); + EmitStackPush(() => Ldloc(startingPos)); } labelMap[i] = doneLabel; @@ -1230,14 +1233,14 @@ void EmitAlternation(RegexNode node) // startingPos = base.runstack[--stackpos]; // startingCapturePos = base.runstack[--stackpos]; // switch (base.runstack[--stackpos]) { ... 
} // branch number - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); if (startingCapturePos is not null) { - EmitRunstackPop(); + EmitStackPop(); Stloc(startingCapturePos); } - EmitRunstackPop(); + EmitStackPop(); Switch(labelMap); } @@ -1249,6 +1252,8 @@ void EmitAlternation(RegexNode node) // Emits the code to handle a backreference. void EmitBackreference(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Ref, $"Unexpected type: {node.Type}"); + int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); TransferSliceStaticPosToPos(); @@ -1339,6 +1344,9 @@ void EmitBackreference(RegexNode node) // Emits the code for an if(backreference)-then-else conditional. void EmitBackreferenceConditional(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Testref, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 2, $"Expected 2 children, found {node.ChildCount()}"); + bool isAtomic = node.IsAtomicByParent(); // We're branching in a complicated fashion. Make sure sliceStaticPos is 0. @@ -1347,9 +1355,14 @@ void EmitBackreferenceConditional(RegexNode node) // Get the capture number to test. int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); + // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus + // somewhat likely to be Empty. + RegexNode yesBranch = node.Child(0); + RegexNode? noBranch = node.Child(1) is { Type: not RegexNode.Empty } childNo ? 
childNo : null; Label originalDoneLabel = doneLabel; - Label backreferenceConditionalEnd = DefineLabel(); - bool hasNo = node.ChildCount() > 1 && node.Child(1).Type != RegexNode.Empty; + + Label refNotMatched = DefineLabel(); + Label endConditional = DefineLabel(); // As with alternations, we have potentially multiple branches, each of which may contain // backtracking constructs, but the expression after the conditional needs a single target @@ -1358,7 +1371,6 @@ void EmitBackreferenceConditional(RegexNode node) LocalBuilder resumeAt = DeclareInt32(); // if (!base.IsMatched(capnum)) goto refNotMatched; - Label refNotMatched = DefineLabel(); Ldthis(); Ldc(capnum); Call(s_isMatchedMethod); @@ -1366,32 +1378,33 @@ void EmitBackreferenceConditional(RegexNode node) // The specified capture was captured. Run the "yes" branch. // If it successfully matches, jump to the end. - EmitNode(node.Child(0)); + EmitNode(yesBranch); TransferSliceStaticPosToPos(); - Label postIfDoneLabel = doneLabel; - if (postIfDoneLabel != originalDoneLabel) + Label postYesDoneLabel = doneLabel; + if (!isAtomic && postYesDoneLabel != originalDoneLabel) { // resumeAt = 0; Ldc(0); Stloc(resumeAt); } - if (postIfDoneLabel != originalDoneLabel || hasNo) + + bool needsEndConditional = postYesDoneLabel != originalDoneLabel || noBranch is not null; + if (needsEndConditional) { - // goto endRef; - BrFar(backreferenceConditionalEnd); + // goto endConditional; + BrFar(endConditional); } MarkLabel(refNotMatched); - Label postElseDoneLabel = originalDoneLabel; - if (hasNo) + Label postNoDoneLabel = originalDoneLabel; + if (noBranch is not null) { - // The earlier base.IsMatched returning false will jump to here. // Output the no branch. 
doneLabel = originalDoneLabel; - EmitNode(node.Child(1)); + EmitNode(noBranch); TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch - postElseDoneLabel = doneLabel; - if (postElseDoneLabel != originalDoneLabel) + postNoDoneLabel = doneLabel; + if (!isAtomic && postNoDoneLabel != originalDoneLabel) { // resumeAt = 1; Ldc(1); @@ -1403,7 +1416,7 @@ void EmitBackreferenceConditional(RegexNode node) // There's only a yes branch. If it's going to cause us to output a backtracking // label but code may not end up taking the yes branch path, we need to emit a resumeAt // that will cause the backtracking to immediately pass through this node. - if (postIfDoneLabel != originalDoneLabel) + if (!isAtomic && postYesDoneLabel != originalDoneLabel) { // resumeAt = 2; Ldc(2); @@ -1411,92 +1424,107 @@ void EmitBackreferenceConditional(RegexNode node) } } - if (isAtomic) + if (isAtomic || (postYesDoneLabel == originalDoneLabel && postNoDoneLabel == originalDoneLabel)) { + // We're atomic by our parent, so even if either child branch has backtracking constructs, + // we don't need to emit any backtracking logic in support, as nothing will backtrack in. + // Instead, we just ensure we revert back to the original done label so that any backtracking + // skips over this node. doneLabel = originalDoneLabel; + if (needsEndConditional) + { + MarkLabel(endConditional); + } } else { - // If either the yes branch or the no branch contained backtracking, subsequent expressions - // might try to backtrack to here, so output a backtracking map based on resumeAt. - if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) - { - // Skip the backtracking section - // goto endRef; - Br(backreferenceConditionalEnd); + // Subsequent expressions might try to backtrack to here, so output a backtracking map based on resumeAt. 
- Label backtrack = DefineLabel(); - doneLabel = backtrack; - MarkLabel(backtrack); + // Skip the backtracking section + // goto endConditional; + Debug.Assert(needsEndConditional); + Br(endConditional); - // resumeAt = base.runstack[--stackpos]; - EmitRunstackPop(); - Stloc(resumeAt); + // Backtrack section + Label backtrack = DefineLabel(); + doneLabel = backtrack; + MarkLabel(backtrack); - if (postIfDoneLabel != originalDoneLabel) - { - // if (resumeAt == 0) goto postIfDoneLabel; - Ldloc(resumeAt); - Ldc(0); - BeqFar(postIfDoneLabel); - } + // Pop from the stack the branch that was used and jump back to its backtracking location. - if (postElseDoneLabel != originalDoneLabel) - { - // if (resumeAt == 1) goto postElseDoneLabel; - Ldloc(resumeAt); - Ldc(1); - BeqFar(postElseDoneLabel); - } + // resumeAt = base.runstack[--stackpos]; + EmitStackPop(); + Stloc(resumeAt); - // goto originalDoneLabel; - BrFar(originalDoneLabel); + if (postYesDoneLabel != originalDoneLabel) + { + // if (resumeAt == 0) goto postYesDoneLabel; + Ldloc(resumeAt); + Ldc(0); + BeqFar(postYesDoneLabel); } - } - if (postIfDoneLabel != originalDoneLabel || hasNo) - { - MarkLabel(backreferenceConditionalEnd); - if (!isAtomic && (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel)) + if (postNoDoneLabel != originalDoneLabel) + { + // if (resumeAt == 1) goto postNoDoneLabel; + Ldloc(resumeAt); + Ldc(1); + BeqFar(postNoDoneLabel); + } + + // goto originalDoneLabel; + BrFar(originalDoneLabel); + + if (needsEndConditional) { - // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); - // base.runstack[stackpos++] = resumeAt; - EmitRunstackResizeIfNeeded(1); - EmitRunstackPush(() => Ldloc(resumeAt)); + MarkLabel(endConditional); } + + // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); + // base.runstack[stackpos++] = resumeAt; + EmitStackResizeIfNeeded(1); + EmitStackPush(() => 
Ldloc(resumeAt)); } } // Emits the code for an if(expression)-then-else conditional. void EmitExpressionConditional(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Testgroup, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 3, $"Expected 3 children, found {node.ChildCount()}"); + bool isAtomic = node.IsAtomicByParent(); // We're branching in a complicated fashion. Make sure sliceStaticPos is 0. TransferSliceStaticPosToPos(); - // The first child node is the conditional expression. If this matches, then we branch to the "yes" branch. + // The first child node is the condition expression. If this matches, then we branch to the "yes" branch. // If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes" - // branch, otherwise. The conditional is treated as a positive lookahead. If it's not already - // such a node, wrap it in one. - RegexNode conditional = node.Child(0); - if (conditional is not { Type: RegexNode.Require }) - { - var newConditional = new RegexNode(RegexNode.Require, conditional.Options); - newConditional.AddChild(conditional); - conditional = newConditional; - } + // branch, otherwise. The condition is treated as a positive lookahead. + RegexNode condition = node.Child(0); - // Get the "yes" branch and the optional "no" branch, if it exists. + // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus + // somewhat likely to be Empty. RegexNode yesBranch = node.Child(1); - RegexNode? noBranch = node.ChildCount() > 2 && node.Child(2) is { Type: not RegexNode.Empty } childNo ? childNo : null; + RegexNode? noBranch = node.Child(2) is { Type: not RegexNode.Empty } childNo ? 
childNo : null; + Label originalDoneLabel = doneLabel; - Label expressionConditionalEnd = DefineLabel(); - Label no = DefineLabel(); + Label expressionNotMatched = DefineLabel(); + Label endConditional = DefineLabel(); + + // As with alternations, we have potentially multiple branches, each of which may contain + // backtracking constructs, but the expression after the condition needs a single target + // to backtrack to. So, we expose a single Backtrack label and track which branch was + // followed in this resumeAt local. + LocalBuilder? resumeAt = null; + if (!isAtomic) + { + resumeAt = DeclareInt32(); + } - // If the conditional expression has captures, we'll need to uncapture them in the case of no match. + // If the condition expression has captures, we'll need to uncapture them in the case of no match. LocalBuilder? startingCapturePos = null; - if ((conditional.Options & RegexNode.HasCapturesFlag) != 0) + if ((condition.Options & RegexNode.HasCapturesFlag) != 0) { // int startingCapturePos = base.Crawlpos(); startingCapturePos = DeclareInt32(); @@ -1505,62 +1533,73 @@ void EmitExpressionConditional(RegexNode node) Stloc(startingCapturePos); } - // Emit the conditional expression. We need to reroute any match failures to either the "no" branch - // if it exists, or to the end of the node (skipping the "yes" branch) if it doesn't. - Label originalDoneLabel = doneLabel; - Label tmpDoneLabel = noBranch is not null ? no : expressionConditionalEnd; - doneLabel = tmpDoneLabel; - EmitPositiveLookaheadAssertion(conditional); - if (doneLabel == tmpDoneLabel) - { - doneLabel = originalDoneLabel; - } + // Emit the condition expression. Route any failures to after the yes branch. 
This code is almost + // the same as for a positive lookahead; however, a positive lookahead only needs to reset the position + // on a successful match, as a failed match fails the whole expression; here, we need to reset the + // position on completion, regardless of whether the match is successful or not. + doneLabel = expressionNotMatched; - Label postConditionalDoneLabel = doneLabel; - LocalBuilder? resumeAt = !isAtomic ? DeclareInt32() : null; + // Save off pos. We'll need to reset this upon successful completion of the lookahead. + // startingPos = pos; + LocalBuilder startingPos = DeclareInt32(); + Ldloc(pos); + Stloc(startingPos); + int startingSliceStaticPos = sliceStaticPos; - // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch. - // Since the "yes" branch may have a different execution path than the "no" branch or the lack of - // any branch, we need to store the current sliceStaticPos and reset it prior to emitting the code - // for what comes after the "yes" branch, so that everyone is on equal footing. - int startingTextSpanPos = sliceStaticPos; + // Emit the child. The condition expression is a zero-width assertion, which is atomic, + // so prevent backtracking into it. + EmitNode(condition); + doneLabel = originalDoneLabel; + + // After the condition completes successfully, reset the text positions. + // Do not reset captures, which persist beyond the lookahead. + // pos = startingPos; + // slice = inputSpan.Slice(pos, end - pos); + Ldloc(startingPos); + Stloc(pos); + SliceInputSpan(); + sliceStaticPos = startingSliceStaticPos; + + // The expression matched. Run the "yes" branch. If it successfully matches, jump to the end. 
EmitNode(yesBranch); - TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0 + TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch Label postYesDoneLabel = doneLabel; - if (resumeAt is not null && postYesDoneLabel != originalDoneLabel) + if (!isAtomic && postYesDoneLabel != originalDoneLabel) { // resumeAt = 0; Ldc(0); - Stloc(resumeAt); + Stloc(resumeAt!); } - if (postYesDoneLabel != originalDoneLabel || noBranch is not null) + + // goto endConditional; + BrFar(endConditional); + + // After the condition completes unsuccessfully, reset the text positions + // _and_ reset captures, which should not persist when the whole expression failed. + // pos = startingPos; + MarkLabel(expressionNotMatched); + Ldloc(startingPos); + Stloc(pos); + SliceInputSpan(); + sliceStaticPos = startingSliceStaticPos; + if (startingCapturePos is not null) { - // goto end; - BrFar(expressionConditionalEnd); + EmitUncaptureUntil(startingCapturePos); } - // If there's a no branch, we need to emit it, but skipping it from a successful "yes" branch match. Label postNoDoneLabel = originalDoneLabel; if (noBranch is not null) { - // Emit the no branch, first uncapturing any captures from the expression condition that failed - // to match and emit the branch. - MarkLabel(no); - if (startingCapturePos is not null) - { - // while (base.Crawlpos() > startingCapturePos) base.Uncapture(); - EmitUncaptureUntil(startingCapturePos); - } - - doneLabel = postConditionalDoneLabel; - sliceStaticPos = startingTextSpanPos; + // Output the no branch. 
+ doneLabel = originalDoneLabel; EmitNode(noBranch); - TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0 + TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch postNoDoneLabel = doneLabel; - if (postNoDoneLabel != originalDoneLabel) + if (!isAtomic && postNoDoneLabel != originalDoneLabel) { - // goto end; - BrFar(expressionConditionalEnd); + // resumeAt = 1; + Ldc(1); + Stloc(resumeAt!); } } else @@ -1568,66 +1607,72 @@ void EmitExpressionConditional(RegexNode node) // There's only a yes branch. If it's going to cause us to output a backtracking // label but code may not end up taking the yes branch path, we need to emit a resumeAt // that will cause the backtracking to immediately pass through this node. - if (resumeAt is not null && postYesDoneLabel != originalDoneLabel) + if (!isAtomic && postYesDoneLabel != originalDoneLabel) { // resumeAt = 2; Ldc(2); - Stloc(resumeAt); + Stloc(resumeAt!); } } - if (isAtomic) + // If either the yes branch or the no branch contained backtracking, subsequent expressions + // might try to backtrack to here, so output a backtracking map based on resumeAt. + if (isAtomic || (postYesDoneLabel == originalDoneLabel && postNoDoneLabel == originalDoneLabel)) { + // EndConditional: doneLabel = originalDoneLabel; + MarkLabel(endConditional); } else { Debug.Assert(resumeAt is not null); - if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel) - { - // Skip the backtracking section. - BrFar(expressionConditionalEnd); - Label backtrack = DefineLabel(); - doneLabel = backtrack; - MarkLabel(backtrack); + // Skip the backtracking section. 
+ BrFar(endConditional); - if (postYesDoneLabel != postConditionalDoneLabel) - { - // if (resumeAt == 0) goto postYesDoneLabel; - Ldloc(resumeAt); - Ldc(0); - BeqFar(postYesDoneLabel); - } + Label backtrack = DefineLabel(); + doneLabel = backtrack; + MarkLabel(backtrack); - if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel) - { - // if (resumeAt == 1) goto postNoDoneLabel; - Ldloc(resumeAt); - Ldc(1); - BeqFar(postNoDoneLabel); - } + // resumeAt = StackPop(); + EmitStackPop(); + Stloc(resumeAt); - // goto postConditionalDoneLabel; - BrFar(postConditionalDoneLabel); + if (postYesDoneLabel != originalDoneLabel) + { + // if (resumeAt == 0) goto postYesDoneLabel; + Ldloc(resumeAt); + Ldc(0); + BeqFar(postYesDoneLabel); } - if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) + if (postNoDoneLabel != originalDoneLabel) { - // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); - // base.runstack[stackpos++] = resumeAt; - EmitRunstackResizeIfNeeded(1); - EmitRunstackPush(() => Ldloc(resumeAt)); + // if (resumeAt == 1) goto postNoDoneLabel; + Ldloc(resumeAt); + Ldc(1); + BeqFar(postNoDoneLabel); } - } - MarkLabel(expressionConditionalEnd); + // goto originalDoneLabel; + BrFar(originalDoneLabel); + + // EndConditional: + MarkLabel(endConditional); + + // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); + // base.runstack[stackpos++] = resumeAt; + EmitStackResizeIfNeeded(1); + EmitStackPush(() => Ldloc(resumeAt!)); + } } // Emits the code for a Capture node. void EmitCapture(RegexNode node, RegexNode? 
subsequent = null) { - Debug.Assert(node.Type == RegexNode.Capture); + Debug.Assert(node.Type is RegexNode.Capture, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); int uncapnum = RegexParser.MapCaptureNumber(node.N, _code.Caps); bool isAtomic = node.IsAtomicByParent(); @@ -1685,8 +1730,8 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) { // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); // base.runstack[stackpos++] = startingPos; - EmitRunstackResizeIfNeeded(1); - EmitRunstackPush(() => Ldloc(startingPos)); + EmitStackResizeIfNeeded(1); + EmitStackPush(() => Ldloc(startingPos)); // Skip past the backtracking section // goto backtrackingEnd; @@ -1696,7 +1741,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) // Emit a backtracking section that restores the capture's state and then jumps to the previous done label Label backtrack = DefineLabel(); MarkLabel(backtrack); - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); if (!childBacktracks) { @@ -1742,6 +1787,9 @@ void EmitUncaptureUntil(LocalBuilder startingCapturePos) // Emits the code to handle a positive lookahead assertion. void EmitPositiveLookaheadAssertion(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Require, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + // Lookarounds are implicitly atomic. Store the original done label to reset at the end. Label originalDoneLabel = doneLabel; @@ -1770,6 +1818,9 @@ void EmitPositiveLookaheadAssertion(RegexNode node) // Emits the code to handle a negative lookahead assertion. 
void EmitNegativeLookaheadAssertion(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Prevent, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + // Lookarounds are implicitly atomic. Store the original done label to reset at the end. Label originalDoneLabel = doneLabel; @@ -1916,7 +1967,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck break; case RegexNode.UpdateBumpalong: - EmitUpdateBumpalong(); + EmitUpdateBumpalong(node); break; default: @@ -1928,6 +1979,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck // Emits the node for an atomic. void EmitAtomic(RegexNode node, RegexNode? subsequent) { + Debug.Assert(node.Type is RegexNode.Atomic, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + // Atomic simply outputs the code for the child, but it ensures that any done label left // set by the child is reset to what it was prior to the node's processing. That way, // anything later that tries to jump back won't see labels set inside the atomic. @@ -1939,8 +1993,10 @@ void EmitAtomic(RegexNode node, RegexNode? subsequent) // Emits the code to handle updating base.runtextpos to pos in response to // an UpdateBumpalong node. This is used when we want to inform the scan loop that // it should bump from this location rather than from the original location. - void EmitUpdateBumpalong() + void EmitUpdateBumpalong(RegexNode node) { + Debug.Assert(node.Type is RegexNode.UpdateBumpalong, $"Unexpected type: {node.Type}"); + // base.runtextpos = pos; TransferSliceStaticPosToPos(); Ldthis(); @@ -1951,6 +2007,9 @@ void EmitUpdateBumpalong() // Emits code for a concatenation void EmitConcatenation(RegexNode node, RegexNode? 
subsequent, bool emitLengthChecksIfRequired) { + Debug.Assert(node.Type is RegexNode.Concatenate, $"Unexpected type: {node.Type}"); + Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}"); + // Emit the code for each child one after the other. int childCount = node.ChildCount(); for (int i = 0; i < childCount; i++) @@ -1976,6 +2035,8 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null) { + Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Type}"); + // This only emits a single check, but it's called from the looping constructs in a loop // to generate the code for a single check, so we check for each "family" (one, notone, set) // rather than only for the specific single character nodes. @@ -2017,6 +2078,8 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o // Emits the code to handle a boundary check on a character. void EmitBoundary(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary, $"Unexpected type: {node.Type}"); + // if (!IsBoundary(pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel; Ldthis(); Ldloc(pos); @@ -2055,6 +2118,8 @@ void EmitBoundary(RegexNode node) // Emits the code to handle various anchors. void EmitAnchors(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Beginning or RegexNode.Start or RegexNode.Bol or RegexNode.End or RegexNode.EndZ or RegexNode.Eol, $"Unexpected type: {node.Type}"); + Debug.Assert(sliceStaticPos >= 0); switch (node.Type) { @@ -2147,6 +2212,8 @@ void EmitAnchors(RegexNode node) // Emits the code to handle a multiple-character match. 
void EmitMultiChar(RegexNode node, bool emitLengthCheck = true) { + Debug.Assert(node.Type is RegexNode.Multi, $"Unexpected type: {node.Type}"); + bool caseInsensitive = IsCaseInsensitive(node); // If the multi string's length exceeds the maximum length we want to unroll, instead generate a call to StartsWith. @@ -2242,6 +2309,8 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck = true) // Emits the code to handle a backtracking, single-character loop. void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) { + Debug.Assert(node.Type is RegexNode.Oneloop or RegexNode.Notoneloop or RegexNode.Setloop, $"Unexpected type: {node.Type}"); + // If this is actually a repeater, emit that instead; no backtracking necessary. if (node.M == node.N) { @@ -2310,16 +2379,16 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL { // capturepos = base.runstack[--stackpos]; // while (base.Crawlpos() > capturepos) base.Uncapture(); - EmitRunstackPop(); + EmitStackPop(); Stloc(capturepos); EmitUncaptureUntil(capturepos); } // endingPos = base.runstack[--stackpos]; // startingPos = base.runstack[--stackpos]; - EmitRunstackPop(); + EmitStackPop(); Stloc(endingPos); - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); // if (startingPos >= endingPos) goto originalDoneLabel; @@ -2372,17 +2441,19 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL SliceInputSpan(); MarkLabel(endLoop); - EmitRunstackResizeIfNeeded(expressionHasCaptures ? 3 : 2); - EmitRunstackPush(() => Ldloc(startingPos)); - EmitRunstackPush(() => Ldloc(endingPos)); + EmitStackResizeIfNeeded(expressionHasCaptures ? 
3 : 2); + EmitStackPush(() => Ldloc(startingPos)); + EmitStackPush(() => Ldloc(endingPos)); if (capturepos is not null) { - EmitRunstackPush(() => Ldloc(capturepos!)); + EmitStackPush(() => Ldloc(capturepos!)); } } void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true) { + Debug.Assert(node.Type is RegexNode.Onelazy or RegexNode.Notonelazy or RegexNode.Setlazy, $"Unexpected type: {node.Type}"); + // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking, // as the lazy itself failed to match, and there's no backtracking possible by the individual // characters/iterations themselves. @@ -2500,15 +2571,15 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true) // base.runstack[stackpos++] = startingPos; // base.runstack[stackpos++] = capturepos; // base.runstack[stackpos++] = iterationCount; - EmitRunstackResizeIfNeeded(3); - EmitRunstackPush(() => Ldloc(startingPos)); + EmitStackResizeIfNeeded(3); + EmitStackPush(() => Ldloc(startingPos)); if (capturepos is not null) { - EmitRunstackPush(() => Ldloc(capturepos)); + EmitStackPush(() => Ldloc(capturepos)); } if (iterationCount is not null) { - EmitRunstackPush(() => Ldloc(iterationCount)); + EmitStackPush(() => Ldloc(iterationCount)); } // Skip past the backtracking section @@ -2524,15 +2595,15 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true) // startingPos = base.runstack[--stackpos]; if (iterationCount is not null) { - EmitRunstackPop(); + EmitStackPop(); Stloc(iterationCount); } if (capturepos is not null) { - EmitRunstackPop(); + EmitStackPop(); Stloc(capturepos); } - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); // goto doneLabel; @@ -2548,6 +2619,8 @@ void EmitLazy(RegexNode node) Debug.Assert(node.Type is RegexNode.Lazyloop, $"Unexpected type: {node.Type}"); Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}"); Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, 
N={node.N}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + int minIterations = node.M; int maxIterations = node.N; Label originalDoneLabel = doneLabel; @@ -2610,18 +2683,18 @@ void EmitLazy(RegexNode node) // base.runstack[stackpos++] = startingPos; // base.runstack[stackpos++] = pos; // base.runstack[stackpos++] = sawEmpty; - EmitRunstackResizeIfNeeded(3); + EmitStackResizeIfNeeded(3); if (expressionHasCaptures) { - EmitRunstackPush(() => + EmitStackPush(() => { Ldthis(); Call(s_crawlposMethod); }); } - EmitRunstackPush(() => Ldloc(startingPos)); - EmitRunstackPush(() => Ldloc(pos)); - EmitRunstackPush(() => Ldloc(sawEmpty)); + EmitStackPush(() => Ldloc(startingPos)); + EmitStackPush(() => Ldloc(pos)); + EmitStackPush(() => Ldloc(sawEmpty)); // Save off some state. We need to store the current pos so we can compare it against // pos after the iteration, in order to determine whether the iteration was empty. Empty @@ -2702,16 +2775,16 @@ void EmitLazy(RegexNode node) // startingPos = base.runstack[--stackpos]; // capturepos = base.runstack[--stackpos]; // while (base.Crawlpos() > capturepos) base.Uncapture(); - EmitRunstackPop(); + EmitStackPop(); Stloc(sawEmpty); - EmitRunstackPop(); + EmitStackPop(); Stloc(pos); - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); if (expressionHasCaptures) { using RentedLocalBuilder poppedCrawlPos = RentInt32Local(); - EmitRunstackPop(); + EmitStackPop(); Stloc(poppedCrawlPos); EmitUncaptureUntil(poppedCrawlPos); } @@ -2737,10 +2810,10 @@ void EmitLazy(RegexNode node) if (!isAtomic) { // Store the capture's state and skip the backtracking section - EmitRunstackResizeIfNeeded(3); - EmitRunstackPush(() => Ldloc(startingPos)); - EmitRunstackPush(() => Ldloc(iterationCount)); - EmitRunstackPush(() => Ldloc(sawEmpty)); + EmitStackResizeIfNeeded(3); + EmitStackPush(() => Ldloc(startingPos)); + EmitStackPush(() => Ldloc(iterationCount)); + EmitStackPush(() => Ldloc(sawEmpty)); Label 
skipBacktrack = DefineLabel(); BrFar(skipBacktrack); @@ -2751,11 +2824,11 @@ void EmitLazy(RegexNode node) // sawEmpty = base.runstack[--stackpos]; // iterationCount = base.runstack[--stackpos]; // startingPos = base.runstack[--stackpos]; - EmitRunstackPop(); + EmitStackPop(); Stloc(sawEmpty); - EmitRunstackPop(); + EmitStackPop(); Stloc(iterationCount); - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); if (maxIterations == int.MaxValue) @@ -2788,8 +2861,9 @@ void EmitLazy(RegexNode node) // RegexNode.M is used for the number of iterations; RegexNode.N is ignored. void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true) { - int iterations = node.M; + Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Type}"); + int iterations = node.M; if (iterations == 0) { // No iterations, nothing to do. @@ -2871,6 +2945,8 @@ void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired // Emits the code to handle a non-backtracking, variable-length loop around a single character comparison. void EmitSingleCharAtomicLoop(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Notoneloop or RegexNode.Notoneloopatomic or RegexNode.Setloop or RegexNode.Setloopatomic, $"Unexpected type: {node.Type}"); + // If this is actually a repeater, emit that instead. if (node.M == node.N) { @@ -3111,6 +3187,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) // Emits the code to handle a non-backtracking optional zero-or-one loop. 
void EmitAtomicSingleCharZeroOrOne(RegexNode node) { + Debug.Assert(node.Type is RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Notoneloop or RegexNode.Notoneloopatomic or RegexNode.Setloop or RegexNode.Setloopatomic, $"Unexpected type: {node.Type}"); Debug.Assert(node.M == 0 && node.N == 1); Label skipUpdatesLabel = DefineLabel(); @@ -3168,6 +3245,8 @@ void EmitLoop(RegexNode node) Debug.Assert(node.Type is RegexNode.Loop or RegexNode.Lazyloop, $"Unexpected type: {node.Type}"); Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}"); Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}"); + Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + int minIterations = node.M; int maxIterations = node.N; bool isAtomic = node.IsAtomicByParent(); @@ -3198,14 +3277,14 @@ void EmitLoop(RegexNode node) // We need to store the starting pos and crawl position so that it may // be backtracked through later. This needs to be the starting position from // the iteration we're leaving, so it's pushed before updating it to pos. - EmitRunstackResizeIfNeeded(3); + EmitStackResizeIfNeeded(3); if (expressionHasCaptures) { // base.runstack[stackpos++] = base.Crawlpos(); - EmitRunstackPush(() => { Ldthis(); Call(s_crawlposMethod); }); + EmitStackPush(() => { Ldthis(); Call(s_crawlposMethod); }); } - EmitRunstackPush(() => Ldloc(startingPos)); - EmitRunstackPush(() => Ldloc(pos)); + EmitStackPush(() => Ldloc(startingPos)); + EmitStackPush(() => Ldloc(pos)); // Save off some state. We need to store the current pos so we can compare it against // pos after the iteration, in order to determine whether the iteration was empty. 
Empty @@ -3310,16 +3389,16 @@ void EmitLoop(RegexNode node) // pos = base.runstack[--stackpos]; // startingPos = base.runstack[--stackpos]; - EmitRunstackPop(); + EmitStackPop(); Stloc(pos); - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); if (expressionHasCaptures) { // int poppedCrawlPos = base.runstack[--stackpos]; // while (base.Crawlpos() > poppedCrawlPos) base.Uncapture(); using RentedLocalBuilder poppedCrawlPos = RentInt32Local(); - EmitRunstackPop(); + EmitStackPop(); Stloc(poppedCrawlPos); EmitUncaptureUntil(poppedCrawlPos); } @@ -3370,9 +3449,9 @@ void EmitLoop(RegexNode node) if (node.IsInLoop()) { // Store the capture's state - EmitRunstackResizeIfNeeded(3); - EmitRunstackPush(() => Ldloc(startingPos)); - EmitRunstackPush(() => Ldloc(iterationCount)); + EmitStackResizeIfNeeded(3); + EmitStackPush(() => Ldloc(startingPos)); + EmitStackPush(() => Ldloc(iterationCount)); // Skip past the backtracking section // goto backtrackingEnd; @@ -3385,9 +3464,9 @@ void EmitLoop(RegexNode node) // iterationCount = base.runstack[--runstack]; // startingPos = base.runstack[--runstack]; - EmitRunstackPop(); + EmitStackPop(); Stloc(iterationCount); - EmitRunstackPop(); + EmitStackPop(); Stloc(startingPos); // goto doneLabel; @@ -3399,7 +3478,7 @@ void EmitLoop(RegexNode node) } } - void EmitRunstackResizeIfNeeded(int count) + void EmitStackResizeIfNeeded(int count) { Debug.Assert(count >= 1); @@ -3431,7 +3510,7 @@ void EmitRunstackResizeIfNeeded(int count) MarkLabel(skipResize); } - void EmitRunstackPush(Action load) + void EmitStackPush(Action load) { // base.runstack[stackpos] = load(); Ldthisfld(s_runstackField); @@ -3446,7 +3525,7 @@ void EmitRunstackPush(Action load) Stloc(stackpos); } - void EmitRunstackPop() + void EmitStackPop() { // ... 
= base.runstack[--stackpos]; Ldthisfld(s_runstackField); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index fe04caca5e7bbe..ea696c6680a2a7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -308,11 +308,11 @@ private void ValidateFinalTreeInvariants() break; case Testref: - Debug.Assert(childCount is 1 or 2, $"Expected one or two children for {node.TypeName}, got {childCount}"); + Debug.Assert(childCount == 2, $"Expected two children for {node.TypeName}, got {childCount}"); break; case Testgroup: - Debug.Assert(childCount is 2 or 3, $"Expected two or three children for {node.TypeName}, got {childCount}"); + Debug.Assert(childCount == 3, $"Expected three children for {node.TypeName}, got {childCount}"); break; case Concatenate: @@ -562,39 +562,20 @@ public bool IsAtomicByParent() /// /// Removes redundant nodes from the subtree, and returns an optimized subtree. 
/// - internal RegexNode Reduce() - { - switch (Type) + internal RegexNode Reduce() => + Type switch { - case Alternate: - return ReduceAlternation(); - - case Concatenate: - return ReduceConcatenation(); - - case Loop: - case Lazyloop: - return ReduceLoops(); - - case Atomic: - return ReduceAtomic(); - - case Group: - return ReduceGroup(); - - case Set: - case Setloop: - case Setloopatomic: - case Setlazy: - return ReduceSet(); - - case Prevent: - return ReducePrevent(); - - default: - return this; - } - } + Alternate => ReduceAlternation(), + Atomic => ReduceAtomic(), + Concatenate => ReduceConcatenation(), + Group => ReduceGroup(), + Loop or Lazyloop => ReduceLoops(), + Prevent => ReducePrevent(), + Set or Setloop or Setloopatomic or Setlazy => ReduceSet(), + Testgroup => ReduceTestgroup(), + Testref => ReduceTestref(), + _ => this, + }; /// Remove an unnecessary Concatenation or Alternation node /// @@ -1819,6 +1800,53 @@ private RegexNode ReducePrevent() return this; } + /// Optimizations for backreference conditionals. + private RegexNode ReduceTestref() + { + Debug.Assert(Type == Testref); + Debug.Assert(ChildCount() is 1 or 2); + + // This isn't so much an optimization as it is changing the tree for consistency. + // We want all engines to be able to trust that every Testref will have two children, + // even though it's optional in the syntax. If it's missing a "not matched" branch, + // we add one that will match empty. + if (ChildCount() == 1) + { + AddChild(new RegexNode(Empty, Options)); + } + + return this; + } + + /// Optimizations for expression conditionals. + private RegexNode ReduceTestgroup() + { + Debug.Assert(Type == Testgroup); + Debug.Assert(ChildCount() is 2 or 3); + + // This isn't so much an optimization as it is changing the tree for consistency. + // We want all engines to be able to trust that every Testgroup will have three children, + // even though it's optional in the syntax. 
If it's missing a "not matched" branch, + // we add one that will match empty. + if (ChildCount() == 2) + { + AddChild(new RegexNode(Empty, Options)); + } + + // It's common for the condition to be an explicit positive lookahead, as specifying + // that eliminates any ambiguity in syntax as to whether the expression is to be matched + // as an expression or to be a reference to a capture group. After parsing, however, + // there's no ambiguity, and we can remove an extra level of positive lookahead, as the + // engines need to treat the condition as a zero-width positive, atomic assertion regardless. + RegexNode condition = Child(0); + if (condition.Type == Require && (condition.Options & RegexOptions.RightToLeft) == 0) + { + ReplaceChild(0, condition.Child(0)); + } + + return this; + } + /// /// Determines whether node can be switched to an atomic loop. Subsequent is the node /// immediately after 'node'. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 5ef7281b5884da..d3caec254a94fb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -288,13 +288,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex) Emit(RegexCode.Goto, 0); PatchJump(Branchpos, _emitted.Length); Emit(RegexCode.Forejump); - if (node.ChildCount() > 1) - { - break; - } - - // else fallthrough - goto case 1; + break; } case 1: PatchJump(_intStack.Pop(), _emitted.Length); @@ -328,11 +322,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex) PatchJump(Branchpos, _emitted.Length); Emit(RegexCode.Getmark); Emit(RegexCode.Forejump); - - if (node.ChildCount() > 2) - break; - // else fallthrough - goto case 2; + break; case 2: 
PatchJump(_intStack.Pop(), _emitted.Length); break; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs index ab6e819b6a60ab..6c299578983d7c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs @@ -484,7 +484,7 @@ SymbolicRegexNode ConvertSetloop(RegexNode node, bool isLazy) bool IsDotStar(RegexNode node) => node.Type == RegexNode.Setloop && Convert(node, topLevel: false).IsAnyStar; - bool IsIntersect(RegexNode node) => node.Type == RegexNode.Testgroup && node.ChildCount() > 2 && IsNothing(node.Child(2)); + bool IsIntersect(RegexNode node) => node.Type == RegexNode.Testgroup && IsNothing(node.Child(2)); bool TryGetIntersection(RegexNode node, [Diagnostics.CodeAnalysis.NotNullWhen(true)] out List? 
conjuncts) { diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs index 723bb034acc6cd..d2a07d60dd04e3 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs @@ -833,7 +833,7 @@ public static IEnumerable Groups_Basic_TestData() yield return new object[] { engine, null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } }; } - // Grouping Constructs Invalid Regular Expressions + // Grouping Constructs yield return new object[] { engine, null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; yield return new object[] { engine, null, @"(?)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; yield return new object[] { engine, null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; @@ -844,13 +844,15 @@ public static IEnumerable Groups_Basic_TestData() yield return new object[] { engine, null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } }; yield return new object[] { engine, null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } }; - // Alternation construct Invalid Regular Expressions + // Alternation construct yield return new object[] { engine, null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } }; yield return new object[] { engine, null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } }; yield return new object[] { engine, null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } }; yield return new object[] { engine, null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } }; + yield return new object[] { engine, null, @"(?(cat)cat\w\w\w)*", "catdogcathog", RegexOptions.None, new string[] { "catdogcathog" } }; + yield return new object[] { engine, 
null, @"(?(?=cat)cat\w\w\w)*", "catdogcathog", RegexOptions.None, new string[] { "catdogcathog" } }; yield return new object[] { engine, null, @"(?(cat)catdog|)", "dog", RegexOptions.None, new string[] { "" } }; yield return new object[] { engine, null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } }; yield return new object[] { engine, null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } }; @@ -859,6 +861,9 @@ public static IEnumerable Groups_Basic_TestData() yield return new object[] { engine, null, @"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } }; yield return new object[] { engine, null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } }; + yield return new object[] { engine, null, @"(?((\w{3}))\1\1|no)", "dogdogdog", RegexOptions.None, new string[] { "dogdog", "dog" } }; + yield return new object[] { engine, null, @"(?((\w{3}))\1\1|no)", "no", RegexOptions.None, new string[] { "no", "" } }; + // Invalid unicode yield return new object[] { engine, null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } }; yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } }; diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index fc89bb68e9bfaf..97f98cd991132d 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -357,12 +357,17 @@ public static IEnumerable Match_MemberData() yield return ("(?(dog2))", "dog2", RegexOptions.None, 0, 4, true, string.Empty); yield return ("(?(a:b))", "a", RegexOptions.None, 0, 1, true, 
string.Empty); yield return ("(?(a:))", "a", RegexOptions.None, 0, 1, true, string.Empty); + yield return ("(?(cat)cat|dog)", "cat", RegexOptions.None, 0, 3, true, "cat"); + yield return ("(?((?=cat))cat|dog)", "cat", RegexOptions.None, 0, 3, true, "cat"); yield return ("(?(cat)|dog)", "cat", RegexOptions.None, 0, 3, true, string.Empty); yield return ("(?(cat)|dog)", "catdog", RegexOptions.None, 0, 6, true, string.Empty); yield return ("(?(cat)|dog)", "oof", RegexOptions.None, 0, 3, false, string.Empty); yield return ("(?(cat)dog1|dog2)", "catdog1", RegexOptions.None, 0, 7, false, string.Empty); yield return ("(?(cat)dog1|dog2)", "catdog2", RegexOptions.None, 0, 7, true, "dog2"); yield return ("(?(cat)dog1|dog2)", "catdog1dog2", RegexOptions.None, 0, 11, true, "dog2"); + yield return (@"(?(\w+)\w+)dog", "catdog", RegexOptions.None, 0, 6, true, "catdog"); + yield return (@"(?(abc)\w+|\w{0,2})dog", "catdog", RegexOptions.None, 0, 6, true, "atdog"); + yield return (@"(?(abc)cat|\w{0,2})dog", "catdog", RegexOptions.None, 0, 6, true, "atdog"); yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa"); yield return ("(a|ab|abc|abcd)d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd"); yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.None, 0, 4, false, string.Empty); diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs index 4c009b46bf73dd..34f60c672e1101 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs @@ -295,6 +295,32 @@ public static IEnumerable Matches_TestData() } }; + yield return new object[] + { + engine, + @"(?(\w+)\w+|)", "abcd", RegexOptions.None, + new CaptureData[] + { + new CaptureData("abcd", 0, 4), + new CaptureData("", 4, 0), + } + }; + + if 
(!PlatformDetection.IsNetFramework) + { + // .NET Framework has some behavioral inconsistencies when there's no else branch. + yield return new object[] + { + engine, + @"(?(\w+)\w+)", "abcd", RegexOptions.None, + new CaptureData[] + { + new CaptureData("abcd", 0, 4), + new CaptureData("", 4, 0), + } + }; + } + yield return new object[] { engine, diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index d541964bfb6f05..c3cd830fbb7b29 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -371,6 +371,11 @@ private static int GetMinRequiredLength(Regex r) [InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")] [InlineData("bonjour|hej|ciao|shalom|zdravo|pozdrav|hallo|hola|hello|hey|witam|tere|bonjou|salam|helo|sawubona", "(?>bonjou(?>r|)|h(?>e(?>j|(?>l(?>lo|o)|y))|allo|ola)|ciao|s(?>halom|a(?>lam|wubona))|zdravo|pozdrav|witam|tere)")] [InlineData("\\w\\d123|\\w\\dabc", "\\w\\d(?:123|abc)")] + [InlineData("(a)(?(1)b)", "(a)(?(1)b|)")] + [InlineData("(abc)(?(1)def)", "(abc)(?(1)def|)")] + [InlineData("(?(a)a)", "(?(a)a|)")] + [InlineData("(?(abc)def)", "(?(abc)def|)")] + [InlineData("(?(\\w)\\d)", "(?(\\w)\\d|)")] // Auto-atomicity [InlineData("a*b", "(?>a*)b")] [InlineData("a*b+", "(?>a*)b+")]