diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index aa8e9ce953cdf1..57ab539a01337f 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -825,6 +825,7 @@ void TransferSliceStaticPosToPos()
void EmitAlternation(RegexNode node)
{
Debug.Assert(node.Type is RegexNode.Alternate, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}");
int childCount = node.ChildCount();
Debug.Assert(childCount >= 2);
@@ -1203,6 +1204,7 @@ void EmitWhenHasCapture()
void EmitBackreferenceConditional(RegexNode node)
{
Debug.Assert(node.Type is RegexNode.Testref, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 2, $"Expected 2 children, found {node.ChildCount()}");
// We're branching in a complicated fashion. Make sure sliceStaticPos is 0.
TransferSliceStaticPosToPos();
@@ -1210,9 +1212,10 @@ void EmitBackreferenceConditional(RegexNode node)
// Get the capture number to test.
int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps);
- // Get the "yes" branch and the optional "no" branch, if it exists.
+ // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus
+ // somewhat likely to be Empty.
RegexNode yesBranch = node.Child(0);
- RegexNode? noBranch = node.ChildCount() > 1 && node.Child(1) is { Type: not RegexNode.Empty } childNo ? childNo : null;
+ RegexNode? noBranch = node.Child(1) is { Type: not RegexNode.Empty } childNo ? childNo : null;
string originalDoneLabel = doneLabel;
// If the child branches might backtrack, we can't emit the branches inside constructs that
@@ -1239,10 +1242,13 @@ void EmitBackreferenceConditional(RegexNode node)
}
}
- doneLabel = originalDoneLabel;
+ doneLabel = originalDoneLabel; // atomicity
return;
}
+ string refNotMatched = ReserveName("ConditionalBackreferenceNotMatched");
+ string endConditional = ReserveName("ConditionalBackreferenceEnd");
+
// As with alternations, we have potentially multiple branches, each of which may contain
// backtracking constructs, but the expression after the conditional needs a single target
// to backtrack to. So, we expose a single Backtrack label and track which branch was
@@ -1255,7 +1261,6 @@ void EmitBackreferenceConditional(RegexNode node)
// inside the scope block for the if or else, that will prevent jumping to them from
// elsewhere. So we implement the if/else with labels and gotos manually.
// Check to see if the specified capture number was captured.
- string refNotMatched = ReserveName("ConditionalBackreferenceNotMatched");
using (EmitBlock(writer, $"if (!base.IsMatched({capnum}))"))
{
writer.WriteLine($"goto {refNotMatched};");
@@ -1272,10 +1277,11 @@ void EmitBackreferenceConditional(RegexNode node)
{
writer.WriteLine($"{resumeAt} = 0;");
}
- string endRef = ReserveName("ConditionalBackreferenceEnd");
- if (postYesDoneLabel != originalDoneLabel || noBranch is not null)
+
+ bool needsEndConditional = postYesDoneLabel != originalDoneLabel || noBranch is not null;
+ if (needsEndConditional)
{
- writer.WriteLine($"goto {endRef};");
+ writer.WriteLine($"goto {endConditional};");
writer.WriteLine();
}
@@ -1283,7 +1289,6 @@ void EmitBackreferenceConditional(RegexNode node)
string postNoDoneLabel = originalDoneLabel;
if (noBranch is not null)
{
- // The earlier base.IsMatched returning false will jump to here.
// Output the no branch.
doneLabel = originalDoneLabel;
EmitNode(noBranch);
@@ -1308,16 +1313,19 @@ void EmitBackreferenceConditional(RegexNode node)
// If either the yes branch or the no branch contained backtracking, subsequent expressions
// might try to backtrack to here, so output a backtracking map based on resumeAt.
- if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel)
+ bool hasBacktracking = postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel;
+ if (hasBacktracking)
{
// Skip the backtracking section.
- writer.WriteLine($"goto {endRef};");
+ writer.WriteLine($"goto {endConditional};");
writer.WriteLine();
+ // Backtrack section
string backtrack = ReserveName("ConditionalBackreferenceBacktrack");
doneLabel = backtrack;
MarkLabel(backtrack);
+ // Pop from the stack the branch that was used and jump back to its backtracking location.
EmitStackPop(resumeAt);
using (EmitBlock(writer, $"switch ({resumeAt})"))
{
@@ -1335,13 +1343,17 @@ void EmitBackreferenceConditional(RegexNode node)
}
}
- if (postYesDoneLabel != originalDoneLabel || noBranch is not null)
+ if (needsEndConditional)
{
- MarkLabel(endRef);
- if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel)
- {
- EmitStackPush(resumeAt);
- }
+ MarkLabel(endConditional);
+ }
+
+ if (hasBacktracking)
+ {
+ // We're not atomic and at least one of the yes or no branches contained backtracking constructs,
+ // so finish outputting our backtracking logic, which involves pushing onto the stack which
+ // branch to backtrack into.
+ EmitStackPush(resumeAt);
}
}
@@ -1349,87 +1361,105 @@ void EmitBackreferenceConditional(RegexNode node)
void EmitExpressionConditional(RegexNode node)
{
Debug.Assert(node.Type is RegexNode.Testgroup, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 3, $"Expected 3 children, found {node.ChildCount()}");
bool isAtomic = node.IsAtomicByParent();
// We're branching in a complicated fashion. Make sure sliceStaticPos is 0.
TransferSliceStaticPosToPos();
- // The first child node is the conditional expression. If this matches, then we branch to the "yes" branch.
+ // The first child node is the condition expression. If this matches, then we branch to the "yes" branch.
// If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes"
- // branch, otherwise. The conditional is treated as a positive lookahead if it isn't already one.
- RegexNode conditional = node.Child(0);
- if (conditional is { Type: RegexNode.Require })
- {
- conditional = conditional.Child(0);
- }
+ // branch, otherwise. The condition is treated as a positive lookahead.
+ RegexNode condition = node.Child(0);
- // Get the "yes" branch and the optional "no" branch, if it exists.
+ // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus
+ // somewhat likely to be Empty.
RegexNode yesBranch = node.Child(1);
- RegexNode? noBranch = node.ChildCount() > 2 && node.Child(2) is { Type: not RegexNode.Empty } childNo ? childNo : null;
+ RegexNode? noBranch = node.Child(2) is { Type: not RegexNode.Empty } childNo ? childNo : null;
+ string originalDoneLabel = doneLabel;
- string end = ReserveName("end");
- string? no = noBranch is not null ? ReserveName("ConditionalExpressionNoBranch") : null;
+ string expressionNotMatched = ReserveName("ConditionalExpressionNotMatched");
+ string endConditional = ReserveName("ConditionalExpressionEnd");
- // If the conditional expression has captures, we'll need to uncapture them in the case of no match.
- string? startingCapturePos = null;
- if ((conditional.Options & RegexNode.HasCapturesFlag) != 0)
- {
- startingCapturePos = ReserveName("conditionalexpression_starting_capturepos");
- writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
- }
-
- string resumeAt = ReserveName("conditionalexpression_resumeAt");
+ // As with alternations, we have potentially multiple branches, each of which may contain
+ // backtracking constructs, but the expression after the condition needs a single target
+ // to backtrack to. So, we expose a single Backtrack label and track which branch was
+ // followed in this resumeAt local.
+ string resumeAt = ReserveName("conditionalexpression_branch");
if (!isAtomic)
{
writer.WriteLine($"int {resumeAt} = 0;");
}
- // Emit the conditional expression. We need to reroute any match failures to either the "no" branch
- // if it exists, or to the end of the node (skipping the "yes" branch) if it doesn't.
- string originalDoneLabel = doneLabel;
- string tmpDoneLabel = no ?? end;
- doneLabel = tmpDoneLabel;
- EmitPositiveLookaheadAssertionChild(conditional);
- if (doneLabel == tmpDoneLabel)
+ // If the condition expression has captures, we'll need to uncapture them in the case of no match.
+ string? startingCapturePos = null;
+ if ((condition.Options & RegexNode.HasCapturesFlag) != 0)
{
- doneLabel = originalDoneLabel;
+ startingCapturePos = ReserveName("conditionalexpression_starting_capturepos");
+ writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
}
- string postConditionalDoneLabel = doneLabel;
- // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch.
- // Since the "yes" branch may have a different execution path than the "no" branch or the lack of
- // any branch, we need to store the current sliceStaticPos and reset it prior to emitting the code
- // for what comes after the "yes" branch, so that everyone is on equal footing.
+ // Emit the condition expression. Route any failures to after the yes branch. This code is almost
+ // the same as for a positive lookahead; however, a positive lookahead only needs to reset the position
+ // on a successful match, as a failed match fails the whole expression; here, we need to reset the
+ // position on completion, regardless of whether the match is successful or not.
+ doneLabel = expressionNotMatched;
+
+ // Save off pos. We'll need to reset this once the condition completes, whether or not it matched.
+ string startingPos = ReserveName("conditionalexpression_starting_pos");
+ writer.WriteLine($"int {startingPos} = pos;");
+ writer.WriteLine();
int startingSliceStaticPos = sliceStaticPos;
+
+ // Emit the child. The condition expression is a zero-width assertion, which is atomic,
+ // so prevent backtracking into it.
+ writer.WriteLine("// Condition:");
+ EmitNode(condition);
+ writer.WriteLine();
+ doneLabel = originalDoneLabel;
+
+ // After the condition completes successfully, reset the text positions.
+ // Do not reset captures, which persist beyond the lookahead.
+ writer.WriteLine("// Condition matched:");
+ writer.WriteLine($"pos = {startingPos};");
+ SliceInputSpan(writer);
+ sliceStaticPos = startingSliceStaticPos;
+ writer.WriteLine();
+
+ // The expression matched. Run the "yes" branch. If it successfully matches, jump to the end.
EmitNode(yesBranch);
writer.WriteLine();
- TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
+ TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch
string postYesDoneLabel = doneLabel;
- if (postYesDoneLabel != originalDoneLabel || noBranch is not null)
+ if (!isAtomic && postYesDoneLabel != originalDoneLabel)
{
- writer.WriteLine($"goto {end};");
+ writer.WriteLine($"{resumeAt} = 0;");
}
+ writer.WriteLine($"goto {endConditional};");
+ writer.WriteLine();
+
+ // After the condition completes unsuccessfully, reset the text positions
+ // _and_ reset captures, which should not persist when the whole expression failed.
+ writer.WriteLine("// Condition did not match:");
+ MarkLabel(expressionNotMatched, emitSemicolon: false);
+ writer.WriteLine($"pos = {startingPos};");
+ SliceInputSpan(writer);
+ sliceStaticPos = startingSliceStaticPos;
+ if (startingCapturePos is not null)
+ {
+ EmitUncaptureUntil(startingCapturePos);
+ }
+ writer.WriteLine();
- // If there's a no branch, we need to emit it, but skipping it from a successful "yes" branch match.
string postNoDoneLabel = originalDoneLabel;
if (noBranch is not null)
{
- writer.WriteLine();
-
- // Emit the no branch, first uncapturing any captures from the expression condition that failed
- // to match and emit the branch.
- MarkLabel(no, emitSemicolon: startingCapturePos is null);
- if (startingCapturePos is not null)
- {
- EmitUncaptureUntil(startingCapturePos);
- }
-
- doneLabel = postConditionalDoneLabel;
- sliceStaticPos = startingSliceStaticPos;
+ // Output the no branch.
+ doneLabel = originalDoneLabel;
EmitNode(noBranch);
writer.WriteLine();
- TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
+ TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch
postNoDoneLabel = doneLabel;
if (!isAtomic && postNoDoneLabel != originalDoneLabel)
{
@@ -1447,51 +1477,49 @@ void EmitExpressionConditional(RegexNode node)
}
}
- if (isAtomic)
+ // If either the yes branch or the no branch contained backtracking, subsequent expressions
+ // might try to backtrack to here, so output a backtracking map based on resumeAt.
+ if (isAtomic || (postYesDoneLabel == originalDoneLabel && postNoDoneLabel == originalDoneLabel))
{
doneLabel = originalDoneLabel;
+ MarkLabel(endConditional);
}
else
{
- if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel)
- {
- // Skip the backtracking section.
- writer.WriteLine($"goto {end};");
- writer.WriteLine();
+ // Skip the backtracking section.
+ writer.WriteLine($"goto {endConditional};");
+ writer.WriteLine();
- string backtrack = ReserveName("ConditionalExpressionBacktrack");
- doneLabel = backtrack;
- MarkLabel(backtrack);
+ string backtrack = ReserveName("ConditionalExpressionBacktrack");
+ doneLabel = backtrack;
+ MarkLabel(backtrack, emitSemicolon: false);
- using (EmitBlock(writer, $"switch ({StackPop()})"))
+ EmitStackPop(resumeAt);
+ using (EmitBlock(writer, $"switch ({resumeAt})"))
+ {
+ if (postYesDoneLabel != originalDoneLabel)
{
- if (postYesDoneLabel != postConditionalDoneLabel)
- {
- writer.WriteLine($"case 0: goto {postYesDoneLabel};");
- }
-
- if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel)
- {
- writer.WriteLine($"case 1: goto {postNoDoneLabel};");
- }
+ writer.WriteLine($"case 0: goto {postYesDoneLabel};");
+ }
- writer.WriteLine($"default: goto {postConditionalDoneLabel};");
+ if (postNoDoneLabel != originalDoneLabel)
+ {
+ writer.WriteLine($"case 1: goto {postNoDoneLabel};");
}
- }
- if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel)
- {
- EmitStackPush(resumeAt);
+ writer.WriteLine($"default: goto {originalDoneLabel};");
}
- }
- MarkLabel(end);
+ MarkLabel(endConditional, emitSemicolon: false);
+ EmitStackPush(resumeAt);
+ }
}
// Emits the code for a Capture node.
void EmitCapture(RegexNode node, RegexNode? subsequent = null)
{
Debug.Assert(node.Type is RegexNode.Capture, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps);
int uncapnum = RegexParser.MapCaptureNumber(node.N, rm.Code.Caps);
@@ -1542,7 +1570,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
// Emit a backtracking section that restores the capture's state and then jumps to the previous done label
string backtrack = ReserveName($"CaptureBacktrack");
- MarkLabel(backtrack);
+ MarkLabel(backtrack, emitSemicolon: false);
EmitStackPop(startingPos);
if (!childBacktracks)
{
@@ -1565,12 +1593,8 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
void EmitPositiveLookaheadAssertion(RegexNode node)
{
Debug.Assert(node.Type is RegexNode.Require, $"Unexpected type: {node.Type}");
- EmitPositiveLookaheadAssertionChild(node.Child(0));
- }
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
- // Emits the code to handle a node as if it's wrapped in a positive lookahead assertion.
- void EmitPositiveLookaheadAssertionChild(RegexNode child)
- {
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
string originalDoneLabel = doneLabel;
@@ -1581,7 +1605,7 @@ void EmitPositiveLookaheadAssertionChild(RegexNode child)
int startingSliceStaticPos = sliceStaticPos;
// Emit the child.
- EmitNode(child);
+ EmitNode(node.Child(0));
// After the child completes successfully, reset the text positions.
// Do not reset captures, which persist beyond the lookahead.
@@ -1597,6 +1621,7 @@ void EmitPositiveLookaheadAssertionChild(RegexNode child)
void EmitNegativeLookaheadAssertion(RegexNode node)
{
Debug.Assert(node.Type is RegexNode.Prevent, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
string originalDoneLabel = doneLabel;
@@ -1776,6 +1801,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
void EmitAtomic(RegexNode node, RegexNode? subsequent)
{
Debug.Assert(node.Type is RegexNode.Atomic, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
// Atomic simply outputs the code for the child, but it ensures that any done label left
// set by the child is reset to what it was prior to the node's processing. That way,
@@ -1800,6 +1826,7 @@ void EmitUpdateBumpalong(RegexNode node)
void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired)
{
Debug.Assert(node.Type is RegexNode.Concatenate, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}");
// Emit the code for each child one after the other.
string? prevDescription = null;
@@ -2217,7 +2244,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
SliceInputSpan(writer);
writer.WriteLine();
- MarkLabel(endLoop);
+ MarkLabel(endLoop, emitSemicolon: false);
EmitStackPush(expressionHasCaptures ?
new[] { startingPos, endingPos, "base.Crawlpos()" } :
new[] { startingPos, endingPos });
@@ -2348,7 +2375,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
// Emit a backtracking section that restores the capture's state and then jumps to the previous done label
string backtrack = ReserveName("CharLazyBacktrack");
- MarkLabel(backtrack);
+ MarkLabel(backtrack, emitSemicolon: false);
Array.Reverse(toPushPopArray);
EmitStackPop(toPushPopArray);
@@ -2366,6 +2393,8 @@ void EmitLazy(RegexNode node)
Debug.Assert(node.Type is RegexNode.Lazyloop, $"Unexpected type: {node.Type}");
Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}");
Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
int minIterations = node.M;
int maxIterations = node.N;
string originalDoneLabel = doneLabel;
@@ -2515,7 +2544,7 @@ void EmitLazy(RegexNode node)
// Emit a backtracking section that restores the capture's state and then jumps to the previous done label
string backtrack = ReserveName($"LazyLoopBacktrack");
- MarkLabel(backtrack);
+ MarkLabel(backtrack, emitSemicolon: false);
EmitStackPop(sawEmpty, iterationCount, startingPos);
@@ -2782,6 +2811,8 @@ void EmitLoop(RegexNode node)
Debug.Assert(node.Type is RegexNode.Loop or RegexNode.Lazyloop, $"Unexpected type: {node.Type}");
Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}");
Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
int minIterations = node.M;
int maxIterations = node.N;
bool isAtomic = node.IsAtomicByParent();
@@ -2901,7 +2932,7 @@ void EmitLoop(RegexNode node)
writer.WriteLine();
string backtrack = ReserveName("LoopBacktrack");
- MarkLabel(backtrack);
+ MarkLabel(backtrack, emitSemicolon: false);
using (EmitBlock(writer, $"if ({iterationCount} == 0)"))
{
writer.WriteLine($"goto {originalDoneLabel};");
@@ -2926,7 +2957,7 @@ void EmitLoop(RegexNode node)
// Emit a backtracking section that restores the capture's state and then jumps to the previous done label
string backtrack = ReserveName("LoopBacktrack");
- MarkLabel(backtrack);
+ MarkLabel(backtrack, emitSemicolon: false);
EmitStackPop(iterationCount, startingPos);
writer.WriteLine($"goto {doneLabel};");
@@ -3395,8 +3426,8 @@ private static string DescribeNode(RegexNode node) =>
RegexNode.Set => $"Match a character in the set {RegexCharClass.SetDescription(node.Str!)}.",
RegexNode.Setloop or RegexNode.Setloopatomic or RegexNode.Setlazy => $"Match a character in the set {RegexCharClass.SetDescription(node.Str!)} {DescribeLoop(node)}.",
RegexNode.Start => "Match if at the start position.",
- RegexNode.Testgroup => $"Conditionally match {(node.ChildCount() == 2 ? "an expression" : "one of two expressions")} depending on whether an initial expression matches.",
- RegexNode.Testref => $"Conditionally match {(node.ChildCount() == 1 ? "an expression" : "one of two expressions")} depending on whether the {DescribeNonNegative(node.M)} capture group matched.",
+ RegexNode.Testgroup => $"Conditionally match one of two expressions depending on whether an initial expression matches.",
+ RegexNode.Testref => $"Conditionally match one of two expressions depending on whether the {DescribeNonNegative(node.M)} capture group matched.",
RegexNode.UpdateBumpalong => $"Advance the next matching position.",
_ => $"Unknown node type {node.Type}",
};
@@ -3423,20 +3454,29 @@ RegexNode.Atomic when node.Child(0).Type is RegexNode.Loop or RegexNode.Lazyloop
if (!skip)
{
+ string tag = node.Next?.Type switch
+ {
+ RegexNode.Testgroup when node.Next.Child(0) == node => "Condition: ",
+ RegexNode.Testgroup when node.Next.Child(1) == node => "Matched: ",
+ RegexNode.Testgroup when node.Next.Child(2) == node => "Not Matched: ",
+
+ RegexNode.Testref when node.Next.Child(0) == node => "Matched: ",
+ RegexNode.Testref when node.Next.Child(1) == node => "Not Matched: ",
+
+ _ => "",
+ };
+
// Write out the line for the node.
const char BulletPoint = '\u25CB';
- writer.WriteLine($"{prefix}{new string(' ', depth * 4)}{BulletPoint} {DescribeNode(node)}");
+ writer.WriteLine($"{prefix}{new string(' ', depth * 4)}{BulletPoint} {tag}{DescribeNode(node)}");
}
// Recur into each of its children.
int childCount = node.ChildCount();
- if (childCount > 0)
+ for (int i = 0; i < childCount; i++)
{
- for (int i = 0; i < childCount; i++)
- {
- int childDepth = skip ? depth : depth + 1;
- DescribeExpression(writer, node.Child(i), prefix, childDepth);
- }
+ int childDepth = skip ? depth : depth + 1;
+ DescribeExpression(writer, node.Child(i), prefix, childDepth);
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 5f6fb626e8de0c..29875367928e34 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -1075,6 +1075,9 @@ void TransferSliceStaticPosToPos()
// Emits the code for an alternation.
void EmitAlternation(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Alternate, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}");
+
int childCount = node.ChildCount();
Debug.Assert(childCount >= 2);
@@ -1169,13 +1172,13 @@ void EmitAlternation(RegexNode node)
// base.runstack[stackpos++] = i;
// base.runstack[stackpos++] = startingCapturePos;
// base.runstack[stackpos++] = startingPos;
- EmitRunstackResizeIfNeeded(3);
- EmitRunstackPush(() => Ldc(i));
+ EmitStackResizeIfNeeded(3);
+ EmitStackPush(() => Ldc(i));
if (startingCapturePos is not null)
{
- EmitRunstackPush(() => Ldloc(startingCapturePos));
+ EmitStackPush(() => Ldloc(startingCapturePos));
}
- EmitRunstackPush(() => Ldloc(startingPos));
+ EmitStackPush(() => Ldloc(startingPos));
}
labelMap[i] = doneLabel;
@@ -1230,14 +1233,14 @@ void EmitAlternation(RegexNode node)
// startingPos = base.runstack[--stackpos];
// startingCapturePos = base.runstack[--stackpos];
// switch (base.runstack[--stackpos]) { ... } // branch number
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
if (startingCapturePos is not null)
{
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingCapturePos);
}
- EmitRunstackPop();
+ EmitStackPop();
Switch(labelMap);
}
@@ -1249,6 +1252,8 @@ void EmitAlternation(RegexNode node)
// Emits the code to handle a backreference.
void EmitBackreference(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Ref, $"Unexpected type: {node.Type}");
+
int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps);
TransferSliceStaticPosToPos();
@@ -1339,6 +1344,9 @@ void EmitBackreference(RegexNode node)
// Emits the code for an if(backreference)-then-else conditional.
void EmitBackreferenceConditional(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Testref, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 2, $"Expected 2 children, found {node.ChildCount()}");
+
bool isAtomic = node.IsAtomicByParent();
// We're branching in a complicated fashion. Make sure sliceStaticPos is 0.
@@ -1347,9 +1355,14 @@ void EmitBackreferenceConditional(RegexNode node)
// Get the capture number to test.
int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps);
+ // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus
+ // somewhat likely to be Empty.
+ RegexNode yesBranch = node.Child(0);
+ RegexNode? noBranch = node.Child(1) is { Type: not RegexNode.Empty } childNo ? childNo : null;
Label originalDoneLabel = doneLabel;
- Label backreferenceConditionalEnd = DefineLabel();
- bool hasNo = node.ChildCount() > 1 && node.Child(1).Type != RegexNode.Empty;
+
+ Label refNotMatched = DefineLabel();
+ Label endConditional = DefineLabel();
// As with alternations, we have potentially multiple branches, each of which may contain
// backtracking constructs, but the expression after the conditional needs a single target
@@ -1358,7 +1371,6 @@ void EmitBackreferenceConditional(RegexNode node)
LocalBuilder resumeAt = DeclareInt32();
// if (!base.IsMatched(capnum)) goto refNotMatched;
- Label refNotMatched = DefineLabel();
Ldthis();
Ldc(capnum);
Call(s_isMatchedMethod);
@@ -1366,32 +1378,33 @@ void EmitBackreferenceConditional(RegexNode node)
// The specified capture was captured. Run the "yes" branch.
// If it successfully matches, jump to the end.
- EmitNode(node.Child(0));
+ EmitNode(yesBranch);
TransferSliceStaticPosToPos();
- Label postIfDoneLabel = doneLabel;
- if (postIfDoneLabel != originalDoneLabel)
+ Label postYesDoneLabel = doneLabel;
+ if (!isAtomic && postYesDoneLabel != originalDoneLabel)
{
// resumeAt = 0;
Ldc(0);
Stloc(resumeAt);
}
- if (postIfDoneLabel != originalDoneLabel || hasNo)
+
+ bool needsEndConditional = postYesDoneLabel != originalDoneLabel || noBranch is not null;
+ if (needsEndConditional)
{
- // goto endRef;
- BrFar(backreferenceConditionalEnd);
+ // goto endConditional;
+ BrFar(endConditional);
}
MarkLabel(refNotMatched);
- Label postElseDoneLabel = originalDoneLabel;
- if (hasNo)
+ Label postNoDoneLabel = originalDoneLabel;
+ if (noBranch is not null)
{
- // The earlier base.IsMatched returning false will jump to here.
// Output the no branch.
doneLabel = originalDoneLabel;
- EmitNode(node.Child(1));
+ EmitNode(noBranch);
TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch
- postElseDoneLabel = doneLabel;
- if (postElseDoneLabel != originalDoneLabel)
+ postNoDoneLabel = doneLabel;
+ if (!isAtomic && postNoDoneLabel != originalDoneLabel)
{
// resumeAt = 1;
Ldc(1);
@@ -1403,7 +1416,7 @@ void EmitBackreferenceConditional(RegexNode node)
// There's only a yes branch. If it's going to cause us to output a backtracking
// label but code may not end up taking the yes branch path, we need to emit a resumeAt
// that will cause the backtracking to immediately pass through this node.
- if (postIfDoneLabel != originalDoneLabel)
+ if (!isAtomic && postYesDoneLabel != originalDoneLabel)
{
// resumeAt = 2;
Ldc(2);
@@ -1411,92 +1424,107 @@ void EmitBackreferenceConditional(RegexNode node)
}
}
- if (isAtomic)
+ if (isAtomic || (postYesDoneLabel == originalDoneLabel && postNoDoneLabel == originalDoneLabel))
{
+ // Either we're atomic by our parent, or neither child branch has backtracking constructs.
+ // In both cases we don't need to emit any backtracking logic in support, as nothing will
+ // backtrack in. Instead, we just ensure we revert back to the original done label so that
+ // any backtracking skips over this node.
doneLabel = originalDoneLabel;
+ if (needsEndConditional)
+ {
+ MarkLabel(endConditional);
+ }
}
else
{
- // If either the yes branch or the no branch contained backtracking, subsequent expressions
- // might try to backtrack to here, so output a backtracking map based on resumeAt.
- if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel)
- {
- // Skip the backtracking section
- // goto endRef;
- Br(backreferenceConditionalEnd);
+ // Subsequent expressions might try to backtrack to here, so output a backtracking map based on resumeAt.
- Label backtrack = DefineLabel();
- doneLabel = backtrack;
- MarkLabel(backtrack);
+ // Skip the backtracking section
+ // goto endConditional;
+ Debug.Assert(needsEndConditional);
+ Br(endConditional);
- // resumeAt = base.runstack[--stackpos];
- EmitRunstackPop();
- Stloc(resumeAt);
+ // Backtrack section
+ Label backtrack = DefineLabel();
+ doneLabel = backtrack;
+ MarkLabel(backtrack);
- if (postIfDoneLabel != originalDoneLabel)
- {
- // if (resumeAt == 0) goto postIfDoneLabel;
- Ldloc(resumeAt);
- Ldc(0);
- BeqFar(postIfDoneLabel);
- }
+ // Pop from the stack the branch that was used and jump back to its backtracking location.
- if (postElseDoneLabel != originalDoneLabel)
- {
- // if (resumeAt == 1) goto postElseDoneLabel;
- Ldloc(resumeAt);
- Ldc(1);
- BeqFar(postElseDoneLabel);
- }
+ // resumeAt = base.runstack[--stackpos];
+ EmitStackPop();
+ Stloc(resumeAt);
- // goto originalDoneLabel;
- BrFar(originalDoneLabel);
+ if (postYesDoneLabel != originalDoneLabel)
+ {
+ // if (resumeAt == 0) goto postYesDoneLabel;
+ Ldloc(resumeAt);
+ Ldc(0);
+ BeqFar(postYesDoneLabel);
}
- }
- if (postIfDoneLabel != originalDoneLabel || hasNo)
- {
- MarkLabel(backreferenceConditionalEnd);
- if (!isAtomic && (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel))
+ if (postNoDoneLabel != originalDoneLabel)
+ {
+ // if (resumeAt == 1) goto postNoDoneLabel;
+ Ldloc(resumeAt);
+ Ldc(1);
+ BeqFar(postNoDoneLabel);
+ }
+
+ // goto originalDoneLabel;
+ BrFar(originalDoneLabel);
+
+ if (needsEndConditional)
{
- // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2);
- // base.runstack[stackpos++] = resumeAt;
- EmitRunstackResizeIfNeeded(1);
- EmitRunstackPush(() => Ldloc(resumeAt));
+ MarkLabel(endConditional);
}
+
+ // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2);
+ // base.runstack[stackpos++] = resumeAt;
+ EmitStackResizeIfNeeded(1);
+ EmitStackPush(() => Ldloc(resumeAt));
}
}
// Emits the code for an if(expression)-then-else conditional.
void EmitExpressionConditional(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Testgroup, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 3, $"Expected 3 children, found {node.ChildCount()}");
+
bool isAtomic = node.IsAtomicByParent();
// We're branching in a complicated fashion. Make sure sliceStaticPos is 0.
TransferSliceStaticPosToPos();
- // The first child node is the conditional expression. If this matches, then we branch to the "yes" branch.
+ // The first child node is the condition expression. If this matches, then we branch to the "yes" branch.
// If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes"
- // branch, otherwise. The conditional is treated as a positive lookahead. If it's not already
- // such a node, wrap it in one.
- RegexNode conditional = node.Child(0);
- if (conditional is not { Type: RegexNode.Require })
- {
- var newConditional = new RegexNode(RegexNode.Require, conditional.Options);
- newConditional.AddChild(conditional);
- conditional = newConditional;
- }
+ // branch, otherwise. The condition is treated as a positive lookahead.
+ RegexNode condition = node.Child(0);
- // Get the "yes" branch and the optional "no" branch, if it exists.
+ // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus
+ // somewhat likely to be Empty.
RegexNode yesBranch = node.Child(1);
- RegexNode? noBranch = node.ChildCount() > 2 && node.Child(2) is { Type: not RegexNode.Empty } childNo ? childNo : null;
+ RegexNode? noBranch = node.Child(2) is { Type: not RegexNode.Empty } childNo ? childNo : null;
+ Label originalDoneLabel = doneLabel;
- Label expressionConditionalEnd = DefineLabel();
- Label no = DefineLabel();
+ Label expressionNotMatched = DefineLabel();
+ Label endConditional = DefineLabel();
+
+ // As with alternations, we have potentially multiple branches, each of which may contain
+ // backtracking constructs, but the expression after the condition needs a single target
+ // to backtrack to. So, we expose a single Backtrack label and track which branch was
+ // followed in this resumeAt local.
+ LocalBuilder? resumeAt = null;
+ if (!isAtomic)
+ {
+ resumeAt = DeclareInt32();
+ }
- // If the conditional expression has captures, we'll need to uncapture them in the case of no match.
+ // If the condition expression has captures, we'll need to uncapture them in the case of no match.
LocalBuilder? startingCapturePos = null;
- if ((conditional.Options & RegexNode.HasCapturesFlag) != 0)
+ if ((condition.Options & RegexNode.HasCapturesFlag) != 0)
{
// int startingCapturePos = base.Crawlpos();
startingCapturePos = DeclareInt32();
@@ -1505,62 +1533,73 @@ void EmitExpressionConditional(RegexNode node)
Stloc(startingCapturePos);
}
- // Emit the conditional expression. We need to reroute any match failures to either the "no" branch
- // if it exists, or to the end of the node (skipping the "yes" branch) if it doesn't.
- Label originalDoneLabel = doneLabel;
- Label tmpDoneLabel = noBranch is not null ? no : expressionConditionalEnd;
- doneLabel = tmpDoneLabel;
- EmitPositiveLookaheadAssertion(conditional);
- if (doneLabel == tmpDoneLabel)
- {
- doneLabel = originalDoneLabel;
- }
+ // Emit the condition expression. Route any failures to after the yes branch. This code is almost
+ // the same as for a positive lookahead; however, a positive lookahead only needs to reset the position
+ // on a successful match, as a failed match fails the whole expression; here, we need to reset the
+ // position on completion, regardless of whether the match is successful or not.
+ doneLabel = expressionNotMatched;
- Label postConditionalDoneLabel = doneLabel;
- LocalBuilder? resumeAt = !isAtomic ? DeclareInt32() : null;
+ // Save off pos. We'll need to reset this when the condition completes, whether or not it matched.
+ // startingPos = pos;
+ LocalBuilder startingPos = DeclareInt32();
+ Ldloc(pos);
+ Stloc(startingPos);
+ int startingSliceStaticPos = sliceStaticPos;
- // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch.
- // Since the "yes" branch may have a different execution path than the "no" branch or the lack of
- // any branch, we need to store the current sliceStaticPos and reset it prior to emitting the code
- // for what comes after the "yes" branch, so that everyone is on equal footing.
- int startingTextSpanPos = sliceStaticPos;
+ // Emit the child. The condition expression is a zero-width assertion, which is atomic,
+ // so prevent backtracking into it.
+ EmitNode(condition);
+ doneLabel = originalDoneLabel;
+
+ // After the condition completes successfully, reset the text positions.
+ // Do not reset captures, which persist beyond the lookahead.
+ // pos = startingPos;
+ // slice = inputSpan.Slice(pos, end - pos);
+ Ldloc(startingPos);
+ Stloc(pos);
+ SliceInputSpan();
+ sliceStaticPos = startingSliceStaticPos;
+
+ // The expression matched. Run the "yes" branch. If it successfully matches, jump to the end.
EmitNode(yesBranch);
- TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
+ TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch
Label postYesDoneLabel = doneLabel;
- if (resumeAt is not null && postYesDoneLabel != originalDoneLabel)
+ if (!isAtomic && postYesDoneLabel != originalDoneLabel)
{
// resumeAt = 0;
Ldc(0);
- Stloc(resumeAt);
+ Stloc(resumeAt!);
}
- if (postYesDoneLabel != originalDoneLabel || noBranch is not null)
+
+ // goto endConditional;
+ BrFar(endConditional);
+
+ // After the condition completes unsuccessfully, reset the text positions
+ // _and_ reset captures, which should not persist when the whole expression failed.
+ // pos = startingPos;
+ MarkLabel(expressionNotMatched);
+ Ldloc(startingPos);
+ Stloc(pos);
+ SliceInputSpan();
+ sliceStaticPos = startingSliceStaticPos;
+ if (startingCapturePos is not null)
{
- // goto end;
- BrFar(expressionConditionalEnd);
+ EmitUncaptureUntil(startingCapturePos);
}
- // If there's a no branch, we need to emit it, but skipping it from a successful "yes" branch match.
Label postNoDoneLabel = originalDoneLabel;
if (noBranch is not null)
{
- // Emit the no branch, first uncapturing any captures from the expression condition that failed
- // to match and emit the branch.
- MarkLabel(no);
- if (startingCapturePos is not null)
- {
- // while (base.Crawlpos() > startingCapturePos) base.Uncapture();
- EmitUncaptureUntil(startingCapturePos);
- }
-
- doneLabel = postConditionalDoneLabel;
- sliceStaticPos = startingTextSpanPos;
+ // Output the no branch.
+ doneLabel = originalDoneLabel;
EmitNode(noBranch);
- TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
+ TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch
postNoDoneLabel = doneLabel;
- if (postNoDoneLabel != originalDoneLabel)
+ if (!isAtomic && postNoDoneLabel != originalDoneLabel)
{
- // goto end;
- BrFar(expressionConditionalEnd);
+ // resumeAt = 1;
+ Ldc(1);
+ Stloc(resumeAt!);
}
}
else
@@ -1568,66 +1607,72 @@ void EmitExpressionConditional(RegexNode node)
// There's only a yes branch. If it's going to cause us to output a backtracking
// label but code may not end up taking the yes branch path, we need to emit a resumeAt
// that will cause the backtracking to immediately pass through this node.
- if (resumeAt is not null && postYesDoneLabel != originalDoneLabel)
+ if (!isAtomic && postYesDoneLabel != originalDoneLabel)
{
// resumeAt = 2;
Ldc(2);
- Stloc(resumeAt);
+ Stloc(resumeAt!);
}
}
- if (isAtomic)
+ // If either the yes branch or the no branch contained backtracking, subsequent expressions
+ // might try to backtrack to here, so output a backtracking map based on resumeAt.
+ if (isAtomic || (postYesDoneLabel == originalDoneLabel && postNoDoneLabel == originalDoneLabel))
{
+ // EndConditional:
doneLabel = originalDoneLabel;
+ MarkLabel(endConditional);
}
else
{
Debug.Assert(resumeAt is not null);
- if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel)
- {
- // Skip the backtracking section.
- BrFar(expressionConditionalEnd);
- Label backtrack = DefineLabel();
- doneLabel = backtrack;
- MarkLabel(backtrack);
+ // Skip the backtracking section.
+ BrFar(endConditional);
- if (postYesDoneLabel != postConditionalDoneLabel)
- {
- // if (resumeAt == 0) goto postYesDoneLabel;
- Ldloc(resumeAt);
- Ldc(0);
- BeqFar(postYesDoneLabel);
- }
+ Label backtrack = DefineLabel();
+ doneLabel = backtrack;
+ MarkLabel(backtrack);
- if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel)
- {
- // if (resumeAt == 1) goto postNoDoneLabel;
- Ldloc(resumeAt);
- Ldc(1);
- BeqFar(postNoDoneLabel);
- }
+ // resumeAt = StackPop();
+ EmitStackPop();
+ Stloc(resumeAt);
- // goto postConditionalDoneLabel;
- BrFar(postConditionalDoneLabel);
+ if (postYesDoneLabel != originalDoneLabel)
+ {
+ // if (resumeAt == 0) goto postYesDoneLabel;
+ Ldloc(resumeAt);
+ Ldc(0);
+ BeqFar(postYesDoneLabel);
}
- if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel)
+ if (postNoDoneLabel != originalDoneLabel)
{
- // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2);
- // base.runstack[stackpos++] = resumeAt;
- EmitRunstackResizeIfNeeded(1);
- EmitRunstackPush(() => Ldloc(resumeAt));
+ // if (resumeAt == 1) goto postNoDoneLabel;
+ Ldloc(resumeAt);
+ Ldc(1);
+ BeqFar(postNoDoneLabel);
}
- }
- MarkLabel(expressionConditionalEnd);
+ // goto originalDoneLabel;
+ BrFar(originalDoneLabel);
+
+ // EndConditional:
+ MarkLabel(endConditional);
+
+ // if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2);
+ // base.runstack[stackpos++] = resumeAt;
+ EmitStackResizeIfNeeded(1);
+ EmitStackPush(() => Ldloc(resumeAt!));
+ }
}
// Emits the code for a Capture node.
void EmitCapture(RegexNode node, RegexNode? subsequent = null)
{
- Debug.Assert(node.Type == RegexNode.Capture);
+ Debug.Assert(node.Type is RegexNode.Capture, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps);
int uncapnum = RegexParser.MapCaptureNumber(node.N, _code.Caps);
bool isAtomic = node.IsAtomicByParent();
@@ -1685,8 +1730,8 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
{
// if (stackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2);
// base.runstack[stackpos++] = startingPos;
- EmitRunstackResizeIfNeeded(1);
- EmitRunstackPush(() => Ldloc(startingPos));
+ EmitStackResizeIfNeeded(1);
+ EmitStackPush(() => Ldloc(startingPos));
// Skip past the backtracking section
// goto backtrackingEnd;
@@ -1696,7 +1741,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
// Emit a backtracking section that restores the capture's state and then jumps to the previous done label
Label backtrack = DefineLabel();
MarkLabel(backtrack);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
if (!childBacktracks)
{
@@ -1742,6 +1787,9 @@ void EmitUncaptureUntil(LocalBuilder startingCapturePos)
// Emits the code to handle a positive lookahead assertion.
void EmitPositiveLookaheadAssertion(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Require, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
Label originalDoneLabel = doneLabel;
@@ -1770,6 +1818,9 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
// Emits the code to handle a negative lookahead assertion.
void EmitNegativeLookaheadAssertion(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Prevent, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
Label originalDoneLabel = doneLabel;
@@ -1916,7 +1967,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
break;
case RegexNode.UpdateBumpalong:
- EmitUpdateBumpalong();
+ EmitUpdateBumpalong(node);
break;
default:
@@ -1928,6 +1979,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
// Emits the node for an atomic.
void EmitAtomic(RegexNode node, RegexNode? subsequent)
{
+ Debug.Assert(node.Type is RegexNode.Atomic, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
// Atomic simply outputs the code for the child, but it ensures that any done label left
// set by the child is reset to what it was prior to the node's processing. That way,
// anything later that tries to jump back won't see labels set inside the atomic.
@@ -1939,8 +1993,10 @@ void EmitAtomic(RegexNode node, RegexNode? subsequent)
// Emits the code to handle updating base.runtextpos to pos in response to
// an UpdateBumpalong node. This is used when we want to inform the scan loop that
// it should bump from this location rather than from the original location.
- void EmitUpdateBumpalong()
+ void EmitUpdateBumpalong(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.UpdateBumpalong, $"Unexpected type: {node.Type}");
+
// base.runtextpos = pos;
TransferSliceStaticPosToPos();
Ldthis();
@@ -1951,6 +2007,9 @@ void EmitUpdateBumpalong()
// Emits code for a concatenation
void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired)
{
+ Debug.Assert(node.Type is RegexNode.Concatenate, $"Unexpected type: {node.Type}");
+ Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}");
+
// Emit the code for each child one after the other.
int childCount = node.ChildCount();
for (int i = 0; i < childCount; i++)
@@ -1976,6 +2035,8 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe
// Emits the code to handle a single-character match.
void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null)
{
+ Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Type}");
+
// This only emits a single check, but it's called from the looping constructs in a loop
// to generate the code for a single check, so we check for each "family" (one, notone, set)
// rather than only for the specific single character nodes.
@@ -2017,6 +2078,8 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o
// Emits the code to handle a boundary check on a character.
void EmitBoundary(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary, $"Unexpected type: {node.Type}");
+
// if (!IsBoundary(pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel;
Ldthis();
Ldloc(pos);
@@ -2055,6 +2118,8 @@ void EmitBoundary(RegexNode node)
// Emits the code to handle various anchors.
void EmitAnchors(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Beginning or RegexNode.Start or RegexNode.Bol or RegexNode.End or RegexNode.EndZ or RegexNode.Eol, $"Unexpected type: {node.Type}");
+
Debug.Assert(sliceStaticPos >= 0);
switch (node.Type)
{
@@ -2147,6 +2212,8 @@ void EmitAnchors(RegexNode node)
// Emits the code to handle a multiple-character match.
void EmitMultiChar(RegexNode node, bool emitLengthCheck = true)
{
+ Debug.Assert(node.Type is RegexNode.Multi, $"Unexpected type: {node.Type}");
+
bool caseInsensitive = IsCaseInsensitive(node);
// If the multi string's length exceeds the maximum length we want to unroll, instead generate a call to StartsWith.
@@ -2242,6 +2309,8 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck = true)
// Emits the code to handle a backtracking, single-character loop.
void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
+ Debug.Assert(node.Type is RegexNode.Oneloop or RegexNode.Notoneloop or RegexNode.Setloop, $"Unexpected type: {node.Type}");
+
// If this is actually a repeater, emit that instead; no backtracking necessary.
if (node.M == node.N)
{
@@ -2310,16 +2379,16 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
{
// capturepos = base.runstack[--stackpos];
// while (base.Crawlpos() > capturepos) base.Uncapture();
- EmitRunstackPop();
+ EmitStackPop();
Stloc(capturepos);
EmitUncaptureUntil(capturepos);
}
// endingPos = base.runstack[--stackpos];
// startingPos = base.runstack[--stackpos];
- EmitRunstackPop();
+ EmitStackPop();
Stloc(endingPos);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
// if (startingPos >= endingPos) goto originalDoneLabel;
@@ -2372,17 +2441,19 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
SliceInputSpan();
MarkLabel(endLoop);
- EmitRunstackResizeIfNeeded(expressionHasCaptures ? 3 : 2);
- EmitRunstackPush(() => Ldloc(startingPos));
- EmitRunstackPush(() => Ldloc(endingPos));
+ EmitStackResizeIfNeeded(expressionHasCaptures ? 3 : 2);
+ EmitStackPush(() => Ldloc(startingPos));
+ EmitStackPush(() => Ldloc(endingPos));
if (capturepos is not null)
{
- EmitRunstackPush(() => Ldloc(capturepos!));
+ EmitStackPush(() => Ldloc(capturepos!));
}
}
void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
{
+ Debug.Assert(node.Type is RegexNode.Onelazy or RegexNode.Notonelazy or RegexNode.Setlazy, $"Unexpected type: {node.Type}");
+
// Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
// as the lazy itself failed to match, and there's no backtracking possible by the individual
// characters/iterations themselves.
@@ -2500,15 +2571,15 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
// base.runstack[stackpos++] = startingPos;
// base.runstack[stackpos++] = capturepos;
// base.runstack[stackpos++] = iterationCount;
- EmitRunstackResizeIfNeeded(3);
- EmitRunstackPush(() => Ldloc(startingPos));
+ EmitStackResizeIfNeeded(3);
+ EmitStackPush(() => Ldloc(startingPos));
if (capturepos is not null)
{
- EmitRunstackPush(() => Ldloc(capturepos));
+ EmitStackPush(() => Ldloc(capturepos));
}
if (iterationCount is not null)
{
- EmitRunstackPush(() => Ldloc(iterationCount));
+ EmitStackPush(() => Ldloc(iterationCount));
}
// Skip past the backtracking section
@@ -2524,15 +2595,15 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
// startingPos = base.runstack[--stackpos];
if (iterationCount is not null)
{
- EmitRunstackPop();
+ EmitStackPop();
Stloc(iterationCount);
}
if (capturepos is not null)
{
- EmitRunstackPop();
+ EmitStackPop();
Stloc(capturepos);
}
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
// goto doneLabel;
@@ -2548,6 +2619,8 @@ void EmitLazy(RegexNode node)
Debug.Assert(node.Type is RegexNode.Lazyloop, $"Unexpected type: {node.Type}");
Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}");
Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
int minIterations = node.M;
int maxIterations = node.N;
Label originalDoneLabel = doneLabel;
@@ -2610,18 +2683,18 @@ void EmitLazy(RegexNode node)
// base.runstack[stackpos++] = startingPos;
// base.runstack[stackpos++] = pos;
// base.runstack[stackpos++] = sawEmpty;
- EmitRunstackResizeIfNeeded(3);
+ EmitStackResizeIfNeeded(3);
if (expressionHasCaptures)
{
- EmitRunstackPush(() =>
+ EmitStackPush(() =>
{
Ldthis();
Call(s_crawlposMethod);
});
}
- EmitRunstackPush(() => Ldloc(startingPos));
- EmitRunstackPush(() => Ldloc(pos));
- EmitRunstackPush(() => Ldloc(sawEmpty));
+ EmitStackPush(() => Ldloc(startingPos));
+ EmitStackPush(() => Ldloc(pos));
+ EmitStackPush(() => Ldloc(sawEmpty));
// Save off some state. We need to store the current pos so we can compare it against
// pos after the iteration, in order to determine whether the iteration was empty. Empty
@@ -2702,16 +2775,16 @@ void EmitLazy(RegexNode node)
// startingPos = base.runstack[--stackpos];
// capturepos = base.runstack[--stackpos];
// while (base.Crawlpos() > capturepos) base.Uncapture();
- EmitRunstackPop();
+ EmitStackPop();
Stloc(sawEmpty);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(pos);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
if (expressionHasCaptures)
{
using RentedLocalBuilder poppedCrawlPos = RentInt32Local();
- EmitRunstackPop();
+ EmitStackPop();
Stloc(poppedCrawlPos);
EmitUncaptureUntil(poppedCrawlPos);
}
@@ -2737,10 +2810,10 @@ void EmitLazy(RegexNode node)
if (!isAtomic)
{
// Store the capture's state and skip the backtracking section
- EmitRunstackResizeIfNeeded(3);
- EmitRunstackPush(() => Ldloc(startingPos));
- EmitRunstackPush(() => Ldloc(iterationCount));
- EmitRunstackPush(() => Ldloc(sawEmpty));
+ EmitStackResizeIfNeeded(3);
+ EmitStackPush(() => Ldloc(startingPos));
+ EmitStackPush(() => Ldloc(iterationCount));
+ EmitStackPush(() => Ldloc(sawEmpty));
Label skipBacktrack = DefineLabel();
BrFar(skipBacktrack);
@@ -2751,11 +2824,11 @@ void EmitLazy(RegexNode node)
// sawEmpty = base.runstack[--stackpos];
// iterationCount = base.runstack[--stackpos];
// startingPos = base.runstack[--stackpos];
- EmitRunstackPop();
+ EmitStackPop();
Stloc(sawEmpty);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(iterationCount);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
if (maxIterations == int.MaxValue)
@@ -2788,8 +2861,9 @@ void EmitLazy(RegexNode node)
// RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true)
{
- int iterations = node.M;
+ Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Type}");
+ int iterations = node.M;
if (iterations == 0)
{
// No iterations, nothing to do.
@@ -2871,6 +2945,8 @@ void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired
// Emits the code to handle a non-backtracking, variable-length loop around a single character comparison.
void EmitSingleCharAtomicLoop(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Notoneloop or RegexNode.Notoneloopatomic or RegexNode.Setloop or RegexNode.Setloopatomic, $"Unexpected type: {node.Type}");
+
// If this is actually a repeater, emit that instead.
if (node.M == node.N)
{
@@ -3111,6 +3187,7 @@ void EmitSingleCharAtomicLoop(RegexNode node)
// Emits the code to handle a non-backtracking optional zero-or-one loop.
void EmitAtomicSingleCharZeroOrOne(RegexNode node)
{
+ Debug.Assert(node.Type is RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Notoneloop or RegexNode.Notoneloopatomic or RegexNode.Setloop or RegexNode.Setloopatomic, $"Unexpected type: {node.Type}");
Debug.Assert(node.M == 0 && node.N == 1);
Label skipUpdatesLabel = DefineLabel();
@@ -3168,6 +3245,8 @@ void EmitLoop(RegexNode node)
Debug.Assert(node.Type is RegexNode.Loop or RegexNode.Lazyloop, $"Unexpected type: {node.Type}");
Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}");
Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}");
+ Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
+
int minIterations = node.M;
int maxIterations = node.N;
bool isAtomic = node.IsAtomicByParent();
@@ -3198,14 +3277,14 @@ void EmitLoop(RegexNode node)
// We need to store the starting pos and crawl position so that it may
// be backtracked through later. This needs to be the starting position from
// the iteration we're leaving, so it's pushed before updating it to pos.
- EmitRunstackResizeIfNeeded(3);
+ EmitStackResizeIfNeeded(3);
if (expressionHasCaptures)
{
// base.runstack[stackpos++] = base.Crawlpos();
- EmitRunstackPush(() => { Ldthis(); Call(s_crawlposMethod); });
+ EmitStackPush(() => { Ldthis(); Call(s_crawlposMethod); });
}
- EmitRunstackPush(() => Ldloc(startingPos));
- EmitRunstackPush(() => Ldloc(pos));
+ EmitStackPush(() => Ldloc(startingPos));
+ EmitStackPush(() => Ldloc(pos));
// Save off some state. We need to store the current pos so we can compare it against
// pos after the iteration, in order to determine whether the iteration was empty. Empty
@@ -3310,16 +3389,16 @@ void EmitLoop(RegexNode node)
// pos = base.runstack[--stackpos];
// startingPos = base.runstack[--stackpos];
- EmitRunstackPop();
+ EmitStackPop();
Stloc(pos);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
if (expressionHasCaptures)
{
// int poppedCrawlPos = base.runstack[--stackpos];
// while (base.Crawlpos() > poppedCrawlPos) base.Uncapture();
using RentedLocalBuilder poppedCrawlPos = RentInt32Local();
- EmitRunstackPop();
+ EmitStackPop();
Stloc(poppedCrawlPos);
EmitUncaptureUntil(poppedCrawlPos);
}
@@ -3370,9 +3449,9 @@ void EmitLoop(RegexNode node)
if (node.IsInLoop())
{
// Store the capture's state
- EmitRunstackResizeIfNeeded(3);
- EmitRunstackPush(() => Ldloc(startingPos));
- EmitRunstackPush(() => Ldloc(iterationCount));
+ EmitStackResizeIfNeeded(3);
+ EmitStackPush(() => Ldloc(startingPos));
+ EmitStackPush(() => Ldloc(iterationCount));
// Skip past the backtracking section
// goto backtrackingEnd;
@@ -3385,9 +3464,9 @@ void EmitLoop(RegexNode node)
// iterationCount = base.runstack[--runstack];
// startingPos = base.runstack[--runstack];
- EmitRunstackPop();
+ EmitStackPop();
Stloc(iterationCount);
- EmitRunstackPop();
+ EmitStackPop();
Stloc(startingPos);
// goto doneLabel;
@@ -3399,7 +3478,7 @@ void EmitLoop(RegexNode node)
}
}
- void EmitRunstackResizeIfNeeded(int count)
+ void EmitStackResizeIfNeeded(int count)
{
Debug.Assert(count >= 1);
@@ -3431,7 +3510,7 @@ void EmitRunstackResizeIfNeeded(int count)
MarkLabel(skipResize);
}
- void EmitRunstackPush(Action load)
+ void EmitStackPush(Action load)
{
// base.runstack[stackpos] = load();
Ldthisfld(s_runstackField);
@@ -3446,7 +3525,7 @@ void EmitRunstackPush(Action load)
Stloc(stackpos);
}
- void EmitRunstackPop()
+ void EmitStackPop()
{
// ... = base.runstack[--stackpos];
Ldthisfld(s_runstackField);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index fe04caca5e7bbe..ea696c6680a2a7 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -308,11 +308,11 @@ private void ValidateFinalTreeInvariants()
break;
case Testref:
- Debug.Assert(childCount is 1 or 2, $"Expected one or two children for {node.TypeName}, got {childCount}");
+ Debug.Assert(childCount == 2, $"Expected two children for {node.TypeName}, got {childCount}");
break;
case Testgroup:
- Debug.Assert(childCount is 2 or 3, $"Expected two or three children for {node.TypeName}, got {childCount}");
+ Debug.Assert(childCount == 3, $"Expected three children for {node.TypeName}, got {childCount}");
break;
case Concatenate:
@@ -562,39 +562,20 @@ public bool IsAtomicByParent()
/// <summary>
/// Removes redundant nodes from the subtree, and returns an optimized subtree.
/// </summary>
- internal RegexNode Reduce()
- {
- switch (Type)
+ internal RegexNode Reduce() =>
+ Type switch
{
- case Alternate:
- return ReduceAlternation();
-
- case Concatenate:
- return ReduceConcatenation();
-
- case Loop:
- case Lazyloop:
- return ReduceLoops();
-
- case Atomic:
- return ReduceAtomic();
-
- case Group:
- return ReduceGroup();
-
- case Set:
- case Setloop:
- case Setloopatomic:
- case Setlazy:
- return ReduceSet();
-
- case Prevent:
- return ReducePrevent();
-
- default:
- return this;
- }
- }
+ Alternate => ReduceAlternation(),
+ Atomic => ReduceAtomic(),
+ Concatenate => ReduceConcatenation(),
+ Group => ReduceGroup(),
+ Loop or Lazyloop => ReduceLoops(),
+ Prevent => ReducePrevent(),
+ Set or Setloop or Setloopatomic or Setlazy => ReduceSet(),
+ Testgroup => ReduceTestgroup(),
+ Testref => ReduceTestref(),
+ _ => this,
+ };
/// Remove an unnecessary Concatenation or Alternation node
///
@@ -1819,6 +1800,53 @@ private RegexNode ReducePrevent()
return this;
}
+ /// <summary>Optimizations for backreference conditionals.</summary>
+ private RegexNode ReduceTestref()
+ {
+ Debug.Assert(Type == Testref);
+ Debug.Assert(ChildCount() is 1 or 2);
+
+ // This isn't so much an optimization as it is changing the tree for consistency.
+ // We want all engines to be able to trust that every Testref will have two children,
+ // even though it's optional in the syntax. If it's missing a "not matched" branch,
+ // we add one that will match empty.
+ if (ChildCount() == 1)
+ {
+ AddChild(new RegexNode(Empty, Options));
+ }
+
+ return this;
+ }
+
+ /// <summary>Optimizations for expression conditionals.</summary>
+ private RegexNode ReduceTestgroup()
+ {
+ Debug.Assert(Type == Testgroup);
+ Debug.Assert(ChildCount() is 2 or 3);
+
+ // This isn't so much an optimization as it is changing the tree for consistency.
+ // We want all engines to be able to trust that every Testgroup will have three children,
+ // even though it's optional in the syntax. If it's missing a "not matched" branch,
+ // we add one that will match empty.
+ if (ChildCount() == 2)
+ {
+ AddChild(new RegexNode(Empty, Options));
+ }
+
+ // It's common for the condition to be an explicit positive lookahead, as specifying
+ // that eliminates any ambiguity in syntax as to whether the expression is to be matched
+ // as an expression or to be a reference to a capture group. After parsing, however,
+ // there's no ambiguity, and we can remove an extra level of positive lookahead, as the
+ // engines need to treat the condition as a zero-width positive, atomic assertion regardless.
+ RegexNode condition = Child(0);
+ if (condition.Type == Require && (condition.Options & RegexOptions.RightToLeft) == 0)
+ {
+ ReplaceChild(0, condition.Child(0));
+ }
+
+ return this;
+ }
+
/// <summary>
/// Determines whether node can be switched to an atomic loop. Subsequent is the node
/// immediately after 'node'.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
index 5ef7281b5884da..d3caec254a94fb 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
@@ -288,13 +288,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
Emit(RegexCode.Goto, 0);
PatchJump(Branchpos, _emitted.Length);
Emit(RegexCode.Forejump);
- if (node.ChildCount() > 1)
- {
- break;
- }
-
- // else fallthrough
- goto case 1;
+ break;
}
case 1:
PatchJump(_intStack.Pop(), _emitted.Length);
@@ -328,11 +322,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
PatchJump(Branchpos, _emitted.Length);
Emit(RegexCode.Getmark);
Emit(RegexCode.Forejump);
-
- if (node.ChildCount() > 2)
- break;
- // else fallthrough
- goto case 2;
+ break;
case 2:
PatchJump(_intStack.Pop(), _emitted.Length);
break;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs
index ab6e819b6a60ab..6c299578983d7c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs
@@ -484,7 +484,7 @@ SymbolicRegexNode ConvertSetloop(RegexNode node, bool isLazy)
bool IsDotStar(RegexNode node) => node.Type == RegexNode.Setloop && Convert(node, topLevel: false).IsAnyStar;
- bool IsIntersect(RegexNode node) => node.Type == RegexNode.Testgroup && node.ChildCount() > 2 && IsNothing(node.Child(2));
+ bool IsIntersect(RegexNode node) => node.Type == RegexNode.Testgroup && IsNothing(node.Child(2));
bool TryGetIntersection(RegexNode node, [Diagnostics.CodeAnalysis.NotNullWhen(true)] out List? conjuncts)
{
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs
index 723bb034acc6cd..d2a07d60dd04e3 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs
@@ -833,7 +833,7 @@ public static IEnumerable