Skip to content

Commit 070b7d8

Browse files
authored
Put back FindCaseSensitivePrefix regex alternation support (#64204)
* Put back FindCaseSensitivePrefix alternation support * Fix the bug from the initial version, and add more comments
1 parent 9eb1b0c commit 070b7d8

File tree

3 files changed

+61
-4
lines changed

3 files changed

+61
-4
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
5050
}
5151

5252
// If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations.
53-
string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree);
53+
string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree.Root);
5454
if (caseSensitivePrefix.Length > 1)
5555
{
5656
LeadingCaseSensitivePrefix = caseSensitivePrefix;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,11 @@ private RegexPrefixAnalyzer(Span<int> intStack)
5050
_skipAllChildren = false;
5151
}
5252

53-
/// <summary>Computes the leading substring in <paramref name="tree"/>; may be empty.</summary>
54-
public static string FindCaseSensitivePrefix(RegexTree tree)
53+
/// <summary>Computes the leading substring in <paramref name="node"/>; may be empty.</summary>
54+
public static string FindCaseSensitivePrefix(RegexNode node)
5555
{
5656
var vsb = new ValueStringBuilder(stackalloc char[64]);
57-
Process(tree.Root, ref vsb);
57+
Process(node, ref vsb);
5858
return vsb.ToString();
5959

6060
// Processes the node, adding any prefix text to the builder.
@@ -87,6 +87,59 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
8787
return !rtl;
8888
}
8989

90+
// Alternation: find a string that's a shared prefix of all branches
91+
case RegexNodeKind.Alternate:
92+
{
93+
int childCount = node.ChildCount();
94+
95+
// Store the initial branch into the target builder, keeping track
96+
// of how much was appended. Any of these contents that don't overlap
97+
// with every other branch will be removed before returning.
98+
int initialLength = vsb.Length;
99+
Process(node.Child(0), ref vsb);
100+
int addedLength = vsb.Length - initialLength;
101+
102+
// Then explore the rest of the branches, finding the length
103+
// of prefix they all share in common with the initial branch.
104+
if (addedLength != 0)
105+
{
106+
var alternateSb = new ValueStringBuilder(64);
107+
108+
// Process each branch. If we reach a point where we've proven there's
109+
// no overlap, we can bail early.
110+
for (int i = 1; i < childCount && addedLength != 0; i++)
111+
{
112+
alternateSb.Length = 0;
113+
114+
// Process the branch into a temporary builder.
115+
Process(node.Child(i), ref alternateSb);
116+
117+
// Find how much overlap there is between this branch's prefix
118+
// and the smallest amount of prefix that overlapped with all
119+
// the previously seen branches.
120+
addedLength = Math.Min(addedLength, alternateSb.Length);
121+
for (int j = 0; j < addedLength; j++)
122+
{
123+
if (vsb[initialLength + j] != alternateSb[j])
124+
{
125+
addedLength = j;
126+
break;
127+
}
128+
}
129+
}
130+
131+
alternateSb.Dispose();
132+
133+
// Then cull back on what was added based on the other branches.
134+
vsb.Length = initialLength + addedLength;
135+
}
136+
137+
// Don't explore anything after the alternation. We could make this work if desirable,
138+
// but it's currently not worth the extra complication. The entire contents of every
139+
// branch would need to be identical other than zero-width anchors/assertions.
140+
return false;
141+
}
142+
90143
// One character
91144
case RegexNodeKind.One when (node.Options & RegexOptions.IgnoreCase) == 0:
92145
vsb.Append(node.Ch);

src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,10 @@ public static IEnumerable<object[]> Match_MemberData()
205205
yield return (@"(^|($|a+))bc", " aabc", RegexOptions.None, 0, 5, true, "aabc");
206206
yield return (@"yz(^|a+)bc", " yzaabc", RegexOptions.None, 0, 7, true, "yzaabc");
207207
yield return (@"(^a|a$) bc", "a bc", RegexOptions.None, 0, 4, true, "a bc");
208+
yield return (@"(abcdefg|abcdef|abc|a)h", " ah ", RegexOptions.None, 0, 8, true, "ah");
209+
yield return (@"(^abcdefg|abcdef|^abc|a)h", " abcdefh ", RegexOptions.None, 0, 13, true, "abcdefh");
210+
yield return (@"(a|^abcdefg|abcdef|^abc)h", " abcdefh ", RegexOptions.None, 0, 13, true, "abcdefh");
211+
yield return (@"(abcdefg|abcdef)h", " abcdefghij ", RegexOptions.None, 0, 16, true, "abcdefgh");
208212

209213
if (!RegexHelpers.IsNonBacktracking(engine))
210214
{

0 commit comments

Comments
 (0)