diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index da78325562cd32..ee255e40f7f9c2 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -22,6 +22,10 @@ namespace System.Text.RegularExpressions.Generator
{
public partial class RegexGenerator
{
+ /// <summary>Escapes '&amp;', '&lt;' and '&gt;' characters so the text is safe inside an XML doc comment. We aren't using HtmlEncode as that would also escape single and double quotes. '&amp;' is replaced first so the entities produced for '&lt;' and '&gt;' are not escaped a second time.</summary>
+ private static string EscapeXmlComment(string text) =>
+ text.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");
+
/// Emits the definition of the partial method. This method just delegates to the property cache on the generated Regex-derived type.
private static void EmitRegexPartialMethod(RegexMethod regexMethod, IndentedTextWriter writer)
{
@@ -363,6 +367,60 @@ private static void AddIsECMABoundaryHelper(Dictionary require
}
}
+ /// <summary>Adds an IndexOfAnyValues instance declaration to the required helpers collection if the chars are ASCII. Returns the expression to pass to IndexOfAny: a reference to the helper field for ASCII sets, otherwise a string literal containing the chars.</summary>
+ private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan<char> chars, Dictionary<string, string[]> requiredHelpers)
+ {
+ // IndexOfAnyValues is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
+ // Only emit IndexOfAnyValues instances when we know they'll be faster to avoid increasing the startup cost too much.
+ Debug.Assert(chars.Length is 4 or 5);
+
+ return RegexCharClass.IsAscii(chars)
+ ? EmitIndexOfAnyValues(chars.ToArray(), requiredHelpers)
+ : Literal(chars.ToString());
+ }
+
+ /// <summary>Adds an IndexOfAnyValues instance declaration to the required helpers collection (once per distinct set) and returns the expression, qualified with HelpersTypeName, that references the emitted field.</summary>
+ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
+ {
+ Debug.Assert(RegexCharClass.IsAscii(asciiChars));
+
+ // The set of ASCII characters can be represented as a 128-bit bitmap (bit c is set when char c is in the set). Use the 16-byte hex string as the key.
+ byte[] bitmap = new byte[16];
+ foreach (char c in asciiChars)
+ {
+ bitmap[c >> 3] |= (byte)(1 << (c & 7));
+ }
+
+ string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
+
+ string fieldName = hexBitmap switch // a few specific bitmaps get readable names; every other set is named after its bitmap
+ {
+ "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
+ "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
+ "000000000000FF037E0000007E000000" => "s_asciiHexDigits",
+ "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
+ "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
+ _ => $"s_ascii_{hexBitmap.TrimStart('0')}"
+ };
+
+ string helperName = $"IndexOfAnyValues_{fieldName}";
+
+ if (!requiredHelpers.ContainsKey(helperName))
+ {
+ Array.Sort(asciiChars); // NOTE: sorts the caller's array in place; the emitted literal then lists the chars in ascending order
+
+ string setLiteral = Literal(new string(asciiChars));
+
+ requiredHelpers.Add(helperName, new string[]
+ {
+ $"/// <summary>Cached data to efficiently search for a character in the set {EscapeXmlComment(setLiteral)}.</summary>",
+ $"internal static readonly IndexOfAnyValues<char> {fieldName} = IndexOfAnyValues.Create({setLiteral});",
+ });
+ }
+
+ return $"{HelpersTypeName}.{fieldName}";
+ }
+
/// Emits the body of the Scan method override.
private static (bool NeedsTryFind, bool NeedsTryMatch) EmitScan(IndentedTextWriter writer, RegexMethod rm)
{
@@ -810,7 +868,7 @@ void EmitFixedSet_LeftToRight()
int setIndex = 0;
bool canUseIndexOf =
primarySet.Set != RegexCharClass.NotNewLineClass &&
- (primarySet.Chars is not null || primarySet.Range is not null);
+ (primarySet.Chars is not null || primarySet.Range is not null || primarySet.AsciiSet is not null);
bool needLoop = !canUseIndexOf || setsToUse > 1;
FinishEmitBlock loopBlock = default;
@@ -835,15 +893,18 @@ void EmitFixedSet_LeftToRight()
(true, _) => $"{span}.Slice(i + {primarySet.Distance})",
};
+ Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
+
string indexOf =
- primarySet.Chars is not null ? primarySet.Chars!.Length switch
+ primarySet.Chars is not null ? primarySet.Chars.Length switch
{
1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})",
2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
- _ => $"{span}.IndexOfAny({Literal(new string(primarySet.Chars))})",
+ _ => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
} :
- (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Range.Value.Negated) switch
+ primarySet.AsciiSet is not null ? $"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet, requiredHelpers)})" :
+ (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
{
(false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
(true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})",
@@ -1010,7 +1071,7 @@ void EmitLiteralAfterAtomicLoop()
{
2 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])});",
3 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])}, {Literal(literalChars[2])});",
- _ => $"IndexOfAny({Literal(new string(literalChars))});",
+ _ => $"IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literalChars, requiredHelpers)});",
});
FinishEmitBlock indexOfFoundBlock = default;
@@ -2920,7 +2981,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
if (!rtl &&
node.N > 1 && // no point in using IndexOf for small loops, in particular optionals
subsequent?.FindStartingLiteralNode() is RegexNode literalNode &&
- TryEmitIndexOf(literalNode, useLast: true, negate: false, out int literalLength, out string? indexOfExpr))
+ TryEmitIndexOf(requiredHelpers, literalNode, useLast: true, negate: false, out int literalLength, out string? indexOfExpr))
{
writer.WriteLine($"if ({startingPos} >= {endingPos} ||");
@@ -3079,6 +3140,7 @@ node.Kind is RegexNodeKind.Notonelazy &&
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
(literal.String is not null ||
literal.SetChars is not null ||
+ (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
literal.Range.LowInclusive == literal.Range.HighInclusive ||
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
{
@@ -3104,12 +3166,24 @@ literal.SetChars is not null ||
{
(true, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});",
(true, 3) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])});",
- (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars)});",
+ (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literal.SetChars.AsSpan(), requiredHelpers)});",
(false, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});",
- (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});",
+ (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral($"{node.Ch}{literal.SetChars}".AsSpan(), requiredHelpers)});",
});
}
+ else if (literal.AsciiChars is not null) // set of only ASCII characters
+ {
+ char[] asciiChars = literal.AsciiChars;
+ overlap = asciiChars.Contains(node.Ch);
+ if (!overlap)
+ {
+ Debug.Assert(node.Ch < 128);
+ Array.Resize(ref asciiChars, asciiChars.Length + 1);
+ asciiChars[asciiChars.Length - 1] = node.Ch;
+ }
+ writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValues(asciiChars, requiredHelpers)});");
+ }
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
{
overlap = literal.Range.LowInclusive == node.Ch;
@@ -3144,7 +3218,7 @@ literal.SetChars is not null ||
node.Kind is RegexNodeKind.Setlazy &&
node.Str == RegexCharClass.AnyClass &&
subsequent?.FindStartingLiteralNode() is RegexNode literal2 &&
- TryEmitIndexOf(literal2, useLast: false, negate: false, out _, out string? indexOfExpr))
+ TryEmitIndexOf(requiredHelpers, literal2, useLast: false, negate: false, out _, out string? indexOfExpr))
{
// e.g. ".*?string" with RegexOptions.Singleline
// This lazy loop will consume all characters until the subsequent literal. If the subsequent literal
@@ -3592,7 +3666,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true)
// For the loop, we're validating that each char matches the target node.
// For IndexOf, we're looking for the first thing that _doesn't_ match the target node,
// and thus similarly validating that everything does.
- if (TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr))
+ if (TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? indexOfExpr))
{
using (EmitBlock(writer, $"if ({sliceSpan}.Slice({sliceStaticPos}, {iterations}).{indexOfExpr} >= 0)"))
{
@@ -3685,7 +3759,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
TransferSliceStaticPosToPos();
writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;");
}
- else if (maxIterations == int.MaxValue && TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr))
+ else if (maxIterations == int.MaxValue && TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? indexOfExpr))
{
// We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
// purely for simplicity; it could be removed in the future with additional code to handle that case.
@@ -4342,6 +4416,7 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet
/// The resulting expression if it returns true; otherwise, null.
/// true if an expression could be produced; otherwise, false.
private static bool TryEmitIndexOf(
+ Dictionary requiredHelpers,
RegexNode node,
bool useLast, bool negate,
out int literalLength, [NotNullWhen(true)] out string? indexOfExpr)
@@ -4375,8 +4450,22 @@ private static bool TryEmitIndexOf(
bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
Span setChars = stackalloc char[5]; // current max that's vectorized
- int setCharsCount;
- if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
+ int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
+
+ // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
+ if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+ {
+ string indexOfAnyInRangeName = !negated ?
+ "IndexOfAnyInRange" :
+ "IndexOfAnyExceptInRange";
+
+ indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})";
+
+ literalLength = 1;
+ return true;
+ }
+
+ if (setCharsCount > 0)
{
(string indexOfName, string indexOfAnyName) = !negated ?
("IndexOf", "IndexOfAny") :
@@ -4388,20 +4477,20 @@ private static bool TryEmitIndexOf(
1 => $"{last}{indexOfName}({Literal(setChars[0])})",
2 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])})",
3 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})",
- _ => $"{last}{indexOfAnyName}({Literal(setChars.ToString())})",
+ _ => $"{last}{indexOfAnyName}({EmitIndexOfAnyValuesOrLiteral(setChars, requiredHelpers)})",
};
literalLength = 1;
return true;
}
- if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+ if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
{
- string indexOfAnyInRangeName = !negated ?
- "IndexOfAnyInRange" :
- "IndexOfAnyExceptInRange";
+ string indexOfAnyName = !negated ?
+ "IndexOfAny" :
+ "IndexOfAnyExcept";
- indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})";
+ indexOfExpr = $"{last}{indexOfAnyName}({EmitIndexOfAnyValues(asciiChars, requiredHelpers)})";
literalLength = 1;
return true;
@@ -4985,14 +5074,11 @@ RegexNodeKind.BackreferenceConditional when node.Parent.Child(1) == node => "Not
_ => "",
};
- // Get a textual description of the node, making it safe for an XML comment (escaping the minimal amount necessary to
- // avoid compilation failures: we don't want to escape single and double quotes, as HtmlEncode would do).
string nodeDescription = DescribeNode(node, rm);
- nodeDescription = nodeDescription.Replace("&", "&").Replace("<", "<").Replace(">", ">");
// Write out the line for the node.
const char BulletPoint = '\u25CB';
- writer.WriteLine($"/// {new string(' ', depth * 4)}{BulletPoint} {tag}{nodeDescription}
");
+ writer.WriteLine($"/// {new string(' ', depth * 4)}{BulletPoint} {tag}{EscapeXmlComment(nodeDescription)}
");
}
// Process each child.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs
index ed506320a1a8f6..2dd5c6d0d551b5 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs
@@ -198,6 +198,7 @@ x.Options is CSharpCompilationOptions options ?
// a user's partial type. We can now rely on binding rules mapping to these usings and don't need to
// use global-qualified names for the rest of the implementation.
writer.WriteLine($" using System;");
+ writer.WriteLine($" using System.Buffers;");
writer.WriteLine($" using System.CodeDom.Compiler;");
writer.WriteLine($" using System.Collections;");
writer.WriteLine($" using System.ComponentModel;");
@@ -240,7 +241,7 @@ x.Options is CSharpCompilationOptions options ?
writer.WriteLine($"{{");
writer.Indent++;
bool sawFirst = false;
- foreach (KeyValuePair helper in requiredHelpers)
+ foreach (KeyValuePair helper in requiredHelpers.OrderBy(h => h.Key, StringComparer.Ordinal))
{
if (sawFirst)
{
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs
index a0e66e369ec978..35171934777974 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Buffers;
using System.Globalization;
namespace System.Text.RegularExpressions
@@ -8,6 +9,9 @@ namespace System.Text.RegularExpressions
internal sealed class CompiledRegexRunner : RegexRunner
{
private readonly ScanDelegate _scanMethod;
+
+ private readonly IndexOfAnyValues<char>[]? _indexOfAnyValues; // never read in this class; RegexCompiler resolves it by name via reflection (GetField("_indexOfAnyValues")), so keep the name in sync
+
/// This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase
private readonly CultureInfo? _culture;
@@ -19,9 +23,10 @@ internal sealed class CompiledRegexRunner : RegexRunner
internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan<char> text);
- public CompiledRegexRunner(ScanDelegate scan, CultureInfo? culture)
+ public CompiledRegexRunner(ScanDelegate scan, IndexOfAnyValues<char>[]? indexOfAnyValues, CultureInfo? culture)
{
_scanMethod = scan;
+ _indexOfAnyValues = indexOfAnyValues;
_culture = culture;
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs
index b7ec852f4cdbe8..5d21799b339fe9 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Buffers;
using System.Globalization;
using System.Reflection.Emit;
@@ -9,20 +10,22 @@ namespace System.Text.RegularExpressions
internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory
{
private readonly DynamicMethod _scanMethod;
+ private readonly IndexOfAnyValues<char>[]? _indexOfAnyValues;
/// <summary>This field will only be set if the pattern has backreferences and uses RegexOptions.IgnoreCase</summary>
private readonly CultureInfo? _culture;
// Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed.
private CompiledRegexRunner.ScanDelegate? _scan;
- public CompiledRegexRunnerFactory(DynamicMethod scanMethod, CultureInfo? culture)
+ public CompiledRegexRunnerFactory(DynamicMethod scanMethod, IndexOfAnyValues<char>[]? indexOfAnyValues, CultureInfo? culture)
{
_scanMethod = scanMethod;
+ _indexOfAnyValues = indexOfAnyValues;
_culture = culture;
}
protected internal override RegexRunner CreateInstance() =>
new CompiledRegexRunner(
- _scan ??= _scanMethod.CreateDelegate<CompiledRegexRunner.ScanDelegate>(), _culture);
+ _scan ??= _scanMethod.CreateDelegate<CompiledRegexRunner.ScanDelegate>(), _indexOfAnyValues, _culture);
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index 894536bcd976fb..e2888ff7a650bf 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -3,6 +3,7 @@
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Numerics;
using System.Runtime.CompilerServices;
@@ -840,6 +841,22 @@ public static int GetSetChars(string set, Span chars)
return count;
}
+ public static bool TryGetAsciiSetChars(string set, [NotNullWhen(true)] out char[]? asciiChars) // true (with the set's chars) only when GetSetChars yields a non-empty, all-ASCII list
+ {
+ Span<char> chars = stackalloc char[128]; // scratch buffer handed to GetSetChars
+
+ chars = chars.Slice(0, GetSetChars(set, chars)); // keep only the slots GetSetChars reported as filled
+
+ if (chars.IsEmpty || !IsAscii(chars))
+ {
+ asciiChars = null;
+ return false;
+ }
+
+ asciiChars = chars.ToArray();
+ return true;
+ }
+
///
/// Determines whether two sets may overlap.
///
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 0b363278a5d0b5..e94567b7ac9cca 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Buffers;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
@@ -23,6 +24,7 @@ internal abstract class RegexCompiler
private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack");
private static readonly FieldInfo s_cultureField = typeof(CompiledRegexRunner).GetField("_culture", BindingFlags.Instance | BindingFlags.NonPublic)!;
private static readonly FieldInfo s_caseBehaviorField = typeof(CompiledRegexRunner).GetField("_caseBehavior", BindingFlags.Instance | BindingFlags.NonPublic)!;
+ private static readonly FieldInfo s_indexOfAnyValuesArrayField = typeof(CompiledRegexRunner).GetField("_indexOfAnyValues", BindingFlags.Instance | BindingFlags.NonPublic)!;
private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture");
private static readonly MethodInfo s_transferCaptureMethod = RegexRunnerMethod("TransferCapture");
@@ -65,21 +67,25 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanIndexOfAnyIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyExceptChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanIndexOfAnyExceptIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanLastIndexOfAnyIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyExceptChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanLastIndexOfAnyExceptIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanLastIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int) })!;
@@ -103,6 +109,9 @@ internal abstract class RegexCompiler
/// Whether this expression has a non-infinite timeout.
protected bool _hasTimeout;
+ /// <summary><see cref="IndexOfAnyValues{T}"/> instances used by the expression. For now these are only ASCII sets.</summary>
+ protected List<IndexOfAnyValues<char>>? _indexOfAnyValues;
+
/// Pool of Int32 LocalBuilders.
private Stack? _int32LocalsPool;
/// Pool of ReadOnlySpan of char locals.
@@ -829,7 +838,7 @@ void EmitFixedSet_LeftToRight()
int setIndex = 0;
bool canUseIndexOf =
primarySet.Set != RegexCharClass.NotNewLineClass &&
- (primarySet.Chars is not null || primarySet.Range is not null);
+ (primarySet.Chars is not null || primarySet.Range is not null || primarySet.AsciiSet is not null);
bool needLoop = !canUseIndexOf || setsToUse > 1;
Label checkSpanLengthLabel = default;
@@ -877,9 +886,11 @@ void EmitFixedSet_LeftToRight()
Ldloc(textSpanLocal);
}
+ Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
+
if (primarySet.Chars is not null)
{
- switch (primarySet.Chars!.Length)
+ switch (primarySet.Chars.Length)
{
case 1:
// tmp = ...IndexOf(setChars[0]);
@@ -909,20 +920,26 @@ void EmitFixedSet_LeftToRight()
break;
}
}
+ else if (primarySet.AsciiSet is not null)
+ {
+ Debug.Assert(!primarySet.Negated);
+ LoadIndexOfAnyValues(primarySet.AsciiSet);
+ Call(s_spanIndexOfAnyIndexOfAnyValues);
+ }
else
{
if (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive)
{
// tmp = ...IndexOf{AnyExcept}(low);
- Ldc(primarySet.Range!.Value.LowInclusive);
- Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
+ Ldc(primarySet.Range.Value.LowInclusive);
+ Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
}
else
{
// tmp = ...IndexOfAny{Except}InRange(low, high);
- Ldc(primarySet.Range!.Value.LowInclusive);
+ Ldc(primarySet.Range.Value.LowInclusive);
Ldc(primarySet.Range.Value.HighInclusive);
- Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange);
+ Call(primarySet.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange);
}
}
@@ -3385,6 +3402,7 @@ node.Kind is RegexNodeKind.Notonelazy &&
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
(literal.String is not null ||
literal.SetChars is not null ||
+ (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
literal.Range.LowInclusive == literal.Range.HighInclusive ||
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
{
@@ -3457,6 +3475,19 @@ literal.SetChars is not null ||
break;
}
}
+ else if (literal.AsciiChars is not null) // set of only ASCII characters
+ {
+ char[] asciiChars = literal.AsciiChars;
+ overlap = asciiChars.AsSpan().Contains(node.Ch);
+ if (!overlap)
+ {
+ Debug.Assert(node.Ch < 128);
+ Array.Resize(ref asciiChars, asciiChars.Length + 1);
+ asciiChars[^1] = node.Ch;
+ }
+ LoadIndexOfAnyValues(asciiChars);
+ Call(s_spanIndexOfAnyIndexOfAnyValues);
+ }
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
{
overlap = literal.Range.LowInclusive == node.Ch;
@@ -4929,6 +4960,12 @@ bool CanEmitIndexOf(RegexNode node, out int literalLength)
literalLength = 1;
return true;
}
+
+ if (RegexCharClass.TryGetAsciiSetChars(node.Str, out _))
+ {
+ literalLength = 1;
+ return true;
+ }
}
literalLength = 0;
@@ -4975,10 +5012,40 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
{
bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
- // IndexOfAny{Except}(ch1, ...)
Span setChars = stackalloc char[5]; // current max that's vectorized
- int setCharsCount;
- if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
+ int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
+
+ // IndexOfAny{Except}InRange
+ // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
+ if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+ {
+ if (lowInclusive == highInclusive)
+ {
+ Ldc(lowInclusive);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfChar,
+ (false, true) => s_spanIndexOfAnyExceptChar,
+ (true, false) => s_spanLastIndexOfChar,
+ (true, true) => s_spanLastIndexOfAnyExceptChar,
+ });
+ return;
+ }
+
+ Ldc(lowInclusive);
+ Ldc(highInclusive);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfAnyInRange,
+ (false, true) => s_spanIndexOfAnyExceptInRange,
+ (true, false) => s_spanLastIndexOfAnyInRange,
+ (true, true) => s_spanLastIndexOfAnyExceptInRange,
+ });
+ return;
+ }
+
+ // IndexOfAny{Except}(ch1, ...)
+ if (setCharsCount > 0)
{
setChars = setChars.Slice(0, setCharsCount);
switch (setChars.Length)
@@ -5033,30 +5100,16 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
}
}
- // IndexOfAny{Except}InRange
- if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+ // IndexOfAny{Except}(IndexOfAnyValues)
+ if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
{
- if (lowInclusive == highInclusive)
- {
- Ldc(lowInclusive);
- Call((useLast, negated) switch
- {
- (false, false) => s_spanIndexOfChar,
- (false, true) => s_spanIndexOfAnyExceptChar,
- (true, false) => s_spanLastIndexOfChar,
- (true, true) => s_spanLastIndexOfAnyExceptChar,
- });
- return;
- }
-
- Ldc(lowInclusive);
- Ldc(highInclusive);
+ LoadIndexOfAnyValues(asciiChars);
Call((useLast, negated) switch
{
- (false, false) => s_spanIndexOfAnyInRange,
- (false, true) => s_spanIndexOfAnyExceptInRange,
- (true, false) => s_spanLastIndexOfAnyInRange,
- (true, true) => s_spanLastIndexOfAnyExceptInRange,
+ (false, false) => s_spanIndexOfAnyIndexOfAnyValues,
+ (false, true) => s_spanIndexOfAnyExceptIndexOfAnyValues,
+ (true, false) => s_spanLastIndexOfAnyIndexOfAnyValues,
+ (true, true) => s_spanLastIndexOfAnyExceptIndexOfAnyValues,
});
return;
}
@@ -5951,5 +6004,20 @@ private void EmitTimeoutCheckIfNeeded()
Call(s_checkTimeoutMethod);
}
}
+
+ /// <summary>
+ /// Adds an entry in <see cref="_indexOfAnyValues"/> for the given <paramref name="chars"/> and emits a load of that initialized value.
+ /// </summary>
+ private void LoadIndexOfAnyValues(char[] chars)
+ {
+ List<IndexOfAnyValues<char>> list = _indexOfAnyValues ??= new();
+ int index = list.Count; // index the new entry will occupy once appended
+ list.Add(IndexOfAnyValues.Create(chars));
+
+ // this._indexOfAnyValues[index]
+ Ldthisfld(s_indexOfAnyValuesArrayField);
+ Ldc(index);
+ _ilg!.Emit(OpCodes.Ldelem_Ref);
+ }
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
index 099073da0cb822..d02c74a70c7b6e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -241,12 +241,16 @@ public FixedDistanceSet(char[]? chars, string set, int distance)
/// <summary>The character class description.</summary>
public string Set;
+ /// <summary>Whether the <see cref="Set"/> is negated.</summary>
+ public bool Negated;
/// <summary>Small list of all of the characters that make up the set, if known; otherwise, null.</summary>
public char[]? Chars;
/// <summary>The distance of the set from the beginning of the match.</summary>
public int Distance;
/// <summary>As an alternative to <see cref="Chars"/>, a description of the single range the set represents, if it does.</summary>
- public (char LowInclusive, char HighInclusive, bool Negated)? Range;
+ public (char LowInclusive, char HighInclusive)? Range;
+ /// <summary>As an alternative to <see cref="Chars"/>, a description of the set of ASCII characters it represents, if it does.</summary>
+ public char[]? AsciiSet;
}
/// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.
@@ -271,7 +275,7 @@ private static (string String, int Distance)? FindFixedDistanceString(List) });
EmitScan(options, tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod);
- return new CompiledRegexRunnerFactory(scanMethod, regexTree.Culture);
+ return new CompiledRegexRunnerFactory(scanMethod, _indexOfAnyValues?.ToArray(), regexTree.Culture);
}
/// Begins the definition of a new method (no args) with a specified return value.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index a94be746767a0d..d659026d0ae522 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -1428,10 +1428,10 @@ public char FirstCharOfOneOrMulti()
switch (node.Kind)
{
case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy:
- return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false);
+ return new StartingLiteralData(range: (node.Ch, node.Ch), negated: false);
case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy:
- return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true);
+ return new StartingLiteralData(range: (node.Ch, node.Ch), negated: true);
case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy:
Span setChars = stackalloc char[maxSetCharacters];
@@ -1439,18 +1439,23 @@ public char FirstCharOfOneOrMulti()
if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
{
setChars = setChars.Slice(0, numChars);
- return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!));
+ return new StartingLiteralData(setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!));
}
if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive))
{
Debug.Assert(lowInclusive < highInclusive);
- return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!));
+ return new StartingLiteralData(range: (lowInclusive, highInclusive), negated: RegexCharClass.IsNegated(node.Str!));
+ }
+
+ if (RegexCharClass.TryGetAsciiSetChars(node.Str!, out char[]? asciiChars))
+ {
+ return new StartingLiteralData(asciiChars: asciiChars, negated: RegexCharClass.IsNegated(node.Str!));
}
break;
case RegexNodeKind.Multi:
- return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false);
+ return new StartingLiteralData(@string: node.Str);
}
}
@@ -1463,15 +1468,34 @@ public readonly struct StartingLiteralData
public readonly (char LowInclusive, char HighInclusive) Range;
public readonly string? String;
public readonly string? SetChars;
+ public readonly char[]? AsciiChars;
public readonly bool Negated;
- public StartingLiteralData((char LowInclusive, char HighInclusive) range, string? @string, string? setChars, bool negated)
+ public StartingLiteralData((char LowInclusive, char HighInclusive) range, bool negated)
{
Range = range;
+ Negated = negated;
+ }
+
+ public StartingLiteralData(string? @string)
+ {
+ Debug.Assert(@string is not null);
String = @string;
+ }
+
+ public StartingLiteralData(string? setChars, bool negated)
+ {
+ Debug.Assert(setChars is not null);
SetChars = setChars;
Negated = negated;
}
+
+ public StartingLiteralData(char[]? asciiChars, bool negated)
+ {
+ Debug.Assert(asciiChars is not null);
+ AsciiChars = asciiChars;
+ Negated = negated;
+ }
}
///
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index cacf02d321ed53..0e0badd650ebfd 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -200,26 +200,30 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
for (int i = 0; i < results.Count; i++)
{
RegexFindOptimizations.FixedDistanceSet result = results[i];
- bool negated = RegexCharClass.IsNegated(result.Set);
+ result.Negated = RegexCharClass.IsNegated(result.Set);
- if (!negated)
+ int count = RegexCharClass.GetSetChars(result.Set, scratch);
+
+ if (!result.Negated && count > 0)
{
- int count = RegexCharClass.GetSetChars(result.Set, scratch);
- if (count != 0)
- {
- result.Chars = scratch.Slice(0, count).ToArray();
- results[i] = result;
- }
+ result.Chars = scratch.Slice(0, count).ToArray();
}
- if (thorough && result.Chars is null)
+ if (thorough)
{
- if (RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
+ // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
+ if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
+ {
+ result.Chars = null;
+ result.Range = (lowInclusive, highInclusive);
+ }
+ else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars))
{
- result.Range = (lowInclusive, highInclusive, negated);
- results[i] = result;
+ result.AsciiSet = asciiChars;
}
}
+
+ results[i] = result;
}
return results;
@@ -435,18 +439,38 @@ static bool TryFindFixedSets(RegexNode node, List results) =>
// Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search
// for the fastest and that have the best chance of matching as few false positives as possible.
- results.Sort((s1, s2) =>
+ results.Sort(static (s1, s2) =>
{
+ char[]? s1Chars = s1.Chars ?? s1.AsciiSet;
+ char[]? s2Chars = s2.Chars ?? s2.AsciiSet;
+ int s1CharsLength = s1Chars?.Length ?? 0;
+ int s2CharsLength = s2Chars?.Length ?? 0;
+ bool s1Negated = s1.Negated;
+ bool s2Negated = s2.Negated;
+ int s1RangeLength = s1.Range is not null ? GetRangeLength(s1.Range.Value, s1Negated) : 0;
+ int s2RangeLength = s2.Range is not null ? GetRangeLength(s2.Range.Value, s2Negated) : 0;
+
+ Debug.Assert(!s1Negated || s1Chars is null);
+ Debug.Assert(!s2Negated || s2Chars is null);
+
// If both have chars, prioritize the one with the smaller frequency for those chars.
- if (s1.Chars is not null && s2.Chars is not null)
+ if (s1Chars is not null && s2Chars is not null)
{
- // Then of the ones that are the same length, prefer those with less frequent values. The frequency is
- // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True
- // frequencies will vary widely based on the actual data being searched, the language of the data, etc.
- int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars));
- if (c != 0)
+ // Prefer sets with less frequent values. The frequency is only an approximation,
+ // used as a tie-breaker when we'd otherwise effectively be picking randomly.
+ // True frequencies will vary widely based on the actual data being searched, the language of the data, etc.
+ float s1Frequency = SumFrequencies(s1Chars);
+ float s2Frequency = SumFrequencies(s2Chars);
+
+ if (s1Frequency != s2Frequency)
{
- return c;
+ return s1Frequency.CompareTo(s2Frequency);
+ }
+
+ if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars))
+ {
+ // Prefer the set with fewer values.
+ return s1CharsLength.CompareTo(s2CharsLength);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -455,45 +479,59 @@ static float SumFrequencies(char[] chars)
float sum = 0;
foreach (char c in chars)
{
- // Lookup each character in the table. For values > 255, this will end up truncating
+ // Lookup each character in the table. Values >= 128 are ignored
// and thus we'll get skew in the data. It's already a gross approximation, though,
// and it is primarily meant for disambiguation of ASCII letters.
- sum += s_frequency[(byte)c];
+ if (c < 128)
+ {
+ sum += s_frequency[c];
+ }
}
return sum;
}
}
+ // If one has chars and the other has a range, prefer the shorter set.
+ if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0))
+ {
+ int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength));
+ if (c != 0)
+ {
+ return c;
+ }
+
+ // If lengths are the same, prefer the chars.
+ return s1CharsLength > 0 ? -1 : 1;
+ }
+
// If one has chars and the other doesn't, prioritize the one with chars.
- if ((s1.Chars is not null) != (s2.Chars is not null))
+ if ((s1CharsLength > 0) != (s2CharsLength > 0))
{
- return s1.Chars is not null ? -1 : 1;
+ return s1CharsLength > 0 ? -1 : 1;
}
// If one has a range and the other doesn't, prioritize the one with a range.
- if ((s1.Range is not null) != (s2.Range is not null))
+ if ((s1RangeLength > 0) != (s2RangeLength > 0))
{
- return s1.Range is not null ? -1 : 1;
+ return s1RangeLength > 0 ? -1 : 1;
}
// If both have ranges, prefer the one that includes fewer characters.
- if (s1.Range is not null)
+ if (s1RangeLength > 0)
{
- return
- GetRangeLength(s1.Range.GetValueOrDefault()).CompareTo(
- GetRangeLength(s2.Range.GetValueOrDefault()));
-
- static int GetRangeLength((char LowInclusive, char HighInclusive, bool Negated) range)
- {
- int length = range.HighInclusive - range.LowInclusive + 1;
- return range.Negated ?
- char.MaxValue + 1 - length :
- length;
- }
+ return s1RangeLength.CompareTo(s2RangeLength);
}
// As a tiebreaker, prioritize the earlier one.
return s1.Distance.CompareTo(s2.Distance);
+
+ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool negated)
+ {
+ int length = range.HighInclusive - range.LowInclusive + 1;
+ return negated ?
+ char.MaxValue + 1 - length :
+ length;
+ }
});
///
@@ -908,22 +946,6 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
1.024f /* ' h' */, 3.750f /* ' i' */, 0.286f /* ' j' */, 0.439f /* ' k' */, 2.913f /* ' l' */, 1.459f /* ' m' */, 3.908f /* ' n' */, 3.230f /* ' o' */,
1.444f /* ' p' */, 0.231f /* ' q' */, 4.220f /* ' r' */, 3.924f /* ' s' */, 5.312f /* ' t' */, 2.112f /* ' u' */, 0.737f /* ' v' */, 0.573f /* ' w' */,
0.992f /* ' x' */, 1.067f /* ' y' */, 0.181f /* ' z' */, 0.391f /* ' {' */, 0.056f /* ' |' */, 0.391f /* ' }' */, 0.002f /* ' ~' */, 0.000f /* '\x7F' */,
- 0.000f /* '\x80' */, 0.000f /* '\x81' */, 0.000f /* '\x82' */, 0.000f /* '\x83' */, 0.000f /* '\x84' */, 0.000f /* '\x85' */, 0.000f /* '\x86' */, 0.000f /* '\x87' */,
- 0.000f /* '\x88' */, 0.000f /* '\x89' */, 0.000f /* '\x8A' */, 0.000f /* '\x8B' */, 0.000f /* '\x8C' */, 0.000f /* '\x8D' */, 0.000f /* '\x8E' */, 0.000f /* '\x8F' */,
- 0.000f /* '\x90' */, 0.000f /* '\x91' */, 0.000f /* '\x92' */, 0.000f /* '\x93' */, 0.000f /* '\x94' */, 0.000f /* '\x95' */, 0.000f /* '\x96' */, 0.000f /* '\x97' */,
- 0.000f /* '\x98' */, 0.000f /* '\x99' */, 0.000f /* '\x9A' */, 0.000f /* '\x9B' */, 0.000f /* '\x9C' */, 0.000f /* '\x9D' */, 0.000f /* '\x9E' */, 0.000f /* '\x9F' */,
- 0.000f /* '\xA0' */, 0.000f /* '\xA1' */, 0.000f /* '\xA2' */, 0.000f /* '\xA3' */, 0.000f /* '\xA4' */, 0.000f /* '\xA5' */, 0.000f /* '\xA6' */, 0.000f /* '\xA7' */,
- 0.000f /* '\xA8' */, 0.000f /* '\xA9' */, 0.000f /* '\xAA' */, 0.000f /* '\xAB' */, 0.000f /* '\xAC' */, 0.000f /* '\xAD' */, 0.000f /* '\xAE' */, 0.000f /* '\xAF' */,
- 0.000f /* '\xB0' */, 0.000f /* '\xB1' */, 0.000f /* '\xB2' */, 0.000f /* '\xB3' */, 0.000f /* '\xB4' */, 0.000f /* '\xB5' */, 0.000f /* '\xB6' */, 0.000f /* '\xB7' */,
- 0.000f /* '\xB8' */, 0.000f /* '\xB9' */, 0.000f /* '\xBA' */, 0.000f /* '\xBB' */, 0.000f /* '\xBC' */, 0.000f /* '\xBD' */, 0.000f /* '\xBE' */, 0.000f /* '\xBF' */,
- 0.000f /* '\xC0' */, 0.000f /* '\xC1' */, 0.000f /* '\xC2' */, 0.000f /* '\xC3' */, 0.000f /* '\xC4' */, 0.000f /* '\xC5' */, 0.000f /* '\xC6' */, 0.000f /* '\xC7' */,
- 0.000f /* '\xC8' */, 0.000f /* '\xC9' */, 0.000f /* '\xCA' */, 0.000f /* '\xCB' */, 0.000f /* '\xCC' */, 0.000f /* '\xCD' */, 0.000f /* '\xCE' */, 0.000f /* '\xCF' */,
- 0.000f /* '\xD0' */, 0.000f /* '\xD1' */, 0.000f /* '\xD2' */, 0.000f /* '\xD3' */, 0.000f /* '\xD4' */, 0.000f /* '\xD5' */, 0.000f /* '\xD6' */, 0.000f /* '\xD7' */,
- 0.000f /* '\xD8' */, 0.000f /* '\xD9' */, 0.000f /* '\xDA' */, 0.000f /* '\xDB' */, 0.000f /* '\xDC' */, 0.000f /* '\xDD' */, 0.000f /* '\xDE' */, 0.000f /* '\xDF' */,
- 0.000f /* '\xE0' */, 0.000f /* '\xE1' */, 0.000f /* '\xE2' */, 0.000f /* '\xE3' */, 0.000f /* '\xE4' */, 0.000f /* '\xE5' */, 0.000f /* '\xE6' */, 0.000f /* '\xE7' */,
- 0.000f /* '\xE8' */, 0.000f /* '\xE9' */, 0.000f /* '\xEA' */, 0.000f /* '\xEB' */, 0.000f /* '\xEC' */, 0.000f /* '\xED' */, 0.000f /* '\xEE' */, 0.000f /* '\xEF' */,
- 0.000f /* '\xF0' */, 0.000f /* '\xF1' */, 0.000f /* '\xF2' */, 0.000f /* '\xF3' */, 0.000f /* '\xF4' */, 0.000f /* '\xF5' */, 0.000f /* '\xF6' */, 0.000f /* '\xF7' */,
- 0.000f /* '\xF8' */, 0.000f /* '\xF9' */, 0.000f /* '\xFA' */, 0.000f /* '\xFB' */, 0.000f /* '\xFC' */, 0.000f /* '\xFD' */, 0.000f /* '\xFE' */, 0.000f /* '\xFF' */,
};
// The above table was generated programmatically with the following. This can be augmented to incorporate additional data sources,
@@ -953,7 +975,7 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
// Console.WriteLine("private static readonly float[] s_frequency = new float[]");
// Console.WriteLine("{");
// int i = 0;
- // for (int row = 0; row < 32; row++)
+ // for (int row = 0; row < 16; row++)
// {
// Console.Write(" ");
// for (int col = 0; col < 8; col++)
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index b75a249d6b5555..4c2294f0c0669a 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -382,6 +382,9 @@ public static IEnumerable