diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index da78325562cd32..ee255e40f7f9c2 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -22,6 +22,10 @@ namespace System.Text.RegularExpressions.Generator { public partial class RegexGenerator { + /// Escapes '&', '<' and '>' characters. We aren't using HtmlEncode as that would also escape single and double quotes. + private static string EscapeXmlComment(string text) => + text.Replace("&", "&").Replace("<", "<").Replace(">", ">"); + /// Emits the definition of the partial method. This method just delegates to the property cache on the generated Regex-derived type. private static void EmitRegexPartialMethod(RegexMethod regexMethod, IndentedTextWriter writer) { @@ -363,6 +367,60 @@ private static void AddIsECMABoundaryHelper(Dictionary require } } + /// Adds an IndexOfAnyValues instance declaration to the required helpers collection if the chars are ASCII. + private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan chars, Dictionary requiredHelpers) + { + // IndexOfAnyValues is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII. + // Only emit IndexOfAnyValues instances when we know they'll be faster to avoid increasing the startup cost too much. + Debug.Assert(chars.Length is 4 or 5); + + return RegexCharClass.IsAscii(chars) + ? EmitIndexOfAnyValues(chars.ToArray(), requiredHelpers) + : Literal(chars.ToString()); + } + + /// Adds an IndexOfAnyValues instance declaration to the required helpers collection. + private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary requiredHelpers) + { + Debug.Assert(RegexCharClass.IsAscii(asciiChars)); + + // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. + byte[] bitmap = new byte[16]; + foreach (char c in asciiChars) + { + bitmap[c >> 3] |= (byte)(1 << (c & 7)); + } + + string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty); + + string fieldName = hexBitmap switch + { + "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters", + "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits", + "000000000000FF037E0000007E000000" => "s_asciiHexDigits", + "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower", + "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper", + _ => $"s_ascii_{hexBitmap.TrimStart('0')}" + }; + + string helperName = $"IndexOfAnyValues_{fieldName}"; + + if (!requiredHelpers.ContainsKey(helperName)) + { + Array.Sort(asciiChars); + + string setLiteral = Literal(new string(asciiChars)); + + requiredHelpers.Add(helperName, new string[] + { + $"/// Cached data to efficiently search for a character in the set {EscapeXmlComment(setLiteral)}.", + $"internal static readonly IndexOfAnyValues {fieldName} = IndexOfAnyValues.Create({setLiteral});", + }); + } + + return $"{HelpersTypeName}.{fieldName}"; + } + /// Emits the body of the Scan method override. private static (bool NeedsTryFind, bool NeedsTryMatch) EmitScan(IndentedTextWriter writer, RegexMethod rm) { @@ -810,7 +868,7 @@ void EmitFixedSet_LeftToRight() int setIndex = 0; bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass && - (primarySet.Chars is not null || primarySet.Range is not null); + (primarySet.Chars is not null || primarySet.Range is not null || primarySet.AsciiSet is not null); bool needLoop = !canUseIndexOf || setsToUse > 1; FinishEmitBlock loopBlock = default; @@ -835,15 +893,18 @@ void EmitFixedSet_LeftToRight() (true, _) => $"{span}.Slice(i + {primarySet.Distance})", }; + Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null)); + string indexOf = - primarySet.Chars is not null ? primarySet.Chars!.Length switch + primarySet.Chars is not null ? primarySet.Chars.Length switch { 1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", 2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", 3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", - _ => $"{span}.IndexOfAny({Literal(new string(primarySet.Chars))})", + _ => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})", } : - (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Range.Value.Negated) switch + primarySet.AsciiSet is not null ? $"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet, requiredHelpers)})" : + (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch { (false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})", (true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})", @@ -1010,7 +1071,7 @@ void EmitLiteralAfterAtomicLoop() { 2 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])});", 3 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])}, {Literal(literalChars[2])});", - _ => $"IndexOfAny({Literal(new string(literalChars))});", + _ => $"IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literalChars, requiredHelpers)});", }); FinishEmitBlock indexOfFoundBlock = default; @@ -2920,7 +2981,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL if (!rtl && node.N > 1 && // no point in using IndexOf for small loops, in particular optionals subsequent?.FindStartingLiteralNode() is RegexNode literalNode && - TryEmitIndexOf(literalNode, useLast: true, negate: false, out int literalLength, out string? indexOfExpr)) + TryEmitIndexOf(requiredHelpers, literalNode, useLast: true, negate: false, out int literalLength, out string? indexOfExpr)) { writer.WriteLine($"if ({startingPos} >= {endingPos} ||"); @@ -3079,6 +3140,7 @@ node.Kind is RegexNodeKind.Notonelazy && !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method (literal.String is not null || literal.SetChars is not null || + (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set literal.Range.LowInclusive == literal.Range.HighInclusive || (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union { @@ -3104,12 +3166,24 @@ literal.SetChars is not null || { (true, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});", (true, 3) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])});", - (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars)});", + (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literal.SetChars.AsSpan(), requiredHelpers)});", (false, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});", - (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});", + (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral($"{node.Ch}{literal.SetChars}".AsSpan(), requiredHelpers)});", }); } + else if (literal.AsciiChars is not null) // set of only ASCII characters + { + char[] asciiChars = literal.AsciiChars; + overlap = asciiChars.Contains(node.Ch); + if (!overlap) + { + Debug.Assert(node.Ch < 128); + Array.Resize(ref asciiChars, asciiChars.Length + 1); + asciiChars[asciiChars.Length - 1] = node.Ch; + } + writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValues(asciiChars, requiredHelpers)});"); + } else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; @@ -3144,7 +3218,7 @@ literal.SetChars is not null || node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && subsequent?.FindStartingLiteralNode() is RegexNode literal2 && - TryEmitIndexOf(literal2, useLast: false, negate: false, out _, out string? indexOfExpr)) + TryEmitIndexOf(requiredHelpers, literal2, useLast: false, negate: false, out _, out string? indexOfExpr)) { // e.g. ".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal @@ -3592,7 +3666,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) // For the loop, we're validating that each char matches the target node. // For IndexOf, we're looking for the first thing that _doesn't_ match the target node, // and thus similarly validating that everything does. - if (TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr)) + if (TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? indexOfExpr)) { using (EmitBlock(writer, $"if ({sliceSpan}.Slice({sliceStaticPos}, {iterations}).{indexOfExpr} >= 0)")) { @@ -3685,7 +3759,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = TransferSliceStaticPosToPos(); writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;"); } - else if (maxIterations == int.MaxValue && TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr)) + else if (maxIterations == int.MaxValue && TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? indexOfExpr)) { // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is // purely for simplicity; it could be removed in the future with additional code to handle that case. @@ -4342,6 +4416,7 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet /// The resulting expression if it returns true; otherwise, null. /// true if an expression could be produced; otherwise, false. private static bool TryEmitIndexOf( + Dictionary requiredHelpers, RegexNode node, bool useLast, bool negate, out int literalLength, [NotNullWhen(true)] out string? indexOfExpr) @@ -4375,8 +4450,22 @@ private static bool TryEmitIndexOf( bool negated = RegexCharClass.IsNegated(node.Str) ^ negate; Span setChars = stackalloc char[5]; // current max that's vectorized - int setCharsCount; - if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars); + + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. + if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + string indexOfAnyInRangeName = !negated ? + "IndexOfAnyInRange" : + "IndexOfAnyExceptInRange"; + + indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})"; + + literalLength = 1; + return true; + } + + if (setCharsCount > 0) { (string indexOfName, string indexOfAnyName) = !negated ? ("IndexOf", "IndexOfAny") : @@ -4388,20 +4477,20 @@ private static bool TryEmitIndexOf( 1 => $"{last}{indexOfName}({Literal(setChars[0])})", 2 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])})", 3 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})", - _ => $"{last}{indexOfAnyName}({Literal(setChars.ToString())})", + _ => $"{last}{indexOfAnyName}({EmitIndexOfAnyValuesOrLiteral(setChars, requiredHelpers)})", }; literalLength = 1; return true; } - if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars)) { - string indexOfAnyInRangeName = !negated ? - "IndexOfAnyInRange" : - "IndexOfAnyExceptInRange"; + string indexOfAnyName = !negated ? + "IndexOfAny" : + "IndexOfAnyExcept"; - indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})"; + indexOfExpr = $"{last}{indexOfAnyName}({EmitIndexOfAnyValues(asciiChars, requiredHelpers)})"; literalLength = 1; return true; @@ -4985,14 +5074,11 @@ RegexNodeKind.BackreferenceConditional when node.Parent.Child(1) == node => "Not _ => "", }; - // Get a textual description of the node, making it safe for an XML comment (escaping the minimal amount necessary to - // avoid compilation failures: we don't want to escape single and double quotes, as HtmlEncode would do). string nodeDescription = DescribeNode(node, rm); - nodeDescription = nodeDescription.Replace("&", "&").Replace("<", "<").Replace(">", ">"); // Write out the line for the node. const char BulletPoint = '\u25CB'; - writer.WriteLine($"/// {new string(' ', depth * 4)}{BulletPoint} {tag}{nodeDescription}
"); + writer.WriteLine($"/// {new string(' ', depth * 4)}{BulletPoint} {tag}{EscapeXmlComment(nodeDescription)}
"); } // Process each child. diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs index ed506320a1a8f6..2dd5c6d0d551b5 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs @@ -198,6 +198,7 @@ x.Options is CSharpCompilationOptions options ? // a user's partial type. We can now rely on binding rules mapping to these usings and don't need to // use global-qualified names for the rest of the implementation. writer.WriteLine($" using System;"); + writer.WriteLine($" using System.Buffers;"); writer.WriteLine($" using System.CodeDom.Compiler;"); writer.WriteLine($" using System.Collections;"); writer.WriteLine($" using System.ComponentModel;"); @@ -240,7 +241,7 @@ x.Options is CSharpCompilationOptions options ? writer.WriteLine($"{{"); writer.Indent++; bool sawFirst = false; - foreach (KeyValuePair helper in requiredHelpers) + foreach (KeyValuePair helper in requiredHelpers.OrderBy(h => h.Key, StringComparer.Ordinal)) { if (sawFirst) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index a0e66e369ec978..35171934777974 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers; using System.Globalization; namespace System.Text.RegularExpressions @@ -8,6 +9,9 @@ namespace System.Text.RegularExpressions internal sealed class CompiledRegexRunner : RegexRunner { private readonly ScanDelegate _scanMethod; + + private readonly IndexOfAnyValues[]? _indexOfAnyValues; + /// This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase private readonly CultureInfo? _culture; @@ -19,9 +23,10 @@ internal sealed class CompiledRegexRunner : RegexRunner internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan text); - public CompiledRegexRunner(ScanDelegate scan, CultureInfo? culture) + public CompiledRegexRunner(ScanDelegate scan, IndexOfAnyValues[]? indexOfAnyValues, CultureInfo? culture) { _scanMethod = scan; + _indexOfAnyValues = indexOfAnyValues; _culture = culture; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs index b7ec852f4cdbe8..5d21799b339fe9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers; using System.Globalization; using System.Reflection.Emit; @@ -9,20 +10,22 @@ namespace System.Text.RegularExpressions internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory { private readonly DynamicMethod _scanMethod; + private readonly IndexOfAnyValues[]? _indexOfAnyValues; /// This field will only be set if the pattern has backreferences and uses RegexOptions.IgnoreCase private readonly CultureInfo? _culture; // Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed. private CompiledRegexRunner.ScanDelegate? _scan; - public CompiledRegexRunnerFactory(DynamicMethod scanMethod, CultureInfo? culture) + public CompiledRegexRunnerFactory(DynamicMethod scanMethod, IndexOfAnyValues[]? indexOfAnyValues, CultureInfo? culture) { _scanMethod = scanMethod; + _indexOfAnyValues = indexOfAnyValues; _culture = culture; } protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner( - _scan ??= _scanMethod.CreateDelegate(), _culture); + _scan ??= _scanMethod.CreateDelegate(), _indexOfAnyValues, _culture); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 894536bcd976fb..e2888ff7a650bf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Numerics; using System.Runtime.CompilerServices; @@ -840,6 +841,22 @@ public static int GetSetChars(string set, Span chars) return count; } + public static bool TryGetAsciiSetChars(string set, [NotNullWhen(true)] out char[]? asciiChars) + { + Span chars = stackalloc char[128]; + + chars = chars.Slice(0, GetSetChars(set, chars)); + + if (chars.IsEmpty || !IsAscii(chars)) + { + asciiChars = null; + return false; + } + + asciiChars = chars.ToArray(); + return true; + } + /// /// Determines whether two sets may overlap. /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 0b363278a5d0b5..e94567b7ac9cca 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers; using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -23,6 +24,7 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack"); private static readonly FieldInfo s_cultureField = typeof(CompiledRegexRunner).GetField("_culture", BindingFlags.Instance | BindingFlags.NonPublic)!; private static readonly FieldInfo s_caseBehaviorField = typeof(CompiledRegexRunner).GetField("_caseBehavior", BindingFlags.Instance | BindingFlags.NonPublic)!; + private static readonly FieldInfo s_indexOfAnyValuesArrayField = typeof(CompiledRegexRunner).GetField("_indexOfAnyValues", BindingFlags.Instance | BindingFlags.NonPublic)!; private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture"); private static readonly MethodInfo s_transferCaptureMethod = RegexRunnerMethod("TransferCapture"); @@ -65,21 +67,25 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnyIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnyExceptIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfAnyIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfAnyExceptIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; @@ -103,6 +109,9 @@ internal abstract class RegexCompiler /// Whether this expression has a non-infinite timeout. protected bool _hasTimeout; + /// instances used by the expression. For now these are only ASCII sets. + protected List>? _indexOfAnyValues; + /// Pool of Int32 LocalBuilders. private Stack? _int32LocalsPool; /// Pool of ReadOnlySpan of char locals. @@ -829,7 +838,7 @@ void EmitFixedSet_LeftToRight() int setIndex = 0; bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass && - (primarySet.Chars is not null || primarySet.Range is not null); + (primarySet.Chars is not null || primarySet.Range is not null || primarySet.AsciiSet is not null); bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; @@ -877,9 +886,11 @@ void EmitFixedSet_LeftToRight() Ldloc(textSpanLocal); } + Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null)); + if (primarySet.Chars is not null) { - switch (primarySet.Chars!.Length) + switch (primarySet.Chars.Length) { case 1: // tmp = ...IndexOf(setChars[0]); @@ -909,20 +920,26 @@ void EmitFixedSet_LeftToRight() break; } } + else if (primarySet.AsciiSet is not null) + { + Debug.Assert(!primarySet.Negated); + LoadIndexOfAnyValues(primarySet.AsciiSet); + Call(s_spanIndexOfAnyIndexOfAnyValues); + } else { if (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive) { // tmp = ...IndexOf{AnyExcept}(low); - Ldc(primarySet.Range!.Value.LowInclusive); - Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); + Ldc(primarySet.Range.Value.LowInclusive); + Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); } else { // tmp = ...IndexOfAny{Except}InRange(low, high); - Ldc(primarySet.Range!.Value.LowInclusive); + Ldc(primarySet.Range.Value.LowInclusive); Ldc(primarySet.Range.Value.HighInclusive); - Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); + Call(primarySet.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); } } @@ -3385,6 +3402,7 @@ node.Kind is RegexNodeKind.Notonelazy && !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method (literal.String is not null || literal.SetChars is not null || + (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set literal.Range.LowInclusive == literal.Range.HighInclusive || (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union { @@ -3457,6 +3475,19 @@ literal.SetChars is not null || break; } } + else if (literal.AsciiChars is not null) // set of only ASCII characters + { + char[] asciiChars = literal.AsciiChars; + overlap = asciiChars.AsSpan().Contains(node.Ch); + if (!overlap) + { + Debug.Assert(node.Ch < 128); + Array.Resize(ref asciiChars, asciiChars.Length + 1); + asciiChars[^1] = node.Ch; + } + LoadIndexOfAnyValues(asciiChars); + Call(s_spanIndexOfAnyIndexOfAnyValues); + } else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; @@ -4929,6 +4960,12 @@ bool CanEmitIndexOf(RegexNode node, out int literalLength) literalLength = 1; return true; } + + if (RegexCharClass.TryGetAsciiSetChars(node.Str, out _)) + { + literalLength = 1; + return true; + } } literalLength = 0; @@ -4975,10 +5012,40 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate) { bool negated = RegexCharClass.IsNegated(node.Str) ^ negate; - // IndexOfAny{Except}(ch1, ...) Span setChars = stackalloc char[5]; // current max that's vectorized - int setCharsCount; - if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars); + + // IndexOfAny{Except}InRange + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. + if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + if (lowInclusive == highInclusive) + { + Ldc(lowInclusive); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfChar, + (false, true) => s_spanIndexOfAnyExceptChar, + (true, false) => s_spanLastIndexOfChar, + (true, true) => s_spanLastIndexOfAnyExceptChar, + }); + return; + } + + Ldc(lowInclusive); + Ldc(highInclusive); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfAnyInRange, + (false, true) => s_spanIndexOfAnyExceptInRange, + (true, false) => s_spanLastIndexOfAnyInRange, + (true, true) => s_spanLastIndexOfAnyExceptInRange, + }); + return; + } + + // IndexOfAny{Except}(ch1, ...) + if (setCharsCount > 0) { setChars = setChars.Slice(0, setCharsCount); switch (setChars.Length) @@ -5033,30 +5100,16 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate) } } - // IndexOfAny{Except}InRange - if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + // IndexOfAny{Except}(IndexOfAnyValues) + if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars)) { - if (lowInclusive == highInclusive) - { - Ldc(lowInclusive); - Call((useLast, negated) switch - { - (false, false) => s_spanIndexOfChar, - (false, true) => s_spanIndexOfAnyExceptChar, - (true, false) => s_spanLastIndexOfChar, - (true, true) => s_spanLastIndexOfAnyExceptChar, - }); - return; - } - - Ldc(lowInclusive); - Ldc(highInclusive); + LoadIndexOfAnyValues(asciiChars); Call((useLast, negated) switch { - (false, false) => s_spanIndexOfAnyInRange, - (false, true) => s_spanIndexOfAnyExceptInRange, - (true, false) => s_spanLastIndexOfAnyInRange, - (true, true) => s_spanLastIndexOfAnyExceptInRange, + (false, false) => s_spanIndexOfAnyIndexOfAnyValues, + (false, true) => s_spanIndexOfAnyExceptIndexOfAnyValues, + (true, false) => s_spanLastIndexOfAnyIndexOfAnyValues, + (true, true) => s_spanLastIndexOfAnyExceptIndexOfAnyValues, }); return; } @@ -5951,5 +6004,20 @@ private void EmitTimeoutCheckIfNeeded() Call(s_checkTimeoutMethod); } } + + /// + /// Adds an entry in for the given and emits a load of that initialized value. + /// + private void LoadIndexOfAnyValues(char[] chars) + { + List> list = _indexOfAnyValues ??= new(); + int index = list.Count; + list.Add(IndexOfAnyValues.Create(chars)); + + // this._indexOfAnyValues[index] + Ldthisfld(s_indexOfAnyValuesArrayField); + Ldc(index); + _ilg!.Emit(OpCodes.Ldelem_Ref); + } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 099073da0cb822..d02c74a70c7b6e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -241,12 +241,16 @@ public FixedDistanceSet(char[]? chars, string set, int distance) /// The character class description. public string Set; + /// Whether the is negated. + public bool Negated; /// Small list of all of the characters that make up the set, if known; otherwise, null. public char[]? Chars; /// The distance of the set from the beginning of the match. public int Distance; /// As an alternative to , a description of the single range the set represents, if it does. - public (char LowInclusive, char HighInclusive, bool Negated)? Range; + public (char LowInclusive, char HighInclusive)? Range; + /// As an alternative to , a description of the set of ASCII characters it represents, if it does. + public char[]? AsciiSet; } /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. @@ -271,7 +275,7 @@ private static (string String, int Distance)? FindFixedDistanceString(List) }); EmitScan(options, tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod); - return new CompiledRegexRunnerFactory(scanMethod, regexTree.Culture); + return new CompiledRegexRunnerFactory(scanMethod, _indexOfAnyValues?.ToArray(), regexTree.Culture); } /// Begins the definition of a new method (no args) with a specified return value. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index a94be746767a0d..d659026d0ae522 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1428,10 +1428,10 @@ public char FirstCharOfOneOrMulti() switch (node.Kind) { case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy: - return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false); + return new StartingLiteralData(range: (node.Ch, node.Ch), negated: false); case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy: - return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true); + return new StartingLiteralData(range: (node.Ch, node.Ch), negated: true); case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy: Span setChars = stackalloc char[maxSetCharacters]; @@ -1439,18 +1439,23 @@ public char FirstCharOfOneOrMulti() if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) { setChars = setChars.Slice(0, numChars); - return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!)); + return new StartingLiteralData(setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!)); } if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive)) { Debug.Assert(lowInclusive < highInclusive); - return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!)); + return new StartingLiteralData(range: (lowInclusive, highInclusive), negated: RegexCharClass.IsNegated(node.Str!)); + } + + if (RegexCharClass.TryGetAsciiSetChars(node.Str!, out char[]? asciiChars)) + { + return new StartingLiteralData(asciiChars: asciiChars, negated: RegexCharClass.IsNegated(node.Str!)); } break; case RegexNodeKind.Multi: - return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false); + return new StartingLiteralData(@string: node.Str); } } @@ -1463,15 +1468,34 @@ public readonly struct StartingLiteralData public readonly (char LowInclusive, char HighInclusive) Range; public readonly string? String; public readonly string? SetChars; + public readonly char[]? AsciiChars; public readonly bool Negated; - public StartingLiteralData((char LowInclusive, char HighInclusive) range, string? @string, string? setChars, bool negated) + public StartingLiteralData((char LowInclusive, char HighInclusive) range, bool negated) { Range = range; + Negated = negated; + } + + public StartingLiteralData(string? @string) + { + Debug.Assert(@string is not null); String = @string; + } + + public StartingLiteralData(string? setChars, bool negated) + { + Debug.Assert(setChars is not null); SetChars = setChars; Negated = negated; } + + public StartingLiteralData(char[]? asciiChars, bool negated) + { + Debug.Assert(asciiChars is not null); + AsciiChars = asciiChars; + Negated = negated; + } } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index cacf02d321ed53..0e0badd650ebfd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -200,26 +200,30 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) for (int i = 0; i < results.Count; i++) { RegexFindOptimizations.FixedDistanceSet result = results[i]; - bool negated = RegexCharClass.IsNegated(result.Set); + result.Negated = RegexCharClass.IsNegated(result.Set); - if (!negated) + int count = RegexCharClass.GetSetChars(result.Set, scratch); + + if (!result.Negated && count > 0) { - int count = RegexCharClass.GetSetChars(result.Set, scratch); - if (count != 0) - { - result.Chars = scratch.Slice(0, count).ToArray(); - results[i] = result; - } + result.Chars = scratch.Slice(0, count).ToArray(); } - if (thorough && result.Chars is null) + if (thorough) { - if (RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. + if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) + { + result.Chars = null; + result.Range = (lowInclusive, highInclusive); + } + else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars)) { - result.Range = (lowInclusive, highInclusive, negated); - results[i] = result; + result.AsciiSet = asciiChars; } } + + results[i] = result; } return results; @@ -435,18 +439,38 @@ static bool TryFindFixedSets(RegexNode node, List results) => // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search // for the fastest and that have the best chance of matching as few false positives as possible. - results.Sort((s1, s2) => + results.Sort(static (s1, s2) => { + char[]? s1Chars = s1.Chars ?? s1.AsciiSet; + char[]? s2Chars = s2.Chars ?? s2.AsciiSet; + int s1CharsLength = s1Chars?.Length ?? 0; + int s2CharsLength = s2Chars?.Length ?? 0; + bool s1Negated = s1.Negated; + bool s2Negated = s2.Negated; + int s1RangeLength = s1.Range is not null ? GetRangeLength(s1.Range.Value, s1Negated) : 0; + int s2RangeLength = s2.Range is not null ? GetRangeLength(s2.Range.Value, s2Negated) : 0; + + Debug.Assert(!s1Negated || s1Chars is null); + Debug.Assert(!s2Negated || s2Chars is null); + // If both have chars, prioritize the one with the smaller frequency for those chars. - if (s1.Chars is not null && s2.Chars is not null) + if (s1Chars is not null && s2Chars is not null) { - // Then of the ones that are the same length, prefer those with less frequent values. The frequency is - // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True - // frequencies will vary widely based on the actual data being searched, the language of the data, etc. - int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars)); - if (c != 0) + // Prefer sets with less frequent values. The frequency is only an approximation, + // used as a tie-breaker when we'd otherwise effectively be picking randomly. + // True frequencies will vary widely based on the actual data being searched, the language of the data, etc. + float s1Frequency = SumFrequencies(s1Chars); + float s2Frequency = SumFrequencies(s2Chars); + + if (s1Frequency != s2Frequency) { - return c; + return s1Frequency.CompareTo(s2Frequency); + } + + if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars)) + { + // Prefer the set with fewer values. + return s1CharsLength.CompareTo(s2CharsLength); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -455,45 +479,59 @@ static float SumFrequencies(char[] chars) float sum = 0; foreach (char c in chars) { - // Lookup each character in the table. For values > 255, this will end up truncating + // Lookup each character in the table. Values >= 128 are ignored // and thus we'll get skew in the data. It's already a gross approximation, though, // and it is primarily meant for disambiguation of ASCII letters. - sum += s_frequency[(byte)c]; + if (c < 128) + { + sum += s_frequency[c]; + } } return sum; } } + // If one has chars and the other has a range, prefer the shorter set. + if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0)) + { + int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength)); + if (c != 0) + { + return c; + } + + // If lengths are the same, prefer the chars. + return s1CharsLength > 0 ? -1 : 1; + } + // If one has chars and the other doesn't, prioritize the one with chars. - if ((s1.Chars is not null) != (s2.Chars is not null)) + if ((s1CharsLength > 0) != (s2CharsLength > 0)) { - return s1.Chars is not null ? -1 : 1; + return s1CharsLength > 0 ? -1 : 1; } // If one has a range and the other doesn't, prioritize the one with a range. - if ((s1.Range is not null) != (s2.Range is not null)) + if ((s1RangeLength > 0) != (s2RangeLength > 0)) { - return s1.Range is not null ? -1 : 1; + return s1RangeLength > 0 ? -1 : 1; } // If both have ranges, prefer the one that includes fewer characters. - if (s1.Range is not null) + if (s1RangeLength > 0) { - return - GetRangeLength(s1.Range.GetValueOrDefault()).CompareTo( - GetRangeLength(s2.Range.GetValueOrDefault())); - - static int GetRangeLength((char LowInclusive, char HighInclusive, bool Negated) range) - { - int length = range.HighInclusive - range.LowInclusive + 1; - return range.Negated ? - char.MaxValue + 1 - length : - length; - } + return s1RangeLength.CompareTo(s2RangeLength); } // As a tiebreaker, prioritize the earlier one. return s1.Distance.CompareTo(s2.Distance); + + static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool negated) + { + int length = range.HighInclusive - range.LowInclusive + 1; + return negated ? + char.MaxValue + 1 - length : + length; + } }); /// @@ -908,22 +946,6 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le 1.024f /* ' h' */, 3.750f /* ' i' */, 0.286f /* ' j' */, 0.439f /* ' k' */, 2.913f /* ' l' */, 1.459f /* ' m' */, 3.908f /* ' n' */, 3.230f /* ' o' */, 1.444f /* ' p' */, 0.231f /* ' q' */, 4.220f /* ' r' */, 3.924f /* ' s' */, 5.312f /* ' t' */, 2.112f /* ' u' */, 0.737f /* ' v' */, 0.573f /* ' w' */, 0.992f /* ' x' */, 1.067f /* ' y' */, 0.181f /* ' z' */, 0.391f /* ' {' */, 0.056f /* ' |' */, 0.391f /* ' }' */, 0.002f /* ' ~' */, 0.000f /* '\x7F' */, - 0.000f /* '\x80' */, 0.000f /* '\x81' */, 0.000f /* '\x82' */, 0.000f /* '\x83' */, 0.000f /* '\x84' */, 0.000f /* '\x85' */, 0.000f /* '\x86' */, 0.000f /* '\x87' */, - 0.000f /* '\x88' */, 0.000f /* '\x89' */, 0.000f /* '\x8A' */, 0.000f /* '\x8B' */, 0.000f /* '\x8C' */, 0.000f /* '\x8D' */, 0.000f /* '\x8E' */, 0.000f /* '\x8F' */, - 0.000f /* '\x90' */, 0.000f /* '\x91' */, 0.000f /* '\x92' */, 0.000f /* '\x93' */, 0.000f /* '\x94' */, 0.000f /* '\x95' */, 0.000f /* '\x96' */, 0.000f /* '\x97' */, - 0.000f /* '\x98' */, 0.000f /* '\x99' */, 0.000f /* '\x9A' */, 0.000f /* '\x9B' */, 0.000f /* '\x9C' */, 0.000f /* '\x9D' */, 0.000f /* '\x9E' */, 0.000f /* '\x9F' */, - 0.000f /* '\xA0' */, 0.000f /* '\xA1' */, 0.000f /* '\xA2' */, 0.000f /* '\xA3' */, 0.000f /* '\xA4' */, 0.000f /* '\xA5' */, 0.000f /* '\xA6' */, 0.000f /* '\xA7' */, - 0.000f /* '\xA8' */, 0.000f /* '\xA9' */, 0.000f /* '\xAA' */, 0.000f /* '\xAB' */, 0.000f /* '\xAC' */, 0.000f /* '\xAD' */, 0.000f /* '\xAE' */, 0.000f /* '\xAF' */, - 0.000f /* '\xB0' */, 0.000f /* '\xB1' */, 0.000f /* '\xB2' */, 0.000f /* '\xB3' */, 0.000f /* '\xB4' */, 0.000f /* '\xB5' */, 0.000f /* '\xB6' */, 0.000f /* '\xB7' */, - 0.000f /* '\xB8' */, 0.000f /* '\xB9' */, 0.000f /* '\xBA' */, 0.000f /* '\xBB' */, 0.000f /* '\xBC' */, 0.000f /* '\xBD' */, 0.000f /* '\xBE' */, 0.000f /* '\xBF' */, - 0.000f /* '\xC0' */, 0.000f /* '\xC1' */, 0.000f /* '\xC2' */, 0.000f /* '\xC3' */, 0.000f /* '\xC4' */, 0.000f /* '\xC5' */, 0.000f /* '\xC6' */, 0.000f /* '\xC7' */, - 0.000f /* '\xC8' */, 0.000f /* '\xC9' */, 0.000f /* '\xCA' */, 0.000f /* '\xCB' */, 0.000f /* '\xCC' */, 0.000f /* '\xCD' */, 0.000f /* '\xCE' */, 0.000f /* '\xCF' */, - 0.000f /* '\xD0' */, 0.000f /* '\xD1' */, 0.000f /* '\xD2' */, 0.000f /* '\xD3' */, 0.000f /* '\xD4' */, 0.000f /* '\xD5' */, 0.000f /* '\xD6' */, 0.000f /* '\xD7' */, - 0.000f /* '\xD8' */, 0.000f /* '\xD9' */, 0.000f /* '\xDA' */, 0.000f /* '\xDB' */, 0.000f /* '\xDC' */, 0.000f /* '\xDD' */, 0.000f /* '\xDE' */, 0.000f /* '\xDF' */, - 0.000f /* '\xE0' */, 0.000f /* '\xE1' */, 0.000f /* '\xE2' */, 0.000f /* '\xE3' */, 0.000f /* '\xE4' */, 0.000f /* '\xE5' */, 0.000f /* '\xE6' */, 0.000f /* '\xE7' */, - 0.000f /* '\xE8' */, 0.000f /* '\xE9' */, 0.000f /* '\xEA' */, 0.000f /* '\xEB' */, 0.000f /* '\xEC' */, 0.000f /* '\xED' */, 0.000f /* '\xEE' */, 0.000f /* '\xEF' */, - 0.000f /* '\xF0' */, 0.000f /* '\xF1' */, 0.000f /* '\xF2' */, 0.000f /* '\xF3' */, 0.000f /* '\xF4' */, 0.000f /* '\xF5' */, 0.000f /* '\xF6' */, 0.000f /* '\xF7' */, - 0.000f /* '\xF8' */, 0.000f /* '\xF9' */, 0.000f /* '\xFA' */, 0.000f /* '\xFB' */, 0.000f /* '\xFC' */, 0.000f /* '\xFD' */, 0.000f /* '\xFE' */, 0.000f /* '\xFF' */, }; // The above table was generated programmatically with the following. This can be augmented to incorporate additional data sources, @@ -953,7 +975,7 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le // Console.WriteLine("private static readonly float[] s_frequency = new float[]"); // Console.WriteLine("{"); // int i = 0; - // for (int row = 0; row < 32; row++) + // for (int row = 0; row < 16; row++) // { // Console.Write(" "); // for (int col = 0; col < 8; col++) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index b75a249d6b5555..4c2294f0c0669a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -382,6 +382,9 @@ public static IEnumerable Match_MemberData() yield return (@"a[^c]*?[bcdef]", "xyza12345e6789", lineOption, 0, 14, true, "a12345e"); yield return (@"a[^b]*?[bcdef]", "xyza12345f6789", lineOption, 0, 14, true, "a12345f"); yield return (@"a[^c]*?[bcdef]", "xyza12345g6789", lineOption, 0, 14, false, ""); + + yield return ("a[^b]*?[cdefgz]", "xyza123bc4", lineOption, 0, 10, false, ""); + yield return ("a[^b]*?[bdefgz]", "xyza123bc4", lineOption, 0, 10, true, "a123b"); } // Nested loops