Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -227,16 +227,48 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
writer.WriteLine($" protected override bool FindFirstChar()");
writer.WriteLine($" {{");
writer.Indent += 4;
EmitFindFirstChar(writer, rm, id);
RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id);
writer.Indent -= 4;
writer.WriteLine($" }}");
writer.WriteLine();
writer.WriteLine($" protected override void Go()");
writer.WriteLine($" {{");
writer.Indent += 4;
EmitGo(writer, rm, id);
requiredHelpers |= EmitGo(writer, rm, id);
writer.Indent -= 4;
writer.WriteLine($" }}");

if ((requiredHelpers & RequiredHelperFunctions.IsWordChar) != 0)
{
writer.WriteLine();
writer.WriteLine($" /// <summary>Determines whether the character is part of the [\\w] set.</summary>");
writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]");
writer.WriteLine($" private static bool IsWordChar(char ch)");
writer.WriteLine($" {{");
writer.WriteLine($" global::System.ReadOnlySpan<byte> ascii = new byte[]");
writer.WriteLine($" {{");
writer.WriteLine($" 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,");
writer.WriteLine($" 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07");
writer.WriteLine($" }};");
writer.WriteLine();
writer.WriteLine($" int chDiv8 = ch >> 3;");
writer.WriteLine($" return (uint)chDiv8 < (uint)ascii.Length ?");
writer.WriteLine($" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :");
writer.WriteLine($" global::System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch) switch");
writer.WriteLine($" {{");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.UppercaseLetter or");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.LowercaseLetter or");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.TitlecaseLetter or");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.ModifierLetter or");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.OtherLetter or");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.NonSpacingMark or");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.DecimalDigitNumber or");
writer.WriteLine($" global::System.Globalization.UnicodeCategory.ConnectorPunctuation => true,");
writer.WriteLine($" _ => false,");
writer.WriteLine($" }};");
writer.WriteLine($" }}");
}

writer.WriteLine($" }}");
writer.WriteLine($" }}");
writer.WriteLine("}");
Expand Down Expand Up @@ -266,11 +298,12 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht)
}

/// <summary>Emits the body of the FindFirstChar override.</summary>
private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
{
RegexOptions options = (RegexOptions)rm.Options;
RegexCode code = rm.Code;
bool hasTextInfo = false;
RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None;

// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
Expand Down Expand Up @@ -344,7 +377,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,

// We're done. Patch up any additional declarations.
ReplaceAdditionalDeclarations(writer, additionalDeclarations, additionalDeclarationsPosition, additionalDeclarationsIndent);
return;
return requiredHelpers;

// Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further
// searching is required; otherwise, false.
Expand Down Expand Up @@ -518,7 +551,7 @@ void EmitFixedSet()
for (; setIndex < setsToUse; setIndex++)
{
string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]";
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations);
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations, ref requiredHelpers);

if (setIndex == start)
{
Expand Down Expand Up @@ -571,7 +604,7 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
}

/// <summary>Emits the body of the Go override.</summary>
private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
{
// In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled
// version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via
Expand Down Expand Up @@ -599,6 +632,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)

RegexOptions options = (RegexOptions)rm.Options;
RegexCode code = rm.Code;
RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None;

// Helper to define names. Names start unadorned, but as soon as there's repetition,
// they begin to have a numbered suffix.
Expand All @@ -622,14 +656,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
writer.WriteLine($"int end = start + {(node.Type == RegexNode.Multi ? node.Str!.Length : 1)};");
writer.WriteLine("base.Capture(0, start, end);");
writer.WriteLine("base.runtextpos = end;");
return;
return requiredHelpers;

case RegexNode.Empty:
// This case isn't common in production, but it's very common when first getting started with the
// source generator and seeing what happens as you add more to expressions. When approaching
// it from a learning perspective, this is very common, as it's the empty string you start with.
writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);");
return;
return requiredHelpers;
}

// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
Expand Down Expand Up @@ -717,7 +751,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
}
}

return;
return requiredHelpers;

// Helper to create a name guaranteed to be unique within the function.
string ReserveName(string prefix)
Expand Down Expand Up @@ -1864,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset

if (node.IsSetFamily)
{
expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations)}";
expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers)}";
}
else
{
Expand Down Expand Up @@ -2662,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
string expr = $"{sliceSpan}[{iterationLocal}]";
if (node.IsSetFamily)
{
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations);
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
}
else
{
Expand Down Expand Up @@ -2716,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node)
string expr = $"{sliceSpan}[{sliceStaticPos}]";
if (node.IsSetFamily)
{
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations);
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
}
else
{
Expand Down Expand Up @@ -3070,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri

private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression;

private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string>? additionalDeclarations)
private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string> additionalDeclarations, ref RequiredHelperFunctions requiredHelpers)
{
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
// but that call is relatively expensive. Before we fall back to it, we try to optimize
Expand All @@ -3097,6 +3131,14 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options

case RegexCharClass.NotSpaceClass:
return $"!char.IsWhiteSpace({chExpr})";

case RegexCharClass.WordClass:
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
return $"IsWordChar({chExpr})";

case RegexCharClass.NotWordClass:
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
return $"!IsWordChar({chExpr})";
}

// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
Expand Down Expand Up @@ -3146,11 +3188,11 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
{
return $"(({chExpr} | 0x20) == {Literal(setChars[1])})";
}
additionalDeclarations?.Add("char ch;");
additionalDeclarations.Add("char ch;");
return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";

case 3:
additionalDeclarations?.Add("char ch;");
additionalDeclarations.Add("char ch;");
return (setChars[0] | 0x20) == setChars[1] ?
$"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" :
$"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))";
Expand All @@ -3159,15 +3201,15 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
if (((setChars[0] | 0x20) == setChars[1]) &&
((setChars[2] | 0x20) == setChars[3]))
{
additionalDeclarations?.Add("char ch;");
additionalDeclarations.Add("char ch;");
return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
}
break;
}
}

// All options after this point require a ch local.
additionalDeclarations?.Add("char ch;");
additionalDeclarations.Add("char ch;");

// Analyze the character set more to determine what code to generate.
RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass);
Expand Down Expand Up @@ -3471,5 +3513,15 @@ public void Dispose()
}
}
}

/// <summary>Bit flags indicating which additional helpers should be emitted into the regex class.</summary>
/// <remarks>
/// Values are combined with <c>|=</c> and tested with <c>&amp;</c>, so every member other than
/// <see cref="None"/> must occupy its own distinct bit. Values are spelled out explicitly so that
/// adding a member cannot accidentally pick up an implicit, non-power-of-two value.
/// </remarks>
[Flags]
private enum RequiredHelperFunctions
{
    /// <summary>No additional functions are required.</summary>
    None = 0,

    /// <summary>The IsWordChar helper is required.</summary>
    IsWordChar = 1 << 0
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,6 @@ internal sealed partial class RegexCharClass
private const short SpaceConst = 100;
private const short NotSpaceConst = -100;

private const char ZeroWidthJoiner = '\u200D';
private const char ZeroWidthNonJoiner = '\u200C';

private const string InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__";
private const string Space = "\x64";
private const string NotSpace = "\uFF9C";
Expand Down Expand Up @@ -975,25 +972,59 @@ public static bool IsECMAWordChar(char ch) =>
ch == '_' || // underscore
ch == '\u0130'; // latin capital letter I with dot above

/// <summary>16 bytes, representing the chars 0 through 127, with a 1 for a bit where that char is a word char.</summary>
/// <remarks>
/// Consumers index the table with <c>ch &gt;&gt; 3</c> and test bit <c>ch &amp; 0x7</c>, so bit <c>i</c> of byte <c>b</c>
/// describes the char with value <c>b * 8 + i</c>. The set bits cover '0'-'9', 'A'-'Z', '_' and 'a'-'z'.
/// </remarks>
// NOTE(review): the expression-bodied `new byte[] { ... }` returning ReadOnlySpan<byte> is presumably relied on
// so the compiler can emit the data statically rather than allocating per call - confirm before restructuring.
private static ReadOnlySpan<byte> WordCharAsciiLookup => new byte[]
{
// chars 0x00-0x2F: none; 0x30-0x39: '0'-'9'
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
// 0x41-0x5A: 'A'-'Z', 0x5F: '_'; 0x61-0x7A: 'a'-'z'
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
};

/// <summary>Determines whether a character is considered a word character for the purposes of testing the \w set.</summary>
/// <param name="ch">The character to classify.</param>
/// <returns><see langword="true"/> if <paramref name="ch"/> is a word character; otherwise, <see langword="false"/>.</returns>
/// <remarks>
/// This performs the same test as <see cref="IsBoundaryWordChar"/>, except that
/// <see cref="IsBoundaryWordChar"/> also returns true for \u200c and \u200d.
/// </remarks>
public static bool IsWordChar(char ch)
{
    // ASCII fast path: one bit per char in a 16-byte table, selected by (ch >> 3) for the byte
    // and (ch & 7) for the bit within it. The unsigned comparison doubles as the "is ASCII" test.
    // This is purely an optimization; the answer matches what the category check below would give.
    ReadOnlySpan<byte> table = WordCharAsciiLookup;
    int byteIndex = ch >> 3;
    if ((uint)byteIndex < (uint)table.Length)
    {
        int bitMask = 1 << (ch & 0x7);
        return (table[byteIndex] & bitMask) != 0;
    }

    // Everything outside the table is classified by its Unicode general category.
    return CharUnicodeInfo.GetUnicodeCategory(ch) switch
    {
        UnicodeCategory.UppercaseLetter or
        UnicodeCategory.LowercaseLetter or
        UnicodeCategory.TitlecaseLetter or
        UnicodeCategory.ModifierLetter or
        UnicodeCategory.OtherLetter or
        UnicodeCategory.NonSpacingMark or
        UnicodeCategory.DecimalDigitNumber or
        UnicodeCategory.ConnectorPunctuation => true,
        _ => false,
    };
}

/// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
public static bool IsBoundaryWordChar(char ch)
{
// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.

// 16 bytes, representing the chars 0 through 127, with a 1 for a bit where that char is a word char
static ReadOnlySpan<byte> AsciiLookup() => new byte[]
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
};

// Fast lookup in our lookup table for ASCII characters. This is purely an optimization, and has the
// behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
ReadOnlySpan<byte> asciiLookup = AsciiLookup();
ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
int chDiv8 = ch >> 3;
if ((uint)chDiv8 < asciiLookup.Length)
if ((uint)chDiv8 < (uint)asciiLookup.Length)
{
return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
}
Expand All @@ -1012,7 +1043,8 @@ public static bool IsWordChar(char ch)
return true;

default:
return ch == ZeroWidthJoiner || ch == ZeroWidthNonJoiner;
const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_matchLengthMethod = RegexRunnerMethod("MatchLength");
private static readonly MethodInfo s_matchIndexMethod = RegexRunnerMethod("MatchIndex");
private static readonly MethodInfo s_isBoundaryMethod = RegexRunnerMethod("IsBoundary");
private static readonly MethodInfo s_isWordCharMethod = RegexRunnerMethod("IsWordChar");
private static readonly MethodInfo s_isECMABoundaryMethod = RegexRunnerMethod("IsECMABoundary");
private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos");
private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass");
Expand Down Expand Up @@ -3529,6 +3530,18 @@ private void EmitMatchCharacterClass(string charClass, bool caseInsensitive)
Ldc(0);
Ceq();
return;

case RegexCharClass.WordClass:
// RegexRunner.IsWordChar(ch)
Call(s_isWordCharMethod);
return;

case RegexCharClass.NotWordClass:
// !RegexRunner.IsWordChar(ch)
Call(s_isWordCharMethod);
Ldc(0);
Ceq();
return;
}

// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
Expand Down
Loading