dotnet · stephentoub · Dec 10, 2021 · Dec 10, 2021
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -227,16 +227,48 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
             writer.WriteLine($"            protected override bool FindFirstChar()");
             writer.WriteLine($"            {{");
             writer.Indent += 4;
-            EmitFindFirstChar(writer, rm, id);
+            RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id);
             writer.Indent -= 4;
             writer.WriteLine($"            }}");
             writer.WriteLine();
             writer.WriteLine($"            protected override void Go()");
             writer.WriteLine($"            {{");
             writer.Indent += 4;
-            EmitGo(writer, rm, id);
+            requiredHelpers |= EmitGo(writer, rm, id);
             writer.Indent -= 4;
             writer.WriteLine($"            }}");
+
+            if ((requiredHelpers & RequiredHelperFunctions.IsWordChar) != 0)
+            {
+                writer.WriteLine();
+                writer.WriteLine($"            /// <summary>Determines whether the character is part of the [\\w] set.</summary>");
+                writer.WriteLine($"            [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]");
+                writer.WriteLine($"            private static bool IsWordChar(char ch)");
+                writer.WriteLine($"            {{");
+                writer.WriteLine($"                global::System.ReadOnlySpan<byte> ascii = new byte[]");
+                writer.WriteLine($"                {{");
+                writer.WriteLine($"                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,");
+                writer.WriteLine($"                    0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07");
+                writer.WriteLine($"                }};");
+                writer.WriteLine();
+                writer.WriteLine($"                int chDiv8 = ch >> 3;");
+                writer.WriteLine($"                return (uint)chDiv8 < (uint)ascii.Length ?");
+                writer.WriteLine($"                    (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :");
+                writer.WriteLine($"                    global::System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch) switch");
+                writer.WriteLine($"                    {{");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.UppercaseLetter or");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.LowercaseLetter or");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.TitlecaseLetter or");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.ModifierLetter or");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.OtherLetter or");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.NonSpacingMark or");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.DecimalDigitNumber or");
+                writer.WriteLine($"                        global::System.Globalization.UnicodeCategory.ConnectorPunctuation => true,");
+                writer.WriteLine($"                        _ => false,");
+                writer.WriteLine($"                    }};");
+                writer.WriteLine($"            }}");
+            }
+
             writer.WriteLine($"        }}");
             writer.WriteLine($"    }}");
             writer.WriteLine("}");
@@ -266,11 +298,12 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht)
         }
 
         /// <summary>Emits the body of the FindFirstChar override.</summary>
-        private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
+        private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
         {
             RegexOptions options = (RegexOptions)rm.Options;
             RegexCode code = rm.Code;
             bool hasTextInfo = false;
+            RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None;
 
             // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
             // To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
@@ -344,7 +377,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
 
             // We're done.  Patch up any additional declarations.
             ReplaceAdditionalDeclarations(writer, additionalDeclarations, additionalDeclarationsPosition, additionalDeclarationsIndent);
-            return;
+            return requiredHelpers;
 
             // Emits any anchors.  Returns true if the anchor roots any match to a specific location and thus no further
             // searching is required; otherwise, false.
@@ -518,7 +551,7 @@ void EmitFixedSet()
                         for (; setIndex < setsToUse; setIndex++)
                         {
                             string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]";
-                            string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations);
+                            string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations, ref requiredHelpers);
 
                             if (setIndex == start)
                             {
@@ -571,7 +604,7 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
         }
 
         /// <summary>Emits the body of the Go override.</summary>
-        private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
+        private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
         {
             // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled
             // version of what RegexInterpreter would process.  The RegexNode tree would be turned into a series of opcodes via
@@ -599,6 +632,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
 
             RegexOptions options = (RegexOptions)rm.Options;
             RegexCode code = rm.Code;
+            RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None;
 
             // Helper to define names.  Names start unadorned, but as soon as there's repetition,
             // they begin to have a numbered suffix.
@@ -622,14 +656,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
                     writer.WriteLine($"int end = start + {(node.Type == RegexNode.Multi ? node.Str!.Length : 1)};");
                     writer.WriteLine("base.Capture(0, start, end);");
                     writer.WriteLine("base.runtextpos = end;");
-                    return;
+                    return requiredHelpers;
 
                 case RegexNode.Empty:
                     // This case isn't common in production, but it's very common when first getting started with the
                     // source generator and seeing what happens as you add more to expressions.  When approaching
                     // it from a learning perspective, this is very common, as it's the empty string you start with.
                     writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);");
-                    return;
+                    return requiredHelpers;
             }
 
             // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
@@ -717,7 +751,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
                 }
             }
 
-            return;
+            return requiredHelpers;
 
             // Helper to create a name guaranteed to be unique within the function.
             string ReserveName(string prefix)
@@ -1864,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset
 
                 if (node.IsSetFamily)
                 {
-                    expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations)}";
+                    expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers)}";
                 }
                 else
                 {
@@ -2662,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
                     string expr = $"{sliceSpan}[{iterationLocal}]";
                     if (node.IsSetFamily)
                     {
-                        expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations);
+                        expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
                     }
                     else
                     {
@@ -2716,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node)
                 string expr = $"{sliceSpan}[{sliceStaticPos}]";
                 if (node.IsSetFamily)
                 {
-                    expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations);
+                    expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
                 }
                 else
                 {
@@ -3070,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri
 
         private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression;
 
-        private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string>? additionalDeclarations)
+        private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string> additionalDeclarations, ref RequiredHelperFunctions requiredHelpers)
         {
             // We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
             // but that call is relatively expensive.  Before we fall back to it, we try to optimize
@@ -3097,6 +3131,14 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
 
                 case RegexCharClass.NotSpaceClass:
                     return $"!char.IsWhiteSpace({chExpr})";
+
+                case RegexCharClass.WordClass:
+                    requiredHelpers |= RequiredHelperFunctions.IsWordChar;
+                    return $"IsWordChar({chExpr})";
+
+                case RegexCharClass.NotWordClass:
+                    requiredHelpers |= RequiredHelperFunctions.IsWordChar;
+                    return $"!IsWordChar({chExpr})";
             }
 
             // If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
@@ -3146,11 +3188,11 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
                         {
                             return $"(({chExpr} | 0x20) == {Literal(setChars[1])})";
                         }
-                        additionalDeclarations?.Add("char ch;");
+                        additionalDeclarations.Add("char ch;");
                         return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";
 
                     case 3:
-                        additionalDeclarations?.Add("char ch;");
+                        additionalDeclarations.Add("char ch;");
                         return (setChars[0] | 0x20) == setChars[1] ?
                             $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" :
                             $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))";
@@ -3159,15 +3201,15 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
                         if (((setChars[0] | 0x20) == setChars[1]) &&
                             ((setChars[2] | 0x20) == setChars[3]))
                         {
-                            additionalDeclarations?.Add("char ch;");
+                            additionalDeclarations.Add("char ch;");
                             return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
                         }
                         break;
                 }
             }
 
             // All options after this point require a ch local.
-            additionalDeclarations?.Add("char ch;");
+            additionalDeclarations.Add("char ch;");
 
             // Analyze the character set more to determine what code to generate.
             RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass);
@@ -3471,5 +3513,15 @@ public void Dispose()
                 }
             }
         }
+
+        /// <summary>Bit flags indicating which additional helpers should be emitted into the regex class; values are combined with |, so each helper must be a distinct power of two.</summary>
+        [Flags]
+        private enum RequiredHelperFunctions
+        {
+            /// <summary>No additional functions are required.</summary>
+            None = 0,
+            /// <summary>The IsWordChar helper is required.</summary>
+            IsWordChar = 1 << 0
+        }
     }
 }
diff --git a/...aries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/...aries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -41,9 +41,6 @@ internal sealed partial class RegexCharClass
         private const short SpaceConst = 100;
         private const short NotSpaceConst = -100;
 
-        private const char ZeroWidthJoiner = '\u200D';
-        private const char ZeroWidthNonJoiner = '\u200C';
-
         private const string InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__";
         private const string Space = "\x64";
         private const string NotSpace = "\uFF9C";
@@ -975,25 +972,59 @@ public static bool IsECMAWordChar(char ch) =>
             ch == '_' || // underscore
             ch == '\u0130'; // latin capital letter I with dot above
 
+        /// <summary>16-byte bitmap for the chars 0 through 127: bit (ch % 8) of byte (ch / 8) is 1 when that char is a word char, i.e. one of [0-9A-Za-z_].</summary>
+        private static ReadOnlySpan<byte> WordCharAsciiLookup => new byte[]
+        {
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+            0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+        };
+
+        /// <summary>Determines whether <paramref name="ch"/> is considered a word character for the purposes of testing the \w set.</summary>
         public static bool IsWordChar(char ch)
+        {
+            // This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also returns true
+            // for U+200C (ZERO WIDTH NON-JOINER) and U+200D (ZERO WIDTH JOINER).
+
+            // Fast path for ASCII: bit (ch & 7) of byte (ch >> 3) in the 16-byte table says whether ch is a word char.
+            // This is purely an optimization; it has the same behavior as the switch below, which produced the table.
+            ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
+            int chDiv8 = ch >> 3;
+            if ((uint)chDiv8 < (uint)asciiLookup.Length)
+            {
+                return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
+            }
+
+            // For non-ASCII, fall back to the Unicode category: letters, nonspacing marks, decimal digits, and connector punctuation match.
+            switch (CharUnicodeInfo.GetUnicodeCategory(ch))
+            {
+                case UnicodeCategory.UppercaseLetter:
+                case UnicodeCategory.LowercaseLetter:
+                case UnicodeCategory.TitlecaseLetter:
+                case UnicodeCategory.ModifierLetter:
+                case UnicodeCategory.OtherLetter:
+                case UnicodeCategory.NonSpacingMark:
+                case UnicodeCategory.DecimalDigitNumber:
+                case UnicodeCategory.ConnectorPunctuation:
+                    return true;
+
+                default:
+                    return false;
+            }
+        }
+
+        /// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
+        public static bool IsBoundaryWordChar(char ch)
         {
             // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
             // RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
             // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
             // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
 
-            // 16 bytes, representing the chars 0 through 127, with a 1 for a bit where that char is a word char
-            static ReadOnlySpan<byte> AsciiLookup() => new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-
             // Fast lookup in our lookup table for ASCII characters.  This is purely an optimization, and has the
             // behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
-            ReadOnlySpan<byte> asciiLookup = AsciiLookup();
+            ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
             int chDiv8 = ch >> 3;
-            if ((uint)chDiv8 < asciiLookup.Length)
+            if ((uint)chDiv8 < (uint)asciiLookup.Length)
             {
                 return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
             }
@@ -1012,7 +1043,8 @@ public static bool IsWordChar(char ch)
                     return true;
 
                 default:
-                    return ch == ZeroWidthJoiner || ch == ZeroWidthNonJoiner;
+                    const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
+                    return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;
             }
         }
 

diff --git a/...raries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/...raries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -30,6 +30,7 @@ internal abstract class RegexCompiler
         private static readonly MethodInfo s_matchLengthMethod = RegexRunnerMethod("MatchLength");
         private static readonly MethodInfo s_matchIndexMethod = RegexRunnerMethod("MatchIndex");
         private static readonly MethodInfo s_isBoundaryMethod = RegexRunnerMethod("IsBoundary");
+        private static readonly MethodInfo s_isWordCharMethod = RegexRunnerMethod("IsWordChar");
         private static readonly MethodInfo s_isECMABoundaryMethod = RegexRunnerMethod("IsECMABoundary");
         private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos");
         private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass");
@@ -3529,6 +3530,18 @@ private void EmitMatchCharacterClass(string charClass, bool caseInsensitive)
                     Ldc(0);
                     Ceq();
                     return;
+
+                case RegexCharClass.WordClass:
+                    // RegexRunner.IsWordChar(ch)
+                    Call(s_isWordCharMethod);
+                    return;
+
+                case RegexCharClass.NotWordClass:
+                    // !RegexRunner.IsWordChar(ch)
+                    Call(s_isWordCharMethod);
+                    Ldc(0);
+                    Ceq();
+                    return;
             }
 
             // If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,