diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 92f32ea2eaccfe..d423785ec07843 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -551,7 +551,7 @@ void EmitFixedSet() for (; setIndex < setsToUse; setIndex++) { string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]"; - string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations, ref requiredHelpers); + string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, negate: false, additionalDeclarations, ref requiredHelpers); if (setIndex == start) { @@ -1898,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset if (node.IsSetFamily) { - expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers)}"; + expr = $"{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: true, additionalDeclarations, ref requiredHelpers)}"; } else { @@ -2696,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = string expr = $"{sliceSpan}[{iterationLocal}]"; if (node.IsSetFamily) { - expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers); + expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers); } else { @@ -2750,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) string expr = $"{sliceSpan}[{sliceStaticPos}]"; if (node.IsSetFamily) { - expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers); + expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers); } else { @@ -3104,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression; - private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet additionalDeclarations, ref RequiredHelperFunctions requiredHelpers) + private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, bool negate, HashSet additionalDeclarations, ref RequiredHelperFunctions requiredHelpers) { // We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass), // but that call is relatively expensive. Before we fall back to it, we try to optimize @@ -3118,27 +3118,23 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options { case RegexCharClass.AnyClass: // ideally this could just be "return true;", but we need to evaluate the expression for its side effects - return $"({chExpr} >= 0)"; // a char is unsigned and thus won't ever be negative, so this is equivalent to true + return $"({chExpr} {(negate ? "<" : ">=")} 0)"; // a char is unsigned and thus won't ever be negative case RegexCharClass.DigitClass: - return $"char.IsDigit({chExpr})"; - case RegexCharClass.NotDigitClass: - return $"!char.IsDigit({chExpr})"; + negate ^= charClass == RegexCharClass.NotDigitClass; + return $"{(negate ? "!" : "")}char.IsDigit({chExpr})"; case RegexCharClass.SpaceClass: - return $"char.IsWhiteSpace({chExpr})"; - case RegexCharClass.NotSpaceClass: - return $"!char.IsWhiteSpace({chExpr})"; + negate ^= charClass == RegexCharClass.NotSpaceClass; + return $"{(negate ? "!" : "")}char.IsWhiteSpace({chExpr})"; case RegexCharClass.WordClass: - requiredHelpers |= RequiredHelperFunctions.IsWordChar; - return $"IsWordChar({chExpr})"; - case RegexCharClass.NotWordClass: requiredHelpers |= RequiredHelperFunctions.IsWordChar; - return $"!IsWordChar({chExpr})"; + negate ^= charClass == RegexCharClass.NotWordClass; + return $"{(negate ? "!" : "")}IsWordChar({chExpr})"; } // If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture, @@ -3160,10 +3156,10 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options // Next, handle simple sets of one range, e.g. [A-Z], [0-9], etc. This includes some built-in classes, like ECMADigitClass. if (!invariant && RegexCharClass.TryGetSingleRange(charClass, out char lowInclusive, out char highInclusive)) { - bool invert = RegexCharClass.IsNegated(charClass); + negate ^= RegexCharClass.IsNegated(charClass); return lowInclusive == highInclusive ? - $"({chExpr} {(invert ? "!=" : "==")} {Literal(lowInclusive)})" : - $"(((uint){chExpr}) - {Literal(lowInclusive)} {(invert ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))"; + $"({chExpr} {(negate ? "!=" : "==")} {Literal(lowInclusive)})" : + $"(((uint){chExpr}) - {Literal(lowInclusive)} {(negate ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))"; } // Next if the character class contains nothing but a single Unicode category, we can calle char.GetUnicodeCategory and @@ -3171,7 +3167,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass. if (!invariant && RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated)) { - return $"(char.GetUnicodeCategory({chExpr}) {(negated ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})"; + negate ^= negated; + return $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})"; } // Next, if there's only 2, 3, or 4 chars in the set (fairly common due to the sets we create for prefixes), @@ -3186,23 +3183,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options case 2: if ((setChars[0] | 0x20) == setChars[1]) { - return $"(({chExpr} | 0x20) == {Literal(setChars[1])})"; + return $"(({chExpr} | 0x20) {(negate ? "!=" : "==")} {Literal(setChars[1])})"; } additionalDeclarations.Add("char ch;"); - return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; + return negate ? + $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}))" : + $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; case 3: additionalDeclarations.Add("char ch;"); - return (setChars[0] | 0x20) == setChars[1] ? - $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" : - $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))"; + return (negate, (setChars[0] | 0x20) == setChars[1]) switch + { + (false, false) => $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))", + (true, false) => $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))", + (false, true) => $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))", + (true, true) => $"((((ch = {chExpr}) | 0x20) != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))", + }; case 4: if (((setChars[0] | 0x20) == setChars[1]) && ((setChars[2] | 0x20) == setChars[3])) { additionalDeclarations.Add("char ch;"); - return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))"; + return negate ? + $"(((ch = ({chExpr} | 0x20)) != {Literal(setChars[1])}) & (ch != {Literal(setChars[3])}))" : + $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))"; } break; } @@ -3223,8 +3228,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options // the same as [\u0370-\u03FF\u1F00-1FFF]. (In the future, we could possibly // extend the analysis to produce a known lower-bound and compare against // that rather than always using 128 as the pivot point.) - return invariant ? - $"((ch = {chExpr}) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" : + return negate ? + $"((ch = {chExpr}) < 128 || !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : $"((ch = {chExpr}) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; } @@ -3233,8 +3238,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options // We determined that every ASCII character is in the class, for example // if the class were the negated example from case 1 above: // [^\p{IsGreek}\p{IsGreekExtended}]. - return invariant ? - $"((ch = {chExpr}) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" : + return negate ? + $"((ch = {chExpr}) >= 128 && !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : $"((ch = {chExpr}) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; } } @@ -3277,7 +3282,9 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options // We know that all inputs that could match are ASCII, for example if the // character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we // can just fail the comparison. - return $"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; + return negate ? + $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : + $"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; } if (analysis.AllNonAsciiContained) @@ -3285,15 +3292,21 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options // We know that all non-ASCII inputs match, for example if the character // class were [^\r\n], so since we just determined the ch to be >= 128, we can just // give back success. - return $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; + return negate ? + $"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : + $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; } // We know that the whole class wasn't ASCII, and we don't know anything about the non-ASCII // characters other than that some might be included, for example if the character class // were [\w\d], so since ch >= 128, we need to fall back to calling CharInClass. - return invariant ? - $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" : - $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; + return (negate, invariant) switch + { + (false, false) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))", + (true, false) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))", + (false, true) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))", + (true, true) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))", + }; } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 527e89373ad82b..e00d03064a97a8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1924,25 +1924,16 @@ public static string CharDescription(char ch) => }; [ExcludeFromCodeCoverage] - private static string CategoryDescription(char ch) - { - if (ch == SpaceConst) - { - return "\\s"; - } - - if ((short)ch == NotSpaceConst) - { - return "\\S"; - } - - if ((short)ch < 0) - { - return "\\P{" + CategoryIdToName[(-((short)ch) - 1)] + "}"; - } - - return "\\p{" + CategoryIdToName[(ch - 1)] + "}"; - } + private static string CategoryDescription(char ch) => + (short)ch switch + { + SpaceConst => @"\s", + NotSpaceConst => @"\S", + (short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\d", + -(short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\D", + < 0 => $"\\P{{{CategoryIdToName[-(short)ch - 1]}}}", + _ => $"\\p{{{CategoryIdToName[ch - 1]}}}", + }; /// /// A first/last pair representing a single range of characters.