Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ void EmitFixedSet()
for (; setIndex < setsToUse; setIndex++)
{
string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]";
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations, ref requiredHelpers);
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, negate: false, additionalDeclarations, ref requiredHelpers);

if (setIndex == start)
{
Expand Down Expand Up @@ -1898,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset

if (node.IsSetFamily)
{
expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers)}";
expr = $"{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: true, additionalDeclarations, ref requiredHelpers)}";
}
else
{
Expand Down Expand Up @@ -2696,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
string expr = $"{sliceSpan}[{iterationLocal}]";
if (node.IsSetFamily)
{
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers);
}
else
{
Expand Down Expand Up @@ -2750,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node)
string expr = $"{sliceSpan}[{sliceStaticPos}]";
if (node.IsSetFamily)
{
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers);
}
else
{
Expand Down Expand Up @@ -3104,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri

private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression;

private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string> additionalDeclarations, ref RequiredHelperFunctions requiredHelpers)
private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, bool negate, HashSet<string> additionalDeclarations, ref RequiredHelperFunctions requiredHelpers)
{
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
// but that call is relatively expensive. Before we fall back to it, we try to optimize
Expand All @@ -3118,27 +3118,23 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
{
case RegexCharClass.AnyClass:
// ideally this could just be "return true;", but we need to evaluate the expression for its side effects
return $"({chExpr} >= 0)"; // a char is unsigned and thus won't ever be negative, so this is equivalent to true
return $"({chExpr} {(negate ? "<" : ">=")} 0)"; // a char is unsigned and thus won't ever be negative

case RegexCharClass.DigitClass:
return $"char.IsDigit({chExpr})";

case RegexCharClass.NotDigitClass:
return $"!char.IsDigit({chExpr})";
negate ^= charClass == RegexCharClass.NotDigitClass;
return $"{(negate ? "!" : "")}char.IsDigit({chExpr})";

case RegexCharClass.SpaceClass:
return $"char.IsWhiteSpace({chExpr})";

case RegexCharClass.NotSpaceClass:
return $"!char.IsWhiteSpace({chExpr})";
negate ^= charClass == RegexCharClass.NotSpaceClass;
return $"{(negate ? "!" : "")}char.IsWhiteSpace({chExpr})";

case RegexCharClass.WordClass:
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
return $"IsWordChar({chExpr})";

case RegexCharClass.NotWordClass:
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
return $"!IsWordChar({chExpr})";
negate ^= charClass == RegexCharClass.NotWordClass;
return $"{(negate ? "!" : "")}IsWordChar({chExpr})";
}

// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
Expand All @@ -3160,18 +3156,19 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
// Next, handle simple sets of one range, e.g. [A-Z], [0-9], etc. This includes some built-in classes, like ECMADigitClass.
if (!invariant && RegexCharClass.TryGetSingleRange(charClass, out char lowInclusive, out char highInclusive))
{
bool invert = RegexCharClass.IsNegated(charClass);
negate ^= RegexCharClass.IsNegated(charClass);
return lowInclusive == highInclusive ?
$"({chExpr} {(invert ? "!=" : "==")} {Literal(lowInclusive)})" :
$"(((uint){chExpr}) - {Literal(lowInclusive)} {(invert ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))";
$"({chExpr} {(negate ? "!=" : "==")} {Literal(lowInclusive)})" :
$"(((uint){chExpr}) - {Literal(lowInclusive)} {(negate ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))";
}

// Next, if the character class contains nothing but a single Unicode category, we can call char.GetUnicodeCategory and
// compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus
// we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass.
if (!invariant && RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated))
{
return $"(char.GetUnicodeCategory({chExpr}) {(negated ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})";
negate ^= negated;
return $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})";
}

// Next, if there's only 2, 3, or 4 chars in the set (fairly common due to the sets we create for prefixes),
Expand All @@ -3186,23 +3183,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
case 2:
if ((setChars[0] | 0x20) == setChars[1])
{
return $"(({chExpr} | 0x20) == {Literal(setChars[1])})";
return $"(({chExpr} | 0x20) {(negate ? "!=" : "==")} {Literal(setChars[1])})";
}
additionalDeclarations.Add("char ch;");
return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";
return negate ?
$"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}))" :
$"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";

case 3:
additionalDeclarations.Add("char ch;");
return (setChars[0] | 0x20) == setChars[1] ?
$"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" :
$"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))";
return (negate, (setChars[0] | 0x20) == setChars[1]) switch
{
(false, false) => $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))",
(true, false) => $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))",
(false, true) => $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))",
(true, true) => $"((((ch = {chExpr}) | 0x20) != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))",
};

case 4:
if (((setChars[0] | 0x20) == setChars[1]) &&
((setChars[2] | 0x20) == setChars[3]))
{
additionalDeclarations.Add("char ch;");
return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
return negate ?
$"(((ch = ({chExpr} | 0x20)) != {Literal(setChars[1])}) & (ch != {Literal(setChars[3])}))" :
$"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
}
break;
}
Expand All @@ -3223,8 +3228,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
// the same as [\u0370-\u03FF\u1F00-\u1FFF]. (In the future, we could possibly
// extend the analysis to produce a known lower-bound and compare against
// that rather than always using 128 as the pivot point.)
return invariant ?
$"((ch = {chExpr}) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" :
return negate ?
$"((ch = {chExpr}) < 128 || !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" :
$"((ch = {chExpr}) >= 128 && global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
}

Expand All @@ -3233,8 +3238,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
// We determined that every ASCII character is in the class, for example
// if the class were the negated example from case 1 above:
// [^\p{IsGreek}\p{IsGreekExtended}].
return invariant ?
$"((ch = {chExpr}) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" :
return negate ?
$"((ch = {chExpr}) >= 128 && !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" :
$"((ch = {chExpr}) < 128 || global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
}
}
Expand Down Expand Up @@ -3277,23 +3282,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
// We know that all inputs that could match are ASCII, for example if the
// character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we
// can just fail the comparison.
return $"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
return negate ?
$"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
}

if (analysis.AllNonAsciiContained)
{
// We know that all non-ASCII inputs match, for example if the character
// class were [^\r\n], so since we just determined the ch to be >= 128, we can just
// give back success.
return $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
return negate ?
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
$"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
}

// We know that the whole class wasn't ASCII, and we don't know anything about the non-ASCII
// characters other than that some might be included, for example if the character class
// were [\w\d], so since ch >= 128, we need to fall back to calling CharInClass.
return invariant ?
$"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))" :
$"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
return (negate, invariant) switch
{
(false, false) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))",
(true, false) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass((char)ch, {Literal(charClass)}))",
(false, true) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))",
(true, true) => $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !global::System.Text.RegularExpressions.RegexRunner.CharInClass(char.ToLowerInvariant((char)ch), {Literal(charClass)}))",
};
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1924,25 +1924,16 @@ public static string CharDescription(char ch) =>
};

[ExcludeFromCodeCoverage]
private static string CategoryDescription(char ch)
{
if (ch == SpaceConst)
{
return "\\s";
}

if ((short)ch == NotSpaceConst)
{
return "\\S";
}

if ((short)ch < 0)
{
return "\\P{" + CategoryIdToName[(-((short)ch) - 1)] + "}";
}

return "\\p{" + CategoryIdToName[(ch - 1)] + "}";
}
// Returns the regex escape text that describes the category code carried in `ch`.
// The code is reinterpreted as a signed 16-bit value so that negative codes can be matched;
// arms are tested top to bottom, so the specific shorthands win over the generic \p / \P forms.
private static string CategoryDescription(char ch) =>
(short)ch switch
{
// Dedicated constants for the whitespace shorthand and its negation.
SpaceConst => @"\s",
NotSpaceConst => @"\S",
// The decimal-digit category (UnicodeCategory value + 1, and its negative) is shown as \d / \D.
(short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\d",
-(short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\D",
// Any remaining negative code is rendered as \P{name}; (-code - 1) is the index into CategoryIdToName.
< 0 => $"\\P{{{CategoryIdToName[-(short)ch - 1]}}}",
// Everything else is rendered as \p{name}; (code - 1) is the index into CategoryIdToName.
_ => $"\\p{{{CategoryIdToName[ch - 1]}}}",
};

/// <summary>
/// A first/last pair representing a single range of characters.
Expand Down