From bd4b981662062789232a517e69c5c6b8a94430fa Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Mon, 21 Nov 2022 22:31:35 +0100 Subject: [PATCH 1/2] Use IndexOfAnyInRange in StripBidiControlCharacters --- .../src/System/UriHelper.cs | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/libraries/System.Private.Uri/src/System/UriHelper.cs b/src/libraries/System.Private.Uri/src/System/UriHelper.cs index 4a07e754a4d070..2bc2da671d3ce7 100644 --- a/src/libraries/System.Private.Uri/src/System/UriHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/UriHelper.cs @@ -570,51 +570,58 @@ internal static bool IsLWS(char ch) return (ch <= ' ') && (ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t'); } - // + private const char BidiControlCharacterMin = '\u200E'; + private const char BidiControlCharacterMax = '\u202E'; + // Is this a Bidirectional control char.. These get stripped - // internal static bool IsBidiControlCharacter(char ch) { - return (ch == '\u200E' /*LRM*/ || ch == '\u200F' /*RLM*/ || ch == '\u202A' /*LRE*/ || - ch == '\u202B' /*RLE*/ || ch == '\u202C' /*PDF*/ || ch == '\u202D' /*LRO*/ || - ch == '\u202E' /*RLO*/); + return char.IsBetween(ch, BidiControlCharacterMin, BidiControlCharacterMax) + && IsBidiControlCharacterSlow(ch); + + static bool IsBidiControlCharacterSlow(char ch) + { + return (ch == '\u200E' /*LRM*/ || ch == '\u200F' /*RLM*/ || ch == '\u202A' /*LRE*/ || + ch == '\u202B' /*RLE*/ || ch == '\u202C' /*PDF*/ || ch == '\u202D' /*LRO*/ || + ch == '\u202E' /*RLO*/); + } } - // // Strip Bidirectional control characters from this string - // internal static unsafe string StripBidiControlCharacters(ReadOnlySpan strToClean, string? backingString = null) { Debug.Assert(backingString is null || strToClean.Length == backingString.Length); int charsToRemove = 0; - foreach (char c in strToClean) + + int indexOfPossibleCharToRemove = strToClean.IndexOfAnyInRange(BidiControlCharacterMin, BidiControlCharacterMax); + if (indexOfPossibleCharToRemove >= 0) { - if ((uint)(c - '\u200E') <= ('\u202E' - '\u200E') && IsBidiControlCharacter(c)) + // Slow path: Contains chars that fall in the [u200E, u202E] range (so likely Bidi) + foreach (char c in strToClean.Slice(indexOfPossibleCharToRemove)) { - charsToRemove++; + if (IsBidiControlCharacter(c)) + { + charsToRemove++; + } } } if (charsToRemove == 0) { + // Hot path return backingString ?? new string(strToClean); } - if (charsToRemove == strToClean.Length) - { - return string.Empty; - } - fixed (char* pStrToClean = &MemoryMarshal.GetReference(strToClean)) { - return string.Create(strToClean.Length - charsToRemove, (StrToClean: (IntPtr)pStrToClean, strToClean.Length), (buffer, state) => + return string.Create(strToClean.Length - charsToRemove, (StrToClean: (IntPtr)pStrToClean, strToClean.Length), static (buffer, state) => { var strToClean = new ReadOnlySpan((char*)state.StrToClean, state.Length); int destIndex = 0; foreach (char c in strToClean) { - if ((uint)(c - '\u200E') > ('\u202E' - '\u200E') || !IsBidiControlCharacter(c)) + if (!IsBidiControlCharacter(c)) { buffer[destIndex++] = c; } From 9b355b01ba56dad3ff0ce970ada8f13efe21db2a Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Mon, 21 Nov 2022 22:52:59 +0100 Subject: [PATCH 2/2] Simplify IsBidiControlCharacter --- .../src/System/UriHelper.cs | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/src/libraries/System.Private.Uri/src/System/UriHelper.cs b/src/libraries/System.Private.Uri/src/System/UriHelper.cs index 2bc2da671d3ce7..5d9665a67b7c09 100644 --- a/src/libraries/System.Private.Uri/src/System/UriHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/UriHelper.cs @@ -570,22 +570,9 @@ internal static bool IsLWS(char ch) return (ch <= ' ') && (ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t'); } - private const char BidiControlCharacterMin = '\u200E'; - private const char BidiControlCharacterMax = '\u202E'; - // Is this a Bidirectional control char.. These get stripped - internal static bool IsBidiControlCharacter(char ch) - { - return char.IsBetween(ch, BidiControlCharacterMin, BidiControlCharacterMax) - && IsBidiControlCharacterSlow(ch); - - static bool IsBidiControlCharacterSlow(char ch) - { - return (ch == '\u200E' /*LRM*/ || ch == '\u200F' /*RLM*/ || ch == '\u202A' /*LRE*/ || - ch == '\u202B' /*RLE*/ || ch == '\u202C' /*PDF*/ || ch == '\u202D' /*LRO*/ || - ch == '\u202E' /*RLO*/); - } - } + internal static bool IsBidiControlCharacter(char ch) => + char.IsBetween(ch, '\u200E', '\u202E') && !char.IsBetween(ch, '\u2010', '\u2029'); // Strip Bidirectional control characters from this string internal static unsafe string StripBidiControlCharacters(ReadOnlySpan strToClean, string? backingString = null) @@ -594,7 +581,7 @@ internal static unsafe string StripBidiControlCharacters(ReadOnlySpan strT int charsToRemove = 0; - int indexOfPossibleCharToRemove = strToClean.IndexOfAnyInRange(BidiControlCharacterMin, BidiControlCharacterMax); + int indexOfPossibleCharToRemove = strToClean.IndexOfAnyInRange('\u200E', '\u202E'); if (indexOfPossibleCharToRemove >= 0) { // Slow path: Contains chars that fall in the [u200E, u202E] range (so likely Bidi)