Skip to content

Commit c19e392

Browse files
committed
Properly handle non-BMP unicode escapes in JSON. Fixes microsoft#139
1 parent 2f2c64f commit c19e392

File tree

2 files changed

+60
-22
lines changed

2 files changed

+60
-22
lines changed

Release/src/json/json_parsing.cpp

Lines changed: 53 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -778,46 +778,77 @@ inline bool JSON_Parser::handle_unescape_char(Token &token)
778778
return true;
779779
case 'u':
780780
{
781-
// A four-hexdigit Unicode character.
782-
// Transform into a 16 bit code point.
783-
int decoded = 0;
784-
for (int i = 0; i < 4; ++i)
785-
{
786-
int ch_int = NextCharacter();
787-
if (ch_int < 0 || ch_int > 127)
788-
return false;
781+
auto decode_utf16_unit = [](JSON_Parser& parser, json_error& ec) {
782+
// A four-hexdigit Unicode character.
783+
// Transform into a 16 bit code point.
784+
int decoded = 0;
785+
for (int i = 0; i < 4; ++i)
786+
{
787+
int ch_int = parser.NextCharacter();
788+
if (ch_int < 0 || ch_int > 127) {
789+
ec = json_error::malformed_string_literal;
790+
return 0;
791+
}
789792
#ifdef _WIN32
790-
const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
793+
const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
791794
#else
792-
const int isxdigitResult = isxdigit(ch_int);
795+
const int isxdigitResult = isxdigit(ch_int);
793796
#endif
794-
if (!isxdigitResult)
795-
return false;
797+
if (!isxdigitResult)
798+
{
799+
ec = json_error::malformed_string_literal;
800+
return 0;
801+
}
796802

797-
int val = _hexval[static_cast<size_t>(ch_int)];
798-
_ASSERTE(val != -1);
803+
int val = _hexval[static_cast<size_t>(ch_int)];
804+
_ASSERTE(val != -1);
799805

800-
// Add the input char to the decoded number
801-
decoded |= (val << (4 * (3 - i)));
802-
}
806+
// Add the input char to the decoded number
807+
decoded |= (val << (4 * (3 - i)));
808+
}
809+
810+
return decoded;
811+
};
803812

804813
// Construct the character based on the decoded number
805814
// Convert the code unit into a UTF-8 sequence
806-
// TODO: Improve detection of surrogate pair + error handling
807815
utf16string utf16;
816+
auto decoded = decode_utf16_unit(*this, token.m_error);
817+
if (token.m_error)
818+
return false;
808819
utf16.push_back(static_cast<utf16char>(decoded));
809-
utf8string utf8;
820+
821+
if (decoded >= 0xD800)
822+
{
823+
// Decoded a high surrogate. Attempt to grab low surrogate.
824+
if (NextCharacter() != '\\')
825+
{
826+
token.m_error = json_error::malformed_string_literal;
827+
return false;
828+
}
829+
if (NextCharacter() != 'u')
830+
{
831+
token.m_error = json_error::malformed_string_literal;
832+
return false;
833+
}
834+
decoded = decode_utf16_unit(*this, token.m_error);
835+
if (token.m_error)
836+
return false;
837+
utf16.push_back(static_cast<utf16char>(decoded));
838+
}
839+
810840
try
811841
{
842+
utf8string utf8;
812843
utf8 = ::utility::conversions::utf16_to_utf8(utf16);
844+
token.string_val.append(utf8);
845+
return true;
813846
}
814847
catch (...)
815848
{
816849
token.m_error = json_error::malformed_string_literal;
850+
return false;
817851
}
818-
token.string_val.append(utf8);
819-
820-
return true;
821852
}
822853
default:
823854
// BUG: This is incorrect behavior; all characters MAY be escaped, and should be added as-is.

Release/tests/functional/json/parsing_tests.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -220,6 +220,13 @@ TEST(escaped_unicode_string)
220220
const auto euro = "\xE2\x82\xAC";
221221
VERIFY_ARE_EQUAL(euro, str.as_string());
222222

223+
// Test for surrogate pairs of unicode escapes
224+
str = web::json::value::parse(U("\"\\ud83c\\uddee\""));
225+
VERIFY_ARE_EQUAL("\xF0\x9F\x87\xAE", str.as_string());
226+
227+
// Should error if a unicode escape is missing its lower surrogate
228+
VERIFY_PARSING_THROW(json::value::parse(U("\"\\ud83c\"")));
229+
223230
VERIFY_PARSING_THROW(json::value::parse(U("\"\\u0klB\"")));
224231
}
225232

0 commit comments

Comments
 (0)