Properly handle non-BMP unicode escapes in JSON. Fixes microsoft#139

ras0219-msft · ras0219-msft · commit c19e39249667 · 2016-05-02T16:08:13.000-07:00
diff --git a/Release/src/json/json_parsing.cpp b/Release/src/json/json_parsing.cpp
@@ -778,46 +778,77 @@ inline bool JSON_Parser::handle_unescape_char(Token &token)
             return true;
         case 'u':
         {
-            // A four-hexdigit Unicode character.
-            // Transform into a 16 bit code point.
-            int decoded = 0;
-            for (int i = 0; i < 4; ++i)
-            {
-                int ch_int = NextCharacter();
-                if (ch_int < 0 || ch_int > 127)
-                    return false;
+            auto decode_utf16_unit = [](JSON_Parser& parser, json_error& ec) {
+                // Reads four hex digits from the parser and packs them into one 16-bit UTF-16 code unit.
+                // On a character outside 0..127 or a non-hex digit, sets ec to malformed_string_literal and returns 0.
+                int decoded = 0;
+                for (int i = 0; i < 4; ++i)
+                {
+                    int ch_int = parser.NextCharacter();
+                    if (ch_int < 0 || ch_int > 127) { // negative or non-ASCII input cannot be a hex digit
+                        ec = json_error::malformed_string_literal;
+                        return 0; // placeholder value; the caller checks the error code before using it
+                    }
 #ifdef _WIN32
-                const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
+                    const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
 #else
-                const int isxdigitResult = isxdigit(ch_int);
+                    const int isxdigitResult = isxdigit(ch_int);
 #endif
-                if (!isxdigitResult)
-                    return false;
+                    if (!isxdigitResult)
+                    {
+                        ec = json_error::malformed_string_literal;
+                        return 0;
+                    }
 
-                int val = _hexval[static_cast<size_t>(ch_int)];
-                _ASSERTE(val != -1);
+                    int val = _hexval[static_cast<size_t>(ch_int)];
+                    _ASSERTE(val != -1);
 
-                // Add the input char to the decoded number
-                decoded |= (val << (4 * (3 - i)));
-            }
+                    // OR this digit's value into place; the first digit read is the most significant nibble
+                    decoded |= (val << (4 * (3 - i)));
+                }
+
+                return decoded;
+            };
 
             // Construct the character based on the decoded number
             // Convert the code unit into a UTF-8 sequence
-            // TODO: Improve detection of surrogate pair + error handling
             utf16string utf16;
+            auto decoded = decode_utf16_unit(*this, token.m_error);
+            if (token.m_error)
+                return false;
             utf16.push_back(static_cast<utf16char>(decoded));
-            utf8string utf8;
+
+            if (decoded >= 0xD800)
+            {
+                // Treated as a high surrogate: a second unicode escape holding the low surrogate must follow. NOTE(review): >= 0xD800 also matches 0xDC00-0xFFFF.
+                if (NextCharacter() != '\\')
+                {
+                    token.m_error = json_error::malformed_string_literal;
+                    return false;
+                }
+                if (NextCharacter() != 'u')
+                {
+                    token.m_error = json_error::malformed_string_literal;
+                    return false;
+                }
+                decoded = decode_utf16_unit(*this, token.m_error);
+                if (token.m_error)
+                    return false;
+                utf16.push_back(static_cast<utf16char>(decoded));
+            }
+
             try
             {
+                utf8string utf8;
                 utf8 = ::utility::conversions::utf16_to_utf8(utf16);
+                token.string_val.append(utf8);
+                return true;
             }
             catch (...)
             {
                 token.m_error = json_error::malformed_string_literal;
+                return false;
             }
-            token.string_val.append(utf8);
-
-            return true;
         }
         default:
             // BUG: This is incorrect behavior; all characters MAY be escaped, and should be added as-is.
diff --git a/Release/tests/functional/json/parsing_tests.cpp b/Release/tests/functional/json/parsing_tests.cpp
@@ -220,6 +220,13 @@ TEST(escaped_unicode_string)
     const auto euro = "\xE2\x82\xAC";
     VERIFY_ARE_EQUAL(euro, str.as_string());
 
+    // A surrogate pair of Unicode escapes must decode to a single non-BMP code point (U+1F1EE)
+    str = web::json::value::parse(U("\"\\ud83c\\uddee\""));
+    VERIFY_ARE_EQUAL("\xF0\x9F\x87\xAE", str.as_string());
+
+    // Should error if a Unicode escape holding a high surrogate is not followed by its low surrogate
+    VERIFY_PARSING_THROW(json::value::parse(U("\"\\ud83c\"")));
+
     VERIFY_PARSING_THROW(json::value::parse(U("\"\\u0klB\"")));
 }