Skip to content

Commit cf91c13

Browse files
authored
Merge pull request microsoft#251 from JasonYang-MSFT/development
Improve the performance of conversion between UTF16 and UTF8 strings.
2 parents 006c09f + f9d32df commit cf91c13

File tree

1 file changed

+187
-86
lines changed

1 file changed

+187
-86
lines changed

Release/src/utilities/asyncrt_utils.cpp

Lines changed: 187 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -280,144 +280,245 @@ const std::error_category & __cdecl linux_category()
280280
#define H_SURROGATE_END 0xDBFF
281281
#define SURROGATE_PAIR_START 0x10000
282282

283-
utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
283+
inline size_t count_utf8_to_utf16(const std::string& s)
284284
{
285-
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
286-
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
287-
return conversion.from_bytes(s);
288-
#else
289-
utf16string dest;
290-
// Save repeated heap allocations, use less than source string size assuming some
291-
// of the characters are not just ASCII and collapse.
292-
dest.reserve(static_cast<size_t>(static_cast<double>(s.size()) * .70));
293-
294-
for (auto src = s.begin(); src != s.end(); ++src)
285+
const size_t sSize = s.size();
286+
const char* const sData = s.data();
287+
size_t result{ sSize };
288+
for (size_t index = 0; index < sSize;)
295289
{
296-
if ((*src & BIT8) == 0) // single byte character, 0x0 to 0x7F
290+
const char c{ sData[index++] };
291+
if ((c & BIT8) == 0)
297292
{
298-
dest.push_back(utf16string::value_type(*src));
293+
continue;
299294
}
300-
else
295+
296+
if ((c & BIT7) == 0)
301297
{
302-
unsigned char numContBytes = 0;
303-
uint32_t codePoint;
304-
if ((*src & BIT7) == 0)
298+
throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
299+
}
300+
else if ((c & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
301+
{
302+
if (index == sSize)
305303
{
306-
throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
304+
throw std::range_error("UTF-8 string is missing bytes in character");
307305
}
308-
else if ((*src & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
306+
307+
const char c2{ sData[index++] };
308+
if ((c2 & 0xC0) != BIT8)
309309
{
310-
codePoint = *src & LOW_5BITS;
311-
numContBytes = 1;
310+
throw std::range_error("UTF-8 continuation byte is missing leading byte");
312311
}
313-
else if ((*src & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
312+
313+
// can't require surrogates for 7FF
314+
--result;
315+
}
316+
else if ((c & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
317+
{
318+
if (sSize - index < 2)
314319
{
315-
codePoint = *src & LOW_4BITS;
316-
numContBytes = 2;
320+
throw std::range_error("UTF-8 string is missing bytes in character");
317321
}
318-
else if ((*src & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
322+
323+
const char c2{ sData[index++] };
324+
const char c3{ sData[index++] };
325+
if (((c2 | c3) & 0xC0) != BIT8)
319326
{
320-
codePoint = *src & LOW_3BITS;
321-
numContBytes = 3;
327+
throw std::range_error("UTF-8 continuation byte is missing leading byte");
322328
}
323-
else
329+
330+
result -= 2;
331+
}
332+
else if ((c & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
333+
{
334+
if (sSize - index < 3)
324335
{
325-
throw std::range_error("UTF-8 string has invalid Unicode code point");
336+
throw std::range_error("UTF-8 string is missing bytes in character");
326337
}
327338

328-
for (unsigned char i = 0; i < numContBytes; ++i)
339+
const char c2{ sData[index++] };
340+
const char c3{ sData[index++] };
341+
const char c4{ sData[index++] };
342+
if (((c2 | c3 | c4) & 0xC0) != BIT8)
343+
{
344+
throw std::range_error("UTF-8 continuation byte is missing leading byte");
345+
}
346+
347+
const uint32_t codePoint = ((c & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
348+
result -= (3 - (codePoint >= SURROGATE_PAIR_START));
349+
}
350+
else
351+
{
352+
throw std::range_error("UTF-8 string has invalid Unicode code point");
353+
}
354+
}
355+
356+
return result;
357+
}
358+
359+
utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
360+
{
361+
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
362+
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
363+
return conversion.from_bytes(s);
364+
#else
365+
// Save repeated heap allocations, use the length of resulting sequence.
366+
const size_t srcSize = s.size();
367+
const std::string::value_type* const srcData = &s[0];
368+
utf16string dest(count_utf8_to_utf16(s), L'\0');
369+
utf16string::value_type* const destData = &dest[0];
370+
size_t destIndex = 0;
371+
372+
for (size_t index = 0; index < srcSize; ++index)
373+
{
374+
std::string::value_type src = srcData[index];
375+
switch (src & 0xF0)
376+
{
377+
case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF
329378
{
330-
if (++src == s.end())
379+
const char c2{ srcData[++index] };
380+
const char c3{ srcData[++index] };
381+
const char c4{ srcData[++index] };
382+
uint32_t codePoint = ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
383+
if (codePoint >= SURROGATE_PAIR_START)
331384
{
332-
throw std::range_error("UTF-8 string is missing bytes in character");
385+
// In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
386+
// - 0x10000 is subtracted from the code point
387+
// - high surrogate is 0xD800 added to the top ten bits
388+
// - low surrogate is 0xDC00 added to the low ten bits
389+
codePoint -= SURROGATE_PAIR_START;
390+
destData[destIndex++] = static_cast<utf16string::value_type>((codePoint >> 10) | H_SURROGATE_START);
391+
destData[destIndex++] = static_cast<utf16string::value_type>((codePoint & 0x3FF) | L_SURROGATE_START);
333392
}
334-
if ((*src & BIT8) == 0 || (*src & BIT7) != 0)
393+
else
335394
{
336-
throw std::range_error("UTF-8 continuation byte is missing leading byte");
395+
// In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value.
396+
// U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode
397+
// them if encountered.
398+
destData[destIndex++] = static_cast<utf16string::value_type>(codePoint);
337399
}
338-
codePoint <<= 6;
339-
codePoint |= *src & LOW_6BITS;
340400
}
341-
342-
if (codePoint >= SURROGATE_PAIR_START)
401+
break;
402+
case 0xE0: // 3 byte character, 0x800 to 0xFFFF
343403
{
344-
// In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
345-
// - 0x10000 is subtracted from the code point
346-
// - high surrogate is 0xD800 added to the top ten bits
347-
// - low surrogate is 0xDC00 added to the low ten bits
348-
codePoint -= SURROGATE_PAIR_START;
349-
dest.push_back(utf16string::value_type((codePoint >> 10) | H_SURROGATE_START));
350-
dest.push_back(utf16string::value_type((codePoint & 0x3FF) | L_SURROGATE_START));
404+
const char c2{ srcData[++index] };
405+
const char c3{ srcData[++index] };
406+
destData[destIndex++] = ((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS);
351407
}
352-
else
408+
break;
409+
case 0xD0: // 2 byte character, 0x80 to 0x7FF
410+
case 0xC0:
353411
{
354-
// In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value.
355-
// U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode
356-
// them if encountered.
357-
dest.push_back(utf16string::value_type(codePoint));
412+
const char c2{ srcData[++index] };
413+
destData[destIndex++] = static_cast<utf16string::value_type>(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS));
358414
}
415+
break;
416+
default: // single byte character, 0x0 to 0x7F
417+
destData[destIndex++] = static_cast<utf16string::value_type>(src);
359418
}
360419
}
361420
return dest;
362421
#endif
363422
}
364423

365-
std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
424+
425+
inline size_t count_utf16_to_utf8(const utf16string &w)
366426
{
367-
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
368-
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
369-
return conversion.to_bytes(w);
370-
#else
371-
std::string dest;
372-
dest.reserve(w.size());
373-
for (auto src = w.begin(); src != w.end(); ++src)
427+
const utf16string::value_type * const srcData = &w[0];
428+
const size_t srcSize = w.size();
429+
size_t destSize(srcSize);
430+
for (size_t index = 0; index < srcSize; ++index)
374431
{
432+
const utf16string::value_type ch(srcData[index]);
433+
if (ch <= 0x7FF)
434+
{
435+
if (ch > 0x7F) // 2 bytes needed (11 bits used)
436+
{
437+
++destSize;
438+
}
439+
}
375440
// Check for high surrogate.
376-
if (*src >= H_SURROGATE_START && *src <= H_SURROGATE_END)
441+
else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // 4 bytes need using 21 bits
377442
{
378-
const auto highSurrogate = *src++;
379-
if (src == w.end())
443+
++index;
444+
if (index == srcSize)
380445
{
381446
throw std::range_error("UTF-16 string is missing low surrogate");
382447
}
383-
const auto lowSurrogate = *src;
448+
449+
const auto lowSurrogate = srcData[index];
384450
if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
385451
{
386452
throw std::range_error("UTF-16 string has invalid low surrogate");
387453
}
388454

389-
// To get from surrogate pair to Unicode code point:
390-
// - subtract 0xD800 from high surrogate, this forms top ten bits
391-
// - subtract 0xDC00 from low surrogate, this forms low ten bits
392-
// - add 0x10000
393-
// Leaves a code point in U+10000 to U+10FFFF range.
394-
uint32_t codePoint = highSurrogate - H_SURROGATE_START;
395-
codePoint <<= 10;
396-
codePoint |= lowSurrogate - L_SURROGATE_START;
397-
codePoint += SURROGATE_PAIR_START;
398-
399-
// 4 bytes needed (21 bits used)
400-
dest.push_back(char((codePoint >> 18) | 0xF0)); // leading 3 bits
401-
dest.push_back(char(((codePoint >> 12) & LOW_6BITS) | BIT8)); // next 6 bits
402-
dest.push_back(char(((codePoint >> 6) & LOW_6BITS) | BIT8)); // next 6 bits
403-
dest.push_back(char((codePoint & LOW_6BITS) | BIT8)); // trailing 6 bits
455+
destSize += 2;
404456
}
405-
else
457+
else // 3 bytes needed (16 bits used)
406458
{
407-
if (*src <= 0x7F) // single byte character
459+
destSize += 2;
460+
}
461+
}
462+
463+
return destSize;
464+
}
465+
466+
std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
467+
{
468+
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
469+
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
470+
return conversion.to_bytes(w);
471+
#else
472+
const size_t srcSize = w.size();
473+
const utf16string::value_type* const srcData = &w[0];
474+
std::string dest(count_utf16_to_utf8(w), '\0');
475+
std::string::value_type* const destData = &dest[0];
476+
size_t destIndex(0);
477+
478+
for (size_t index = 0; index < srcSize; ++index)
479+
{
480+
const utf16string::value_type src{ srcData[index] };
481+
if (src <= 0x7FF)
482+
{
483+
if (src <= 0x7F) // single byte character
408484
{
409-
dest.push_back(static_cast<char>(*src));
485+
destData[destIndex++] = static_cast<char>(src);
410486
}
411-
else if (*src <= 0x7FF) // 2 bytes needed (11 bits used)
487+
else // 2 bytes needed (11 bits used)
488+
{
489+
destData[destIndex++] = static_cast<char>(char((src >> 6) | 0xC0)); // leading 5 bits
490+
destData[destIndex++] = static_cast<char>(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits
491+
}
492+
}
493+
else
494+
{
495+
// Check for high surrogate.
496+
if (src >= H_SURROGATE_START && src <= H_SURROGATE_END)
412497
{
413-
dest.push_back(char((*src >> 6) | 0xC0)); // leading 5 bits
414-
dest.push_back(char((*src & LOW_6BITS) | BIT8)); // trailing 6 bits
498+
const auto highSurrogate{ src };
499+
const auto lowSurrogate{ srcData[++index] };
500+
501+
// To get from surrogate pair to Unicode code point:
502+
// - subtract 0xD800 from high surrogate, this forms top ten bits
503+
// - subtract 0xDC00 from low surrogate, this forms low ten bits
504+
// - add 0x10000
505+
// Leaves a code point in U+10000 to U+10FFFF range.
506+
uint32_t codePoint = highSurrogate - H_SURROGATE_START;
507+
codePoint <<= 10;
508+
codePoint |= lowSurrogate - L_SURROGATE_START;
509+
codePoint += SURROGATE_PAIR_START;
510+
511+
// 4 bytes needed (21 bits used)
512+
destData[destIndex++] = static_cast<char>((codePoint >> 18) | 0xF0); // leading 3 bits
513+
destData[destIndex++] = static_cast<char>(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits
514+
destData[destIndex++] = static_cast<char>(((codePoint >> 6) & LOW_6BITS) | BIT8); // next 6 bits
515+
destData[destIndex++] = static_cast<char>((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits
415516
}
416517
else // 3 bytes needed (16 bits used)
417518
{
418-
dest.push_back(char((*src >> 12) | 0xE0)); // leading 4 bits
419-
dest.push_back(char(((*src >> 6) & LOW_6BITS) | BIT8)); // middle 6 bits
420-
dest.push_back(char((*src & LOW_6BITS) | BIT8)); // trailing 6 bits
519+
destData[destIndex++] = static_cast<char>((src >> 12) | 0xE0); // leading 4 bits
520+
destData[destIndex++] = static_cast<char>(((src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits
521+
destData[destIndex++] = static_cast<char>((src & LOW_6BITS) | BIT8); // trailing 6 bits
421522
}
422523
}
423524
}

0 commit comments

Comments
 (0)