@@ -280,147 +280,245 @@ const std::error_category & __cdecl linux_category()
280280#define H_SURROGATE_END 0xDBFF
281281#define SURROGATE_PAIR_START 0x10000
282282
283- utf16string __cdecl conversions::utf8_to_utf16 (const std::string & s)
283+ inline size_t count_utf8_to_utf16 (const std::string& s)
284284{
285- #if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
286- std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
287- return conversion.from_bytes (s);
288- #else
289- utf16string dest;
290- // Save repeated heap allocations, use less than source string size assuming some
291- // of the characters are not just ASCII and collapse.
292- dest.reserve (static_cast <size_t >(static_cast <double >(s.size ()) * .70 ));
293-
294- for (auto src = s.begin (); src != s.end (); ++src)
285+ const size_t sSize = s.size ();
286+ const char * const sData = s.data ();
287+ size_t result{ sSize };
288+ for (size_t index = 0 ; index < sSize ;)
295289 {
296- if ((*src & BIT8) == 0 ) // single byte character, 0x0 to 0x7F
290+ const char c{ sData [index++] };
291+ if ((c & BIT8) == 0 )
297292 {
298- dest. push_back ( utf16string::value_type (*src)) ;
293+ continue ;
299294 }
300- else
295+
296+ if ((c & BIT7) == 0 )
301297 {
302- unsigned char numContBytes = 0 ;
303- uint32_t codePoint;
304- if ((*src & BIT7) == 0 )
298+ throw std::range_error (" UTF-8 string character can never start with 10xxxxxx" );
299+ }
300+ else if ((c & BIT6) == 0 ) // 2 byte character, 0x80 to 0x7FF
301+ {
302+ if (index == sSize )
305303 {
306- throw std::range_error (" UTF-8 string character can never start with 10xxxxxx " );
304+ throw std::range_error (" UTF-8 string is missing bytes in character " );
307305 }
308- else if ((*src & BIT6) == 0 ) // 2 byte character, 0x80 to 0x7FF
306+
307+ const char c2{ sData [index++] };
308+ if ((c2 & 0xC0 ) != BIT8)
309309 {
310- codePoint = *src & LOW_5BITS;
311- numContBytes = 1 ;
310+ throw std::range_error (" UTF-8 continuation byte is missing leading byte" );
312311 }
313- else if ((*src & BIT5) == 0 ) // 3 byte character, 0x800 to 0xFFFF
312+
313+ // can't require surrogates for 7FF
314+ --result;
315+ }
316+ else if ((c & BIT5) == 0 ) // 3 byte character, 0x800 to 0xFFFF
317+ {
318+ if (sSize - index < 2 )
314319 {
315- codePoint = *src & LOW_4BITS;
316- numContBytes = 2 ;
320+ throw std::range_error (" UTF-8 string is missing bytes in character" );
317321 }
318- else if ((*src & BIT4) == 0 ) // 4 byte character, 0x10000 to 0x10FFFF
322+
323+ const char c2{ sData [index++] };
324+ const char c3{ sData [index++] };
325+ if (((c2 | c3) & 0xC0 ) != BIT8)
319326 {
320- codePoint = *src & LOW_3BITS;
321- numContBytes = 3 ;
327+ throw std::range_error (" UTF-8 continuation byte is missing leading byte" );
322328 }
323- else
329+
330+ result -= 2 ;
331+ }
332+ else if ((c & BIT4) == 0 ) // 4 byte character, 0x10000 to 0x10FFFF
333+ {
334+ if (sSize - index < 3 )
324335 {
325- throw std::range_error (" UTF-8 string has invalid Unicode code point " );
336+ throw std::range_error (" UTF-8 string is missing bytes in character " );
326337 }
327338
328- for (unsigned char i = 0 ; i < numContBytes; ++i)
339+ const char c2{ sData [index++] };
340+ const char c3{ sData [index++] };
341+ const char c4{ sData [index++] };
342+ if (((c2 | c3 | c4) & 0xC0 ) != BIT8)
329343 {
330- if (++src == s.end ())
344+ throw std::range_error (" UTF-8 continuation byte is missing leading byte" );
345+ }
346+
347+ const uint32_t codePoint = ((c & LOW_3BITS) << 18 ) | ((c2 & LOW_6BITS) << 12 ) | ((c3 & LOW_6BITS) << 6 ) | (c4 & LOW_6BITS);
348+ result -= (3 - (codePoint >= SURROGATE_PAIR_START));
349+ }
350+ else
351+ {
352+ throw std::range_error (" UTF-8 string has invalid Unicode code point" );
353+ }
354+ }
355+
356+ return result;
357+ }
358+
359+ utf16string __cdecl conversions::utf8_to_utf16 (const std::string &s)
360+ {
361+ #if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
362+ std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
363+ return conversion.from_bytes (s);
364+ #else
365+ // Save repeated heap allocations, use the length of resulting sequence.
366+ const size_t srcSize = s.size ();
367+ const std::string::value_type* const srcData = &s[0 ];
368+ utf16string dest (count_utf8_to_utf16 (s), L' \0 ' );
369+ utf16string::value_type* const destData = &dest[0 ];
370+ size_t destIndex = 0 ;
371+
372+ for (size_t index = 0 ; index < srcSize; ++index)
373+ {
374+ std::string::value_type src = srcData[index];
375+ switch (src & 0xF0 )
376+ {
377+ case 0xF0 : // 4 byte character, 0x10000 to 0x10FFFF
378+ {
379+ const char c2{ srcData[++index] };
380+ const char c3{ srcData[++index] };
381+ const char c4{ srcData[++index] };
382+ uint32_t codePoint = ((src & LOW_3BITS) << 18 ) | ((c2 & LOW_6BITS) << 12 ) | ((c3 & LOW_6BITS) << 6 ) | (c4 & LOW_6BITS);
383+ if (codePoint >= SURROGATE_PAIR_START)
331384 {
332- throw std::range_error (" UTF-8 string is missing bytes in character" );
385+ // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
386+ // - 0x10000 is subtracted from the code point
387+ // - high surrogate is 0xD800 added to the top ten bits
388+ // - low surrogate is 0xDC00 added to the low ten bits
389+ codePoint -= SURROGATE_PAIR_START;
390+ destData[destIndex++] = static_cast <utf16string::value_type>((codePoint >> 10 ) | H_SURROGATE_START);
391+ destData[destIndex++] = static_cast <utf16string::value_type>((codePoint & 0x3FF ) | L_SURROGATE_START);
333392 }
334- if ((*src & BIT8) == 0 || (*src & BIT7) != 0 )
393+ else
335394 {
336- throw std::range_error (" UTF-8 continuation byte is missing leading byte" );
395+ // In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value.
396+ // U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode
397+ // them if encountered.
398+ destData[destIndex++] = static_cast <utf16string::value_type>(codePoint);
337399 }
338- codePoint <<= 6 ;
339- codePoint |= *src & LOW_6BITS;
340400 }
341-
342- if (codePoint >= SURROGATE_PAIR_START)
401+ break ;
402+ case 0xE0 : // 3 byte character, 0x800 to 0xFFFF
343403 {
344- // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
345- // - 0x10000 is subtracted from the code point
346- // - high surrogate is 0xD800 added to the top ten bits
347- // - low surrogate is 0xDC00 added to the low ten bits
348- codePoint -= SURROGATE_PAIR_START;
349- dest.push_back (utf16string::value_type ((codePoint >> 10 ) | H_SURROGATE_START));
350- dest.push_back (utf16string::value_type ((codePoint & 0x3FF ) | L_SURROGATE_START));
404+ const char c2{ srcData[++index] };
405+ const char c3{ srcData[++index] };
406+ destData[destIndex++] = ((src & LOW_4BITS) << 12 ) | ((c2 & LOW_6BITS) << 6 ) | (c3 & LOW_6BITS);
351407 }
352- else
408+ break ;
409+ case 0xD0 : // 2 byte character, 0x80 to 0x7FF
410+ case 0xC0 :
353411 {
354- // In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value.
355- // U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode
356- // them if encountered.
357- dest.push_back (utf16string::value_type (codePoint));
412+ const char c2{ srcData[++index] };
413+ destData[destIndex++] = static_cast <utf16string::value_type>(((src & LOW_5BITS) << 6 ) | (c2 & LOW_6BITS));
358414 }
415+ break ;
416+ default : // single byte character, 0x0 to 0x7F
417+ destData[destIndex++] = static_cast <utf16string::value_type>(src);
359418 }
360419 }
361420 return dest;
362421#endif
363422}
364423
365- std::string __cdecl conversions::utf16_to_utf8 (const utf16string &w)
366- {
367- #if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
368- std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
369- return conversion.to_bytes (w);
370- #else
371- std::string dest;
372- dest.reserve (w.size ());
373424
374- for (size_t index = 0 ; index < w.size (); ++index)
425+ inline size_t count_utf16_to_utf8 (const utf16string &w)
426+ {
427+ const utf16string::value_type * const srcData = &w[0 ];
428+ const size_t srcSize = w.size ();
429+ size_t destSize (srcSize);
430+ for (size_t index = 0 ; index < srcSize; ++index)
375431 {
376- auto src = w[index];
432+ const utf16string::value_type ch (srcData[index]);
433+ if (ch <= 0x7FF )
434+ {
435+ if (ch > 0x7F ) // 2 bytes needed (11 bits used)
436+ {
437+ ++destSize;
438+ }
439+ }
377440 // Check for high surrogate.
378- if (src >= H_SURROGATE_START && src <= H_SURROGATE_END)
441+ else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // 4 bytes need using 21 bits
379442 {
380- const auto highSurrogate = src;
381- ++index;
382- if (index == w.size ())
443+ ++index;
444+ if (index == srcSize)
383445 {
384446 throw std::range_error (" UTF-16 string is missing low surrogate" );
385447 }
386- const auto lowSurrogate = w[index];
448+
449+ const auto lowSurrogate = srcData[index];
387450 if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
388451 {
389452 throw std::range_error (" UTF-16 string has invalid low surrogate" );
390453 }
391454
392- // To get from surrogate pair to Unicode code point:
393- // - subract 0xD800 from high surrogate, this forms top ten bits
394- // - subract 0xDC00 from low surrogate, this forms low ten bits
395- // - add 0x10000
396- // Leaves a code point in U+10000 to U+10FFFF range.
397- uint32_t codePoint = highSurrogate - H_SURROGATE_START;
398- codePoint <<= 10 ;
399- codePoint |= lowSurrogate - L_SURROGATE_START;
400- codePoint += SURROGATE_PAIR_START;
401-
402- // 4 bytes need using 21 bits
403- dest.push_back (char ((codePoint >> 18 ) | 0xF0 )); // leading 3 bits
404- dest.push_back (char (((codePoint >> 12 ) & LOW_6BITS) | BIT8)); // next 6 bits
405- dest.push_back (char (((codePoint >> 6 ) & LOW_6BITS) | BIT8)); // next 6 bits
406- dest.push_back (char ((codePoint & LOW_6BITS) | BIT8)); // trailing 6 bits
455+ destSize += 2 ;
407456 }
408- else
457+ else // 3 bytes needed (16 bits used)
458+ {
459+ destSize += 2 ;
460+ }
461+ }
462+
463+ return destSize;
464+ }
465+
466+ std::string __cdecl conversions::utf16_to_utf8 (const utf16string &w)
467+ {
468+ #if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
469+ std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
470+ return conversion.to_bytes (w);
471+ #else
472+ const size_t srcSize = w.size ();
473+ const utf16string::value_type* const srcData = &w[0 ];
474+ std::string dest (count_utf16_to_utf8 (w), ' \0 ' );
475+ std::string::value_type* const destData = &dest[0 ];
476+ size_t destIndex (0 );
477+
478+ for (size_t index = 0 ; index < srcSize; ++index)
479+ {
480+ const utf16string::value_type src{ srcData[index] };
481+ if (src <= 0x7FF )
409482 {
410483 if (src <= 0x7F ) // single byte character
411484 {
412- dest. push_back ( static_cast <char >(src) );
485+ destData[destIndex++] = static_cast <char >(src);
413486 }
414- else if (src <= 0x7FF ) // 2 bytes needed (11 bits used)
487+ else // 2 bytes needed (11 bits used)
488+ {
489+ destData[destIndex++] = static_cast <char >(char ((src >> 6 ) | 0xC0 )); // leading 5 bits
490+ destData[destIndex++] = static_cast <char >(char ((src & LOW_6BITS) | BIT8)); // trailing 6 bits
491+ }
492+ }
493+ else
494+ {
495+ // Check for high surrogate.
496+ if (src >= H_SURROGATE_START && src <= H_SURROGATE_END)
415497 {
416- dest.push_back (char ((src >> 6 ) | 0xC0 )); // leading 5 bits
417- dest.push_back (char ((src & LOW_6BITS) | BIT8)); // trailing 6 bits
498+ const auto highSurrogate{ src };
499+ const auto lowSurrogate{ srcData[++index] };
500+
501+ // To get from surrogate pair to Unicode code point:
502+ // - subract 0xD800 from high surrogate, this forms top ten bits
503+ // - subract 0xDC00 from low surrogate, this forms low ten bits
504+ // - add 0x10000
505+ // Leaves a code point in U+10000 to U+10FFFF range.
506+ uint32_t codePoint = highSurrogate - H_SURROGATE_START;
507+ codePoint <<= 10 ;
508+ codePoint |= lowSurrogate - L_SURROGATE_START;
509+ codePoint += SURROGATE_PAIR_START;
510+
511+ // 4 bytes need using 21 bits
512+ destData[destIndex++] = static_cast <char >((codePoint >> 18 ) | 0xF0 ); // leading 3 bits
513+ destData[destIndex++] = static_cast <char >(((codePoint >> 12 ) & LOW_6BITS) | BIT8); // next 6 bits
514+ destData[destIndex++] = static_cast <char >(((codePoint >> 6 ) & LOW_6BITS) | BIT8); // next 6 bits
515+ destData[destIndex++] = static_cast <char >((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits
418516 }
419517 else // 3 bytes needed (16 bits used)
420518 {
421- dest. push_back ( char ((src >> 12 ) | 0xE0 ) ); // leading 4 bits
422- dest. push_back ( char (((src >> 6 ) & LOW_6BITS) | BIT8) ); // middle 6 bits
423- dest. push_back ( char ((src & LOW_6BITS) | BIT8) ); // trailing 6 bits
519+ destData[destIndex++] = static_cast < char > ((src >> 12 ) | 0xE0 ); // leading 4 bits
520+ destData[destIndex++] = static_cast < char > (((src >> 6 ) & LOW_6BITS) | BIT8); // middle 6 bits
521+ destData[destIndex++] = static_cast < char > ((src & LOW_6BITS) | BIT8); // trailing 6 bits
424522 }
425523 }
426524 }
0 commit comments