Skip to content

Commit f9d32df

Browse files
use pointers instead of operator[], extend to utf8_to_utf16
1 parent b2aadf6 commit f9d32df

File tree

1 file changed

+186
-88
lines changed

1 file changed

+186
-88
lines changed

Release/src/utilities/asyncrt_utils.cpp

Lines changed: 186 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -280,147 +280,245 @@ const std::error_category & __cdecl linux_category()
280280
#define H_SURROGATE_END 0xDBFF
281281
#define SURROGATE_PAIR_START 0x10000
282282

283-
utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
283+
inline size_t count_utf8_to_utf16(const std::string& s)
284284
{
285-
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
286-
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
287-
return conversion.from_bytes(s);
288-
#else
289-
utf16string dest;
290-
// Save repeated heap allocations, use less than source string size assuming some
291-
// of the characters are not just ASCII and collapse.
292-
dest.reserve(static_cast<size_t>(static_cast<double>(s.size()) * .70));
293-
294-
for (auto src = s.begin(); src != s.end(); ++src)
285+
const size_t sSize = s.size();
286+
const char* const sData = s.data();
287+
size_t result{ sSize };
288+
for (size_t index = 0; index < sSize;)
295289
{
296-
if ((*src & BIT8) == 0) // single byte character, 0x0 to 0x7F
290+
const char c{ sData[index++] };
291+
if ((c & BIT8) == 0)
297292
{
298-
dest.push_back(utf16string::value_type(*src));
293+
continue;
299294
}
300-
else
295+
296+
if ((c & BIT7) == 0)
301297
{
302-
unsigned char numContBytes = 0;
303-
uint32_t codePoint;
304-
if ((*src & BIT7) == 0)
298+
throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
299+
}
300+
else if ((c & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
301+
{
302+
if (index == sSize)
305303
{
306-
throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
304+
throw std::range_error("UTF-8 string is missing bytes in character");
307305
}
308-
else if ((*src & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
306+
307+
const char c2{ sData[index++] };
308+
if ((c2 & 0xC0) != BIT8)
309309
{
310-
codePoint = *src & LOW_5BITS;
311-
numContBytes = 1;
310+
throw std::range_error("UTF-8 continuation byte is missing leading byte");
312311
}
313-
else if ((*src & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
312+
313+
// a 2-byte sequence encodes at most U+07FF, so it can never require a surrogate pair
314+
--result;
315+
}
316+
else if ((c & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
317+
{
318+
if (sSize - index < 2)
314319
{
315-
codePoint = *src & LOW_4BITS;
316-
numContBytes = 2;
320+
throw std::range_error("UTF-8 string is missing bytes in character");
317321
}
318-
else if ((*src & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
322+
323+
const char c2{ sData[index++] };
324+
const char c3{ sData[index++] };
325+
if (((c2 | c3) & 0xC0) != BIT8)
319326
{
320-
codePoint = *src & LOW_3BITS;
321-
numContBytes = 3;
327+
throw std::range_error("UTF-8 continuation byte is missing leading byte");
322328
}
323-
else
329+
330+
result -= 2;
331+
}
332+
else if ((c & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
333+
{
334+
if (sSize - index < 3)
324335
{
325-
throw std::range_error("UTF-8 string has invalid Unicode code point");
336+
throw std::range_error("UTF-8 string is missing bytes in character");
326337
}
327338

328-
for (unsigned char i = 0; i < numContBytes; ++i)
339+
const char c2{ sData[index++] };
340+
const char c3{ sData[index++] };
341+
const char c4{ sData[index++] };
342+
if (((c2 | c3 | c4) & 0xC0) != BIT8)
329343
{
330-
if (++src == s.end())
344+
throw std::range_error("UTF-8 continuation byte is missing leading byte");
345+
}
346+
347+
const uint32_t codePoint = ((c & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
348+
result -= (3 - (codePoint >= SURROGATE_PAIR_START));
349+
}
350+
else
351+
{
352+
throw std::range_error("UTF-8 string has invalid Unicode code point");
353+
}
354+
}
355+
356+
return result;
357+
}
358+
359+
utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
360+
{
361+
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
362+
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
363+
return conversion.from_bytes(s);
364+
#else
365+
// Save repeated heap allocations, use the length of resulting sequence.
366+
const size_t srcSize = s.size();
367+
const std::string::value_type* const srcData = &s[0];
368+
utf16string dest(count_utf8_to_utf16(s), L'\0');
369+
utf16string::value_type* const destData = &dest[0];
370+
size_t destIndex = 0;
371+
372+
for (size_t index = 0; index < srcSize; ++index)
373+
{
374+
std::string::value_type src = srcData[index];
375+
switch (src & 0xF0)
376+
{
377+
case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF
378+
{
379+
const char c2{ srcData[++index] };
380+
const char c3{ srcData[++index] };
381+
const char c4{ srcData[++index] };
382+
uint32_t codePoint = ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
383+
if (codePoint >= SURROGATE_PAIR_START)
331384
{
332-
throw std::range_error("UTF-8 string is missing bytes in character");
385+
// In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
386+
// - 0x10000 is subtracted from the code point
387+
// - high surrogate is 0xD800 added to the top ten bits
388+
// - low surrogate is 0xDC00 added to the low ten bits
389+
codePoint -= SURROGATE_PAIR_START;
390+
destData[destIndex++] = static_cast<utf16string::value_type>((codePoint >> 10) | H_SURROGATE_START);
391+
destData[destIndex++] = static_cast<utf16string::value_type>((codePoint & 0x3FF) | L_SURROGATE_START);
333392
}
334-
if ((*src & BIT8) == 0 || (*src & BIT7) != 0)
393+
else
335394
{
336-
throw std::range_error("UTF-8 continuation byte is missing leading byte");
395+
// In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value.
396+
// U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode
397+
// them if encountered.
398+
destData[destIndex++] = static_cast<utf16string::value_type>(codePoint);
337399
}
338-
codePoint <<= 6;
339-
codePoint |= *src & LOW_6BITS;
340400
}
341-
342-
if (codePoint >= SURROGATE_PAIR_START)
401+
break;
402+
case 0xE0: // 3 byte character, 0x800 to 0xFFFF
343403
{
344-
// In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
345-
// - 0x10000 is subtracted from the code point
346-
// - high surrogate is 0xD800 added to the top ten bits
347-
// - low surrogate is 0xDC00 added to the low ten bits
348-
codePoint -= SURROGATE_PAIR_START;
349-
dest.push_back(utf16string::value_type((codePoint >> 10) | H_SURROGATE_START));
350-
dest.push_back(utf16string::value_type((codePoint & 0x3FF) | L_SURROGATE_START));
404+
const char c2{ srcData[++index] };
405+
const char c3{ srcData[++index] };
406+
destData[destIndex++] = ((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS);
351407
}
352-
else
408+
break;
409+
case 0xD0: // 2 byte character, 0x80 to 0x7FF
410+
case 0xC0:
353411
{
354-
// In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value.
355-
// U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode
356-
// them if encountered.
357-
dest.push_back(utf16string::value_type(codePoint));
412+
const char c2{ srcData[++index] };
413+
destData[destIndex++] = static_cast<utf16string::value_type>(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS));
358414
}
415+
break;
416+
default: // single byte character, 0x0 to 0x7F
417+
destData[destIndex++] = static_cast<utf16string::value_type>(src);
359418
}
360419
}
361420
return dest;
362421
#endif
363422
}
364423

365-
std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
366-
{
367-
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
368-
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
369-
return conversion.to_bytes(w);
370-
#else
371-
std::string dest;
372-
dest.reserve(w.size());
373424

374-
for (size_t index = 0; index < w.size(); ++index)
425+
inline size_t count_utf16_to_utf8(const utf16string &w)
426+
{
427+
const utf16string::value_type * const srcData = &w[0];
428+
const size_t srcSize = w.size();
429+
size_t destSize(srcSize);
430+
for (size_t index = 0; index < srcSize; ++index)
375431
{
376-
auto src = w[index];
432+
const utf16string::value_type ch(srcData[index]);
433+
if (ch <= 0x7FF)
434+
{
435+
if (ch > 0x7F) // 2 bytes needed (11 bits used)
436+
{
437+
++destSize;
438+
}
439+
}
377440
// Check for high surrogate.
378-
if (src >= H_SURROGATE_START && src <= H_SURROGATE_END)
441+
else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // 4 bytes need using 21 bits
379442
{
380-
const auto highSurrogate = src;
381-
++index;
382-
if (index == w.size())
443+
++index;
444+
if (index == srcSize)
383445
{
384446
throw std::range_error("UTF-16 string is missing low surrogate");
385447
}
386-
const auto lowSurrogate = w[index];
448+
449+
const auto lowSurrogate = srcData[index];
387450
if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
388451
{
389452
throw std::range_error("UTF-16 string has invalid low surrogate");
390453
}
391454

392-
// To get from surrogate pair to Unicode code point:
393-
// - subtract 0xD800 from high surrogate, this forms top ten bits
394-
// - subtract 0xDC00 from low surrogate, this forms low ten bits
395-
// - add 0x10000
396-
// Leaves a code point in U+10000 to U+10FFFF range.
397-
uint32_t codePoint = highSurrogate - H_SURROGATE_START;
398-
codePoint <<= 10;
399-
codePoint |= lowSurrogate - L_SURROGATE_START;
400-
codePoint += SURROGATE_PAIR_START;
401-
402-
// 4 bytes needed, using 21 bits
403-
dest.push_back(char((codePoint >> 18) | 0xF0)); // leading 3 bits
404-
dest.push_back(char(((codePoint >> 12) & LOW_6BITS) | BIT8)); // next 6 bits
405-
dest.push_back(char(((codePoint >> 6) & LOW_6BITS) | BIT8)); // next 6 bits
406-
dest.push_back(char((codePoint & LOW_6BITS) | BIT8)); // trailing 6 bits
455+
destSize += 2;
407456
}
408-
else
457+
else // 3 bytes needed (16 bits used)
458+
{
459+
destSize += 2;
460+
}
461+
}
462+
463+
return destSize;
464+
}
465+
466+
std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
467+
{
468+
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
469+
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
470+
return conversion.to_bytes(w);
471+
#else
472+
const size_t srcSize = w.size();
473+
const utf16string::value_type* const srcData = &w[0];
474+
std::string dest(count_utf16_to_utf8(w), '\0');
475+
std::string::value_type* const destData = &dest[0];
476+
size_t destIndex(0);
477+
478+
for (size_t index = 0; index < srcSize; ++index)
479+
{
480+
const utf16string::value_type src{ srcData[index] };
481+
if (src <= 0x7FF)
409482
{
410483
if (src <= 0x7F) // single byte character
411484
{
412-
dest.push_back(static_cast<char>(src));
485+
destData[destIndex++] = static_cast<char>(src);
413486
}
414-
else if (src <= 0x7FF) // 2 bytes needed (11 bits used)
487+
else // 2 bytes needed (11 bits used)
488+
{
489+
destData[destIndex++] = static_cast<char>(char((src >> 6) | 0xC0)); // leading 5 bits
490+
destData[destIndex++] = static_cast<char>(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits
491+
}
492+
}
493+
else
494+
{
495+
// Check for high surrogate.
496+
if (src >= H_SURROGATE_START && src <= H_SURROGATE_END)
415497
{
416-
dest.push_back(char((src >> 6) | 0xC0)); // leading 5 bits
417-
dest.push_back(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits
498+
const auto highSurrogate{ src };
499+
const auto lowSurrogate{ srcData[++index] };
500+
501+
// To get from surrogate pair to Unicode code point:
502+
// - subtract 0xD800 from high surrogate, this forms top ten bits
503+
// - subtract 0xDC00 from low surrogate, this forms low ten bits
504+
// - add 0x10000
505+
// Leaves a code point in U+10000 to U+10FFFF range.
506+
uint32_t codePoint = highSurrogate - H_SURROGATE_START;
507+
codePoint <<= 10;
508+
codePoint |= lowSurrogate - L_SURROGATE_START;
509+
codePoint += SURROGATE_PAIR_START;
510+
511+
// 4 bytes needed, using 21 bits
512+
destData[destIndex++] = static_cast<char>((codePoint >> 18) | 0xF0); // leading 3 bits
513+
destData[destIndex++] = static_cast<char>(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits
514+
destData[destIndex++] = static_cast<char>(((codePoint >> 6) & LOW_6BITS) | BIT8); // next 6 bits
515+
destData[destIndex++] = static_cast<char>((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits
418516
}
419517
else // 3 bytes needed (16 bits used)
420518
{
421-
dest.push_back(char((src >> 12) | 0xE0)); // leading 4 bits
422-
dest.push_back(char(((src >> 6) & LOW_6BITS) | BIT8)); // middle 6 bits
423-
dest.push_back(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits
519+
destData[destIndex++] = static_cast<char>((src >> 12) | 0xE0); // leading 4 bits
520+
destData[destIndex++] = static_cast<char>(((src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits
521+
destData[destIndex++] = static_cast<char>((src & LOW_6BITS) | BIT8); // trailing 6 bits
424522
}
425523
}
426524
}

0 commit comments

Comments
 (0)