From 829e99beff84a400cd005dbe22f3cf4be01e9198 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Fri, 31 Aug 2018 16:57:03 +0200 Subject: [PATCH 1/2] src: allow UTF-16 in generic StringBytes decode call This allows removing a number of special cases in other parts of the code, at least one of which was incorrect in expecting aligned input when that was not guaranteed. Fixes: https://github.com/nodejs/node/issues/22358 --- src/node.h | 1 - src/node_buffer.cc | 59 ---------------------------- src/string_bytes.cc | 52 ++++++++++++------------ src/string_bytes.h | 2 +- src/string_decoder.cc | 10 ----- test/parallel/test-string-decoder.js | 8 ++++ 6 files changed, 36 insertions(+), 96 deletions(-) diff --git a/src/node.h b/src/node.h index 74403a6e48d3b8..1a844a6c7cb9cc 100644 --- a/src/node.h +++ b/src/node.h @@ -407,7 +407,6 @@ NODE_DEPRECATED("Use FatalException(isolate, ...)", return FatalException(v8::Isolate::GetCurrent(), try_catch); }) -// Don't call with encoding=UCS2. NODE_EXTERN v8::Local Encode(v8::Isolate* isolate, const char* buf, size_t len, diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 9a280f7c127cc6..816cbf73605c78 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -464,65 +464,6 @@ void StringSlice(const FunctionCallbackInfo& args) { } -template <> -void StringSlice(const FunctionCallbackInfo& args) { - Isolate* isolate = args.GetIsolate(); - Environment* env = Environment::GetCurrent(isolate); - - THROW_AND_RETURN_UNLESS_BUFFER(env, args.This()); - SPREAD_BUFFER_ARG(args.This(), ts_obj); - - if (ts_obj_length == 0) - return args.GetReturnValue().SetEmptyString(); - - SLICE_START_END(args[0], args[1], ts_obj_length) - length /= 2; - - const char* data = ts_obj_data + start; - const uint16_t* buf; - bool release = false; - - // Node's "ucs2" encoding expects LE character data inside a Buffer, so we - // need to reorder on BE platforms. See https://nodejs.org/api/buffer.html - // regarding Node's "ucs2" encoding specification. - const bool aligned = (reinterpret_cast(data) % sizeof(*buf) == 0); - if (IsLittleEndian() && !aligned) { - // Make a copy to avoid unaligned accesses in v8::String::NewFromTwoByte(). - // This applies ONLY to little endian platforms, as misalignment will be - // handled by a byte-swapping operation in StringBytes::Encode on - // big endian platforms. - uint16_t* copy = new uint16_t[length]; - for (size_t i = 0, k = 0; i < length; i += 1, k += 2) { - // Assumes that the input is little endian. - const uint8_t lo = static_cast(data[k + 0]); - const uint8_t hi = static_cast(data[k + 1]); - copy[i] = lo | hi << 8; - } - buf = copy; - release = true; - } else { - buf = reinterpret_cast(data); - } - - Local error; - MaybeLocal ret = - StringBytes::Encode(isolate, - buf, - length, - &error); - - if (release) - delete[] buf; - - if (ret.IsEmpty()) { - CHECK(!error.IsEmpty()); - isolate->ThrowException(error); - return; - } - args.GetReturnValue().Set(ret.ToLocalChecked()); -} - - // bytesCopied = copy(buffer, target[, targetStart][, sourceStart][, sourceEnd]) void Copy(const FunctionCallbackInfo &args) { Environment* env = Environment::GetCurrent(args); diff --git a/src/string_bytes.cc b/src/string_bytes.cc index b7f009fe1fcc16..f5e30dbed104ef 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -619,7 +619,6 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, size_t buflen, enum encoding encoding, Local* error) { - CHECK_NE(encoding, UCS2); CHECK_BUFLEN_IN_RANGE(buflen); if (!buflen && encoding != BUFFER) { @@ -697,6 +696,32 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, return ExternOneByteString::New(isolate, dst, dlen, error); } + case UCS2: { + if (IsBigEndian()) { + // For BE we always need to swap the byte order. + uint16_t* dst = node::UncheckedMalloc(buflen / 2); + if (dst == nullptr) { + *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate); + return MaybeLocal(); + } + for (size_t i = 0, k = 0; k < buflen / 2; i += 2, k += 1) { + const uint8_t hi = static_cast(buf[i + 0]); + const uint8_t lo = static_cast(buf[i + 1]); + dst[k] = static_cast(hi) << 8 | lo; + } + return ExternTwoByteString::New(isolate, dst, buflen, error); + } + if (reinterpret_cast(buf) % 2 != 0) { + // Unaligned data still means we can't directly pass it to V8. + char* dst = node::UncheckedMalloc(buflen); + memcpy(dst, buf, buflen); + return ExternTwoByteString::New( + isolate, reinterpret_cast(dst), buflen / 2, error); + } + return ExternTwoByteString::NewFromCopy( + isolate, reinterpret_cast(buf), buflen / 2, error); + } + default: CHECK(0 && "unknown encoding"); break; @@ -736,30 +761,7 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, enum encoding encoding, Local* error) { const size_t len = strlen(buf); - MaybeLocal ret; - if (encoding == UCS2) { - // In Node, UCS2 means utf16le. The data must be in little-endian - // order and must be aligned on 2-bytes. This returns an empty - // value if it's not aligned and ensures the appropriate byte order - // on big endian architectures. - const bool be = IsBigEndian(); - if (len % 2 != 0) - return ret; - std::vector vec(len / 2); - for (size_t i = 0, k = 0; i < len; i += 2, k += 1) { - const uint8_t hi = static_cast(buf[i + 0]); - const uint8_t lo = static_cast(buf[i + 1]); - vec[k] = be ? - static_cast(hi) << 8 | lo - : static_cast(lo) << 8 | hi; - } - ret = vec.empty() ? - static_cast< Local >(String::Empty(isolate)) - : StringBytes::Encode(isolate, &vec[0], vec.size(), error); - } else { - ret = StringBytes::Encode(isolate, buf, len, encoding, error); - } - return ret; + return Encode(isolate, buf, len, encoding, error); } } // namespace node diff --git a/src/string_bytes.h b/src/string_bytes.h index 5f5fcd9fa0f05e..726f99907e62bc 100644 --- a/src/string_bytes.h +++ b/src/string_bytes.h @@ -93,7 +93,7 @@ class StringBytes { int* chars_written = nullptr); // Take the bytes in the src, and turn it into a Buffer or String. - // Don't call with encoding=UCS2. + // If the encoding is UTF16-LE, input is considered to be in host endianness. static v8::MaybeLocal Encode(v8::Isolate* isolate, const char* buf, size_t buflen, diff --git a/src/string_decoder.cc b/src/string_decoder.cc index b75169ff00cafb..fa8201faff2944 100644 --- a/src/string_decoder.cc +++ b/src/string_decoder.cc @@ -30,16 +30,6 @@ MaybeLocal MakeString(Isolate* isolate, data, v8::NewStringType::kNormal, length); - } else if (encoding == UCS2) { -#ifdef DEBUG - CHECK_EQ(reinterpret_cast(data) % 2, 0); - CHECK_EQ(length % 2, 0); -#endif - ret = StringBytes::Encode( - isolate, - reinterpret_cast(data), - length / 2, - &error); } else { ret = StringBytes::Encode( isolate, diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index 5571aaeeb78dc2..6e4f4b121d20d7 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -143,6 +143,14 @@ decoder = new StringDecoder('utf16le'); assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d'); assert.strictEqual(decoder.end(), ''); +// Regression test for https://github.com/nodejs/node/issues/22358 +// (unaligned UTF-16 access). +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.alloc(1)), ''); +assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10)); +assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24)); +assert.strictEqual(decoder.end(), ''); + common.expectsError( () => new StringDecoder(1), { From 8471226464554664e5a2a7b72007ec05217b7963 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Fri, 31 Aug 2018 17:30:55 +0200 Subject: [PATCH 2/2] fixup! src: allow UTF-16 in generic StringBytes decode call --- src/string_bytes.cc | 13 +++++++++---- src/string_bytes.h | 1 - 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index f5e30dbed104ef..4b06177ac7f76d 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -698,22 +698,27 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, case UCS2: { if (IsBigEndian()) { - // For BE we always need to swap the byte order. uint16_t* dst = node::UncheckedMalloc(buflen / 2); if (dst == nullptr) { *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate); return MaybeLocal(); } for (size_t i = 0, k = 0; k < buflen / 2; i += 2, k += 1) { - const uint8_t hi = static_cast(buf[i + 0]); - const uint8_t lo = static_cast(buf[i + 1]); + // The input is in *little endian*, because that's what Node.js + // expects, so the high byte comes after the low byte. + const uint8_t hi = static_cast(buf[i + 1]); + const uint8_t lo = static_cast(buf[i + 0]); dst[k] = static_cast(hi) << 8 | lo; } - return ExternTwoByteString::New(isolate, dst, buflen, error); + return ExternTwoByteString::New(isolate, dst, buflen / 2, error); } if (reinterpret_cast(buf) % 2 != 0) { // Unaligned data still means we can't directly pass it to V8. char* dst = node::UncheckedMalloc(buflen); + if (dst == nullptr) { + *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate); + return MaybeLocal(); + } memcpy(dst, buf, buflen); return ExternTwoByteString::New( isolate, reinterpret_cast(dst), buflen / 2, error); diff --git a/src/string_bytes.h b/src/string_bytes.h index 726f99907e62bc..8652a0a29fb06c 100644 --- a/src/string_bytes.h +++ b/src/string_bytes.h @@ -93,7 +93,6 @@ class StringBytes { int* chars_written = nullptr); // Take the bytes in the src, and turn it into a Buffer or String. - // If the encoding is UTF16-LE, input is considered to be in host endianness. static v8::MaybeLocal Encode(v8::Isolate* isolate, const char* buf, size_t buflen,