From 829e99beff84a400cd005dbe22f3cf4be01e9198 Mon Sep 17 00:00:00 2001
From: Anna Henningsen <anna@addaleax.net>
Date: Fri, 31 Aug 2018 16:57:03 +0200
Subject: [PATCH 1/2] src: allow UTF-16 in generic StringBytes decode call

This allows removing a number of special cases in other
parts of the code, at least one of which was incorrect
in expecting aligned input when that was not guaranteed.

Fixes: https://github.com/nodejs/node/issues/22358
---
 src/node.h                           |  1 -
 src/node_buffer.cc                   | 59 ----------------------------
 src/string_bytes.cc                  | 52 ++++++++++++------------
 src/string_bytes.h                   |  2 +-
 src/string_decoder.cc                | 10 -----
 test/parallel/test-string-decoder.js |  8 ++++
 6 files changed, 36 insertions(+), 96 deletions(-)
diff --git a/src/node.h b/src/node.h
index 74403a6e48d3b8..1a844a6c7cb9cc 100644
--- a/src/node.h
+++ b/src/node.h
@@ -407,7 +407,6 @@ NODE_DEPRECATED("Use FatalException(isolate, ...)",
   return FatalException(v8::Isolate::GetCurrent(), try_catch);
 })
 
-// Don't call with encoding=UCS2.
 NODE_EXTERN v8::Local<v8::Value> Encode(v8::Isolate* isolate,
                                         const char* buf,
                                         size_t len,
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index 9a280f7c127cc6..816cbf73605c78 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -464,65 +464,6 @@ void StringSlice(const FunctionCallbackInfo<Value>& args) {
 }
 
 
-template <>
-void StringSlice<UCS2>(const FunctionCallbackInfo<Value>& args) {
-  Isolate* isolate = args.GetIsolate();
-  Environment* env = Environment::GetCurrent(isolate);
-
-  THROW_AND_RETURN_UNLESS_BUFFER(env, args.This());
-  SPREAD_BUFFER_ARG(args.This(), ts_obj);
-
-  if (ts_obj_length == 0)
-    return args.GetReturnValue().SetEmptyString();
-
-  SLICE_START_END(args[0], args[1], ts_obj_length)
-  length /= 2;
-
-  const char* data = ts_obj_data + start;
-  const uint16_t* buf;
-  bool release = false;
-
-  // Node's "ucs2" encoding expects LE character data inside a Buffer, so we
-  // need to reorder on BE platforms.  See https://nodejs.org/api/buffer.html
-  // regarding Node's "ucs2" encoding specification.
-  const bool aligned = (reinterpret_cast<uintptr_t>(data) % sizeof(*buf) == 0);
-  if (IsLittleEndian() && !aligned) {
-    // Make a copy to avoid unaligned accesses in v8::String::NewFromTwoByte().
-    // This applies ONLY to little endian platforms, as misalignment will be
-    // handled by a byte-swapping operation in StringBytes::Encode on
-    // big endian platforms.
-    uint16_t* copy = new uint16_t[length];
-    for (size_t i = 0, k = 0; i < length; i += 1, k += 2) {
-      // Assumes that the input is little endian.
-      const uint8_t lo = static_cast<uint8_t>(data[k + 0]);
-      const uint8_t hi = static_cast<uint8_t>(data[k + 1]);
-      copy[i] = lo | hi << 8;
-    }
-    buf = copy;
-    release = true;
-  } else {
-    buf = reinterpret_cast<const uint16_t*>(data);
-  }
-
-  Local<Value> error;
-  MaybeLocal<Value> ret =
-      StringBytes::Encode(isolate,
-                          buf,
-                          length,
-                          &error);
-
-  if (release)
-    delete[] buf;
-
-  if (ret.IsEmpty()) {
-    CHECK(!error.IsEmpty());
-    isolate->ThrowException(error);
-    return;
-  }
-  args.GetReturnValue().Set(ret.ToLocalChecked());
-}
-
-
 // bytesCopied = copy(buffer, target[, targetStart][, sourceStart][, sourceEnd])
 void Copy(const FunctionCallbackInfo<Value> &args) {
   Environment* env = Environment::GetCurrent(args);
diff --git a/src/string_bytes.cc b/src/string_bytes.cc
index b7f009fe1fcc16..f5e30dbed104ef 100644
--- a/src/string_bytes.cc
+++ b/src/string_bytes.cc
@@ -619,7 +619,6 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
                                       size_t buflen,
                                       enum encoding encoding,
                                       Local<Value>* error) {
-  CHECK_NE(encoding, UCS2);
   CHECK_BUFLEN_IN_RANGE(buflen);
 
   if (!buflen && encoding != BUFFER) {
@@ -697,6 +696,32 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
       return ExternOneByteString::New(isolate, dst, dlen, error);
     }
 
+    case UCS2: {
+      if (IsBigEndian()) {
+        // For BE we always need to swap the byte order.
+        uint16_t* dst = node::UncheckedMalloc<uint16_t>(buflen / 2);
+        if (dst == nullptr) {
+          *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate);
+          return MaybeLocal<Value>();
+        }
+        for (size_t i = 0, k = 0; k < buflen / 2; i += 2, k += 1) {
+          const uint8_t hi = static_cast<uint8_t>(buf[i + 0]);
+          const uint8_t lo = static_cast<uint8_t>(buf[i + 1]);
+          dst[k] = static_cast<uint16_t>(hi) << 8 | lo;
+        }
+        return ExternTwoByteString::New(isolate, dst, buflen, error);
+      }
+      if (reinterpret_cast<uintptr_t>(buf) % 2 != 0) {
+        // Unaligned data still means we can't directly pass it to V8.
+        char* dst = node::UncheckedMalloc(buflen);
+        memcpy(dst, buf, buflen);
+        return ExternTwoByteString::New(
+            isolate, reinterpret_cast<uint16_t*>(dst), buflen / 2, error);
+      }
+      return ExternTwoByteString::NewFromCopy(
+          isolate, reinterpret_cast<const uint16_t*>(buf), buflen / 2, error);
+    }
+
     default:
       CHECK(0 && "unknown encoding");
       break;
@@ -736,30 +761,7 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
                                       enum encoding encoding,
                                       Local<Value>* error) {
   const size_t len = strlen(buf);
-  MaybeLocal<Value> ret;
-  if (encoding == UCS2) {
-    // In Node, UCS2 means utf16le. The data must be in little-endian
-    // order and must be aligned on 2-bytes. This returns an empty
-    // value if it's not aligned and ensures the appropriate byte order
-    // on big endian architectures.
-    const bool be = IsBigEndian();
-    if (len % 2 != 0)
-      return ret;
-    std::vector<uint16_t> vec(len / 2);
-    for (size_t i = 0, k = 0; i < len; i += 2, k += 1) {
-      const uint8_t hi = static_cast<uint8_t>(buf[i + 0]);
-      const uint8_t lo = static_cast<uint8_t>(buf[i + 1]);
-      vec[k] = be ?
-          static_cast<uint16_t>(hi) << 8 | lo
-          : static_cast<uint16_t>(lo) << 8 | hi;
-    }
-    ret = vec.empty() ?
-        static_cast< Local<Value> >(String::Empty(isolate))
-        : StringBytes::Encode(isolate, &vec[0], vec.size(), error);
-  } else {
-    ret = StringBytes::Encode(isolate, buf, len, encoding, error);
-  }
-  return ret;
+  return Encode(isolate, buf, len, encoding, error);
 }
 
 }  // namespace node
diff --git a/src/string_bytes.h b/src/string_bytes.h
index 5f5fcd9fa0f05e..726f99907e62bc 100644
--- a/src/string_bytes.h
+++ b/src/string_bytes.h
@@ -93,7 +93,7 @@ class StringBytes {
                       int* chars_written = nullptr);
 
   // Take the bytes in the src, and turn it into a Buffer or String.
-  // Don't call with encoding=UCS2.
+  // If the encoding is UTF16-LE, input is considered to be in host endianness.
   static v8::MaybeLocal<v8::Value> Encode(v8::Isolate* isolate,
                                           const char* buf,
                                           size_t buflen,
diff --git a/src/string_decoder.cc b/src/string_decoder.cc
index b75169ff00cafb..fa8201faff2944 100644
--- a/src/string_decoder.cc
+++ b/src/string_decoder.cc
@@ -30,16 +30,6 @@ MaybeLocal<String> MakeString(Isolate* isolate,
         data,
         v8::NewStringType::kNormal,
         length);
-  } else if (encoding == UCS2) {
-#ifdef DEBUG
-    CHECK_EQ(reinterpret_cast<uintptr_t>(data) % 2, 0);
-    CHECK_EQ(length % 2, 0);
-#endif
-    ret = StringBytes::Encode(
-        isolate,
-        reinterpret_cast<const uint16_t*>(data),
-        length / 2,
-        &error);
   } else {
     ret = StringBytes::Encode(
         isolate,
diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
index 5571aaeeb78dc2..6e4f4b121d20d7 100644
--- a/test/parallel/test-string-decoder.js
+++ b/test/parallel/test-string-decoder.js
@@ -143,6 +143,14 @@ decoder = new StringDecoder('utf16le');
 assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d');
 assert.strictEqual(decoder.end(), '');
 
+// Regression test for https://github.com/nodejs/node/issues/22358
+// (unaligned UTF-16 access).
+decoder = new StringDecoder('utf16le');
+assert.strictEqual(decoder.write(Buffer.alloc(1)), '');
+assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10));
+assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24));
+assert.strictEqual(decoder.end(), '');
+
 common.expectsError(
   () => new StringDecoder(1),
   {

From 8471226464554664e5a2a7b72007ec05217b7963 Mon Sep 17 00:00:00 2001
From: Anna Henningsen <anna@addaleax.net>
Date: Fri, 31 Aug 2018 17:30:55 +0200
Subject: [PATCH 2/2] fixup! src: allow UTF-16 in generic StringBytes decode
 call

---
 src/string_bytes.cc | 13 +++++++++----
 src/string_bytes.h  |  1 -
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/string_bytes.cc b/src/string_bytes.cc
index f5e30dbed104ef..4b06177ac7f76d 100644
--- a/src/string_bytes.cc
+++ b/src/string_bytes.cc
@@ -698,22 +698,27 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
 
     case UCS2: {
       if (IsBigEndian()) {
-        // For BE we always need to swap the byte order.
         uint16_t* dst = node::UncheckedMalloc<uint16_t>(buflen / 2);
         if (dst == nullptr) {
           *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate);
           return MaybeLocal<Value>();
         }
         for (size_t i = 0, k = 0; k < buflen / 2; i += 2, k += 1) {
-          const uint8_t hi = static_cast<uint8_t>(buf[i + 0]);
-          const uint8_t lo = static_cast<uint8_t>(buf[i + 1]);
+          // The input is in *little endian*, because that's what Node.js
+          // expects, so the high byte comes after the low byte.
+          const uint8_t hi = static_cast<uint8_t>(buf[i + 1]);
+          const uint8_t lo = static_cast<uint8_t>(buf[i + 0]);
           dst[k] = static_cast<uint16_t>(hi) << 8 | lo;
         }
-        return ExternTwoByteString::New(isolate, dst, buflen, error);
+        return ExternTwoByteString::New(isolate, dst, buflen / 2, error);
       }
       if (reinterpret_cast<uintptr_t>(buf) % 2 != 0) {
         // Unaligned data still means we can't directly pass it to V8.
         char* dst = node::UncheckedMalloc(buflen);
+        if (dst == nullptr) {
+          *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate);
+          return MaybeLocal<Value>();
+        }
         memcpy(dst, buf, buflen);
         return ExternTwoByteString::New(
             isolate, reinterpret_cast<uint16_t*>(dst), buflen / 2, error);
diff --git a/src/string_bytes.h b/src/string_bytes.h
index 726f99907e62bc..8652a0a29fb06c 100644
--- a/src/string_bytes.h
+++ b/src/string_bytes.h
@@ -93,7 +93,6 @@ class StringBytes {
                       int* chars_written = nullptr);
 
   // Take the bytes in the src, and turn it into a Buffer or String.
-  // If the encoding is UTF16-LE, input is considered to be in host endianness.
   static v8::MaybeLocal<v8::Value> Encode(v8::Isolate* isolate,
                                           const char* buf,
                                           size_t buflen,