Skip to content
Merged
13 changes: 11 additions & 2 deletions lib/internal/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const kDecoder = Symbol('decoder');
const kEncoder = Symbol('encoder');
const kFatal = Symbol('kFatal');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kLatin1FastPath = Symbol('kLatin1FastPath');
const kIgnoreBOM = Symbol('kIgnoreBOM');

const {
Expand All @@ -55,6 +56,7 @@ const {
encodeIntoResults,
encodeUtf8String,
decodeUTF8,
decodeLatin1,
} = binding;

const { Buffer } = require('buffer');
Expand Down Expand Up @@ -419,10 +421,13 @@ function makeTextDecoderICU() {
this[kFatal] = Boolean(options?.fatal);
// Only support fast path for UTF-8.
this[kUTF8FastPath] = enc === 'utf-8';
this[kLatin1FastPath] = enc === 'windows-1252';
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line specifically uses the wrong assumption that Latin1 and windows-1252 are the same encoding

They aren't

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm now seeing that the MDN API docs say latin1 is the same as windows-1252: https://developer.mozilla.org/en-US/docs/Web/API/Encoding_API/Encodings

Copy link
Member Author

@mertcanaltin mertcanaltin Nov 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was following the WHATWG Encoding Standard, which
explicitly states that in web-compatible software, latin1, iso-8859-1, and ascii
are all just labels for windows-1252

from the spec: "these are synonyms: latin1 and ascii are just labels for
windows-1252, and any software following this standard will, for example, decode 0x80
as U+20AC (€)"

however, my implementation was still flawed - it didn't properly handle the 0x80-0x9F
range mapping. The fast path needs a lookup table for those bytes to correctly map them
to their windows-1252 code points (€, smart quotes, etc)

so while the WHATWG standard does treat them as equivalent, my implementation failed to
properly decode them. I support the revert, and if there's interest in a performance
optimization here, it would need to properly implement the windows-1252 mapping table

Copy link
Member

@ChALkeR ChALkeR Nov 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WHATWG mapping Latin1 to windows-1252 doesn't mean that Latin1 and windows-1252 are equivalent.

windows-1252 decoder is compatible with decoding Latin1 bytes, this is why the standard can use that mapping, the same way it can alias ascii to windows-1252

There is no Latin1 (or ascii) in TextDecoder, new TextDecoder('Latin1') creates a windows-1252 decoder which is capable of decoding Latin1, as it's a subset of windows-1252.

new TextDecoder('ascii') also returns a windows-1252 decoder by spec, which is reasonable as ascii is also a subset of windows-1252 and that decoder can well decode ascii bytes.

But we can't implement windows-1252 with Latin1 (or with ascii-only).

We can't even make new TextDecoder('Latin1') / new TextDecoder('ascii') return a Latin1-only (or ascii-only) decoder, as that would break compat.

It should fully support windows-1252

See also our own doc:

node/doc/api/buffer.md

Lines 229 to 234 in 7643c2a

Modern Web browsers follow the [WHATWG Encoding Standard][] which aliases
both `'latin1'` and `'ISO-8859-1'` to `'win-1252'`. This means that while doing
something like `http.get()`, if the returned charset is one of those listed in
the WHATWG specification it is possible that the server actually returned
`'win-1252'`-encoded data, and using `'latin1'` encoding may incorrectly decode
the characters.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for detail!

this[kHandle] = undefined;

if (!this[kUTF8FastPath]) {
this.#prepareConverter();
if (this[kUTF8FastPath]) {
decodeUTF8(this.input, this[kIgnoreBOM], this[kFatal]);
} else if (this[kLatin1FastPath]) {
decodeLatin1(this.input);
}
}

Expand All @@ -443,6 +448,10 @@ function makeTextDecoderICU() {
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
}

if (this[kLatin1FastPath]) {
return decodeLatin1(input);
}

this.#prepareConverter();

validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);
Expand Down
40 changes: 40 additions & 0 deletions src/encoding_binding.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "encoding_binding.h"
#include "ada.h"
#include "env-inl.h"
#include "node_buffer.h"
#include "node_errors.h"
#include "node_external_reference.h"
#include "simdutf.h"
Expand Down Expand Up @@ -226,6 +227,7 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
}

void BindingData::CreatePerContextProperties(Local<Object> target,
Expand All @@ -243,6 +245,44 @@ void BindingData::RegisterTimerExternalReferences(
registry->Register(DecodeUTF8);
registry->Register(ToASCII);
registry->Register(ToUnicode);
registry->Register(DecodeLatin1);
}

void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);

CHECK_GE(args.Length(), 1);
if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
args[0]->IsArrayBufferView())) {
return node::THROW_ERR_INVALID_ARG_TYPE(
env->isolate(),
"The \"input\" argument must be an instance of ArrayBuffer, "
"SharedArrayBuffer, or ArrayBufferView.");
}

ArrayBufferViewContents<uint8_t> buffer(args[0]);
const uint8_t* data = buffer.data();
size_t length = buffer.length();

if (length == 0) {
return args.GetReturnValue().SetEmptyString();
}

std::string result(length * 2, '\0');

size_t written = simdutf::convert_latin1_to_utf8(
reinterpret_cast<const char*>(data), length, &result[0]);

if (written == 0) {
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
env->isolate(), "The encoded data was not valid for encoding latin1");
}

result.resize(written);

Local<Object> buffer_result =
node::Buffer::Copy(env, result.c_str(), result.length()).ToLocalChecked();
args.GetReturnValue().Set(buffer_result);
}

} // namespace encoding_binding
Expand Down
1 change: 1 addition & 0 deletions src/encoding_binding.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class BindingData : public SnapshotableObject {
static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);

static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);
Expand Down
75 changes: 75 additions & 0 deletions test/cctest/test_encoding_binding.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include "encoding_binding.h"
#include "env-inl.h"
#include "gtest/gtest.h"
#include "node_test_fixture.h"
#include "v8.h"

namespace node {
namespace encoding_binding {

// Test helper: invokes BindingData::DecodeLatin1 the way JavaScript would,
// by wrapping it in a v8::Function and calling it with one argument.
//
// `args` must point to at least one value; only args[0] is passed.
// Returns true and stores the call's return value in `*result` when the
// binding completes without throwing. Returns false (leaving `*result`
// untouched) when the function could not be created or the call threw.
//
// v8::FunctionCallbackInfo has no public constructor, so the callback cannot
// be invoked directly with a hand-built argument object.
bool RunDecodeLatin1(Environment* env,
                     Local<Value> args[],
                     Local<Value>* result) {
  Isolate* isolate = env->isolate();
  v8::Local<v8::Context> context = env->context();
  TryCatch try_catch(isolate);

  v8::Local<v8::Function> decode_fn;
  if (!v8::Function::New(context, BindingData::DecodeLatin1)
           .ToLocal(&decode_fn)) {
    return false;
  }

  v8::MaybeLocal<Value> returned =
      decode_fn->Call(context, v8::Undefined(isolate), 1, args);

  if (try_catch.HasCaught()) {
    return false;
  }

  // Hand back the decoded value itself; the pending exception is empty on
  // the success path and would tell the caller nothing.
  return returned.ToLocal(result);
}

// Fixture for the encoding binding tests below; adds nothing on top of
// NodeTestFixture.
class EncodingBindingTest : public NodeTestFixture {};

// Passes the bytes 0xC1 0xE9 0xF3 in a Uint8Array through RunDecodeLatin1 and
// expects the result to stringify to "Áéó".
TEST_F(EncodingBindingTest, DecodeLatin1_ValidInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  // Copy the fixture bytes into a freshly allocated ArrayBuffer.
  const uint8_t input_bytes[] = {0xC1, 0xE9, 0xF3};
  Local<ArrayBuffer> backing = ArrayBuffer::New(isolate, sizeof(input_bytes));
  memcpy(backing->GetBackingStore()->Data(), input_bytes, sizeof(input_bytes));

  Local<Uint8Array> view = Uint8Array::New(backing, 0, sizeof(input_bytes));
  Local<Value> args[] = {view};

  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(env, args, &decoded));

  String::Utf8Value decoded_utf8(isolate, decoded);
  EXPECT_STREQ(*decoded_utf8, "Áéó");
}

// Passes a zero-length Uint8Array through RunDecodeLatin1 and expects the
// result to stringify to the empty string.
TEST_F(EncodingBindingTest, DecodeLatin1_EmptyInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  Local<ArrayBuffer> empty_backing = ArrayBuffer::New(isolate, 0);
  Local<Uint8Array> empty_view = Uint8Array::New(empty_backing, 0, 0);
  Local<Value> args[] = {empty_view};

  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(env, args, &decoded));

  String::Utf8Value decoded_utf8(isolate, decoded);
  EXPECT_STREQ(*decoded_utf8, "");
}

// Passes a JavaScript string (not a buffer type) through RunDecodeLatin1 and
// expects the helper to report failure.
TEST_F(EncodingBindingTest, DecodeLatin1_InvalidInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  Local<Value> not_a_buffer =
      String::NewFromUtf8Literal(isolate, "Invalid input");
  Local<Value> args[] = {not_a_buffer};

  Local<Value> ignored;
  EXPECT_FALSE(RunDecodeLatin1(env, args, &ignored));
}

} // namespace encoding_binding
} // namespace node