diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js index e054ca4dd5202d..61f48f3395fba7 100644 --- a/lib/internal/encoding.js +++ b/lib/internal/encoding.js @@ -28,7 +28,7 @@ const kEncoding = Symbol('encoding'); const kDecoder = Symbol('decoder'); const kFatal = Symbol('kFatal'); const kUTF8FastPath = Symbol('kUTF8FastPath'); -const kLatin1FastPath = Symbol('kLatin1FastPath'); +const kWindows1252FastPath = Symbol('kWindows1252FastPath'); const kIgnoreBOM = Symbol('kIgnoreBOM'); const { @@ -55,7 +55,7 @@ const { encodeIntoResults, encodeUtf8String, decodeUTF8, - decodeLatin1, + decodeWindows1252, } = binding; const { Buffer } = require('buffer'); @@ -420,10 +420,10 @@ function makeTextDecoderICU() { this[kFatal] = Boolean(options?.fatal); // Only support fast path for UTF-8. this[kUTF8FastPath] = enc === 'utf-8'; - this[kLatin1FastPath] = enc === 'windows-1252'; + this[kWindows1252FastPath] = enc === 'windows-1252'; this[kHandle] = undefined; - if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) { + if (!this[kUTF8FastPath] && !this[kWindows1252FastPath]) { this.#prepareConverter(); } } @@ -440,14 +440,14 @@ function makeTextDecoderICU() { validateDecoder(this); this[kUTF8FastPath] &&= !(options?.stream); - this[kLatin1FastPath] &&= !(options?.stream); + this[kWindows1252FastPath] &&= !(options?.stream); if (this[kUTF8FastPath]) { return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]); } - if (this[kLatin1FastPath]) { - return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]); + if (this[kWindows1252FastPath]) { + return decodeWindows1252(input, this[kIgnoreBOM], this[kFatal]); } this.#prepareConverter(); diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 877ae8a18f6b8f..197d7a6522d383 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -221,7 +221,8 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data, SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8); 
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII); SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode); - SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1); + SetMethodNoSideEffect( + isolate, target, "decodeWindows1252", DecodeWindows1252); } void BindingData::CreatePerContextProperties(Local target, @@ -239,10 +240,10 @@ void BindingData::RegisterTimerExternalReferences( registry->Register(DecodeUTF8); registry->Register(ToASCII); registry->Register(ToUnicode); - registry->Register(DecodeLatin1); + registry->Register(DecodeWindows1252); } -void BindingData::DecodeLatin1(const FunctionCallbackInfo& args) { +void BindingData::DecodeWindows1252(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); CHECK_GE(args.Length(), 1); @@ -255,7 +256,6 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo& args) { } bool ignore_bom = args[1]->IsTrue(); - bool has_fatal = args[2]->IsTrue(); ArrayBufferViewContents buffer(args[0]); const uint8_t* data = buffer.data(); @@ -270,20 +270,45 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo& args) { return args.GetReturnValue().SetEmptyString(); } - std::string result(length * 2, '\0'); - - size_t written = simdutf::convert_latin1_to_utf8( - reinterpret_cast(data), length, result.data()); + // Windows-1252 specific mapping for bytes 128-159 + // These differ from Latin-1/ISO-8859-1 + static const uint16_t windows1252_mapping[32] = { + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F + }; + + std::string result; + result.reserve(length * 3); // Reserve space for UTF-8 output + + for (size_t i = 0; i < length; i++) { + uint8_t byte = data[i]; + uint32_t codepoint; + + // Check if byte is in the special 
Windows-1252 range (128-159) + if (byte >= 0x80 && byte <= 0x9F) { + codepoint = windows1252_mapping[byte - 0x80]; + } else { + // For all other bytes, Windows-1252 is identical to Latin-1 + codepoint = byte; + } - if (has_fatal && written == 0) { - return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA( - env->isolate(), "The encoded data was not valid for encoding latin1"); + // Convert codepoint to UTF-8 + if (codepoint < 0x80) { + result.push_back(static_cast(codepoint)); + } else if (codepoint < 0x800) { + result.push_back(static_cast(0xC0 | (codepoint >> 6))); + result.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else { + result.push_back(static_cast(0xE0 | (codepoint >> 12))); + result.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } } - std::string_view view(result.c_str(), written); - Local ret; - if (ToV8Value(env->context(), view, env->isolate()).ToLocal(&ret)) { + if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) { args.GetReturnValue().Set(ret); } } diff --git a/src/encoding_binding.h b/src/encoding_binding.h index 97f55394d27641..8393702cce855f 100644 --- a/src/encoding_binding.h +++ b/src/encoding_binding.h @@ -31,7 +31,8 @@ class BindingData : public SnapshotableObject { static void EncodeInto(const v8::FunctionCallbackInfo& args); static void EncodeUtf8String(const v8::FunctionCallbackInfo& args); static void DecodeUTF8(const v8::FunctionCallbackInfo& args); - static void DecodeLatin1(const v8::FunctionCallbackInfo& args); + static void DecodeWindows1252( + const v8::FunctionCallbackInfo& args); static void ToASCII(const v8::FunctionCallbackInfo& args); static void ToUnicode(const v8::FunctionCallbackInfo& args); diff --git a/test/parallel/test-internal-encoding-binding.js b/test/parallel/test-internal-encoding-binding.js index b7483bf1d22820..7d5397d213c205 100644 --- a/test/parallel/test-internal-encoding-binding.js +++ 
b/test/parallel/test-internal-encoding-binding.js @@ -8,41 +8,46 @@ const assert = require('node:assert'); const { internalBinding } = require('internal/test/binding'); const binding = internalBinding('encoding_binding'); +// Windows-1252 specific tests { - // Valid input - const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]); - assert.strictEqual(binding.decodeLatin1(buf, false, false), 'Áéó'); + // Test Windows-1252 special characters in 128-159 range + // These differ from Latin-1 + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€'); + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚'); + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ'); + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ'); } { - // Empty input - const buf = Uint8Array.from([]); - assert.strictEqual(binding.decodeLatin1(buf, false, false), ''); + // Test Windows-1252 characters outside 128-159 range (same as Latin-1) + const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]); + assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó'); } { - // Invalid input, but Latin1 has no invalid chars and should never throw. 
- const buf = new TextEncoder().encode('Invalid Latin1 🧑‍🧑‍🧒‍🧒'); - assert.strictEqual( - binding.decodeLatin1(buf, false, false), - 'Invalid Latin1 ð\x9F§\x91â\x80\x8Dð\x9F§\x91â\x80\x8Dð\x9F§\x92â\x80\x8Dð\x9F§\x92' - ); + // Empty input + const buf = Uint8Array.from([]); + assert.strictEqual(binding.decodeWindows1252(buf, false, false), ''); } +// Windows-1252 specific tests { - // IgnoreBOM with BOM - const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]); - assert.strictEqual(binding.decodeLatin1(buf, true, false), 'þÿÁéó'); + // Test Windows-1252 special characters in 128-159 range + // These differ from Latin-1 + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€'); + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚'); + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ'); + assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ'); } { - // Fatal and InvalidInput, but Latin1 has no invalid chars and should never throw. - const buf = Uint8Array.from([0xFF, 0xFF, 0xFF]); - assert.strictEqual(binding.decodeLatin1(buf, false, true), 'ÿÿÿ'); + // Test Windows-1252 characters outside 128-159 range (same as Latin-1) + const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]); + assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó'); } { - // IgnoreBOM and Fatal, but Latin1 has no invalid chars and should never throw. 
- const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]); - assert.strictEqual(binding.decodeLatin1(buf, true, true), 'þÿÁéó'); + // Empty input + const buf = Uint8Array.from([]); + assert.strictEqual(binding.decodeWindows1252(buf, false, false), ''); } diff --git a/test/parallel/test-util-text-decoder.js b/test/parallel/test-util-text-decoder.js index 0f6d0463f9da48..adf52241c391cd 100644 --- a/test/parallel/test-util-text-decoder.js +++ b/test/parallel/test-util-text-decoder.js @@ -15,3 +15,49 @@ test('TextDecoder correctly decodes windows-1252 encoded data', { skip: !common. assert.strictEqual(decodedString, expectedString); }); + +// Test for the difference between Latin1 and Windows-1252 in the 128-159 +// range +// Ref: https://github.com/nodejs/node/issues/60888 +test('TextDecoder correctly decodes windows-1252 special characters in ' + + '128-159 range', { skip: !common.hasIntl }, () => { + const decoder = new TextDecoder('windows-1252'); + + // Test specific characters that differ between Latin1 and Windows-1252. + // € Euro sign + assert.strictEqual(decoder.decode(Uint8Array.of(128)).codePointAt(0), + 8364); + // ‚ Single low-9 quotation mark + assert.strictEqual(decoder.decode(Uint8Array.of(130)).codePointAt(0), + 8218); + // ƒ Latin small letter f with hook + assert.strictEqual(decoder.decode(Uint8Array.of(131)).codePointAt(0), + 402); + // Ÿ Latin capital letter Y with diaeresis + assert.strictEqual(decoder.decode(Uint8Array.of(159)).codePointAt(0), + 376); + + // Test the full range to ensure no character is treated as Latin1 + // directly. 
+ const expectedMappings = [ + [128, 8364], [129, 129], [130, 8218], [131, 402], [132, 8222], + [133, 8230], [134, 8224], [135, 8225], [136, 710], [137, 8240], + [138, 352], [139, 8249], [140, 338], [141, 141], [142, 381], + [143, 143], [144, 144], [145, 8216], [146, 8217], [147, 8220], + [148, 8221], [149, 8226], [150, 8211], [151, 8212], [152, 732], + [153, 8482], [154, 353], [155, 8250], [156, 339], [157, 157], + [158, 382], [159, 376], + ]; + + for (const [byte, expectedCodePoint] of expectedMappings) { + const result = decoder.decode(Uint8Array.of(byte)); + const actualCodePoint = result.codePointAt(0); + assert.strictEqual( + actualCodePoint, + expectedCodePoint, + `Byte 0x${byte.toString(16)} should decode to ` + + `U+${expectedCodePoint.toString(16)} but got ` + + `U+${actualCodePoint.toString(16)}` + ); + } +}); diff --git a/typings/internalBinding/encoding_binding.d.ts b/typings/internalBinding/encoding_binding.d.ts index 6e1d48efd81529..6833c9ac0557b1 100644 --- a/typings/internalBinding/encoding_binding.d.ts +++ b/typings/internalBinding/encoding_binding.d.ts @@ -4,5 +4,5 @@ export interface EncodingBinding { decodeUTF8(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string; toASCII(input: string): string; toUnicode(input: string): string; - decodeLatin1(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string; + decodeWindows1252(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string; }