diff --git a/crates/oxc_codegen/src/lib.rs b/crates/oxc_codegen/src/lib.rs index acb413068bff7..e44f332956879 100644 --- a/crates/oxc_codegen/src/lib.rs +++ b/crates/oxc_codegen/src/lib.rs @@ -242,6 +242,12 @@ impl<'a> Codegen<'a> { self.code.print_str(s); } + /// Push a single `char` (`ch`) into the buffer. + #[inline] + pub fn print_char(&mut self, ch: char) { + self.code.print_char(ch); + } + /// Print a single [`Expression`], adding it to the code generator's /// internal buffer. Unlike [`Codegen::build`], this does not consume `self`. #[inline] @@ -578,14 +584,7 @@ impl<'a> Codegen<'a> { fn print_string_literal(&mut self, s: &StringLiteral<'_>, allow_backtick: bool) { self.add_source_mapping(s.span); - if s.lone_surrogates { - self.print_str(s.raw.unwrap().as_str()); - return; - } - self.print_quoted_utf16(s, allow_backtick); - } - fn print_quoted_utf16(&mut self, s: &StringLiteral<'_>, allow_backtick: bool) { let quote = if self.options.minify { let mut single_cost: i32 = 0; let mut double_cost: i32 = 0; @@ -680,6 +679,26 @@ impl<'a> Codegen<'a> { } self.print_ascii_byte(b'$'); } + '\u{FFFD}' if s.lone_surrogates => { + // If `lone_surrogates` is set, the string contains lone surrogates, which are escaped + // using the lossy replacement character (U+FFFD) as an escape marker: each one is stored + // as `\u{FFFD}XXXX`, `XXXX` being 4 hex digits (`\u{FFFD}fffd` means a literal U+FFFD). 
+ let hex1 = chars.next().unwrap(); + let hex2 = chars.next().unwrap(); + let hex3 = chars.next().unwrap(); + let hex4 = chars.next().unwrap(); + if [hex1, hex2, hex3, hex4] == ['f', 'f', 'f', 'd'] { + // Actual lossy replacement character + self.print_char('\u{FFFD}'); + } else { + // Lossy replacement character representing a lone surrogate + self.print_str("\\u"); + self.print_char(hex1); + self.print_char(hex2); + self.print_char(hex3); + self.print_char(hex4); + } + } _ => self.print_str(c.encode_utf8([0; 4].as_mut())), } } diff --git a/crates/oxc_codegen/tests/integration/esbuild.rs b/crates/oxc_codegen/tests/integration/esbuild.rs index 172f0290475be..6e75a376d4939 100644 --- a/crates/oxc_codegen/tests/integration/esbuild.rs +++ b/crates/oxc_codegen/tests/integration/esbuild.rs @@ -363,15 +363,15 @@ fn test_string() { test("let x = '\\U000123AB'", "let x = \"U000123AB\";\n"); test("let x = '\\u{123AB}'", "let x = \"\u{123ab}\";\n"); test("let x = '\\uD808\\uDFAB'", "let x = \"\u{123ab}\";\n"); - test("let x = '\\uD808'", "let x = '\\uD808';\n"); // lone surrogate - test("let x = '\\uD808X'", "let x = '\\uD808X';\n"); - test("let x = '\\uDFAB'", "let x = '\\uDFAB';\n"); - test("let x = '\\uDFABX'", "let x = '\\uDFABX';\n"); + test("let x = '\\uD808'", "let x = \"\\ud808\";\n"); // lone surrogate + test("let x = '\\uD808X'", "let x = \"\\ud808X\";\n"); + test("let x = '\\uDFAB'", "let x = \"\\udfab\";\n"); + test("let x = '\\uDFABX'", "let x = \"\\udfabX\";\n"); test("let x = '\\x80'", "let x = \"\u{80}\";\n"); test("let x = '\\xFF'", "let x = \"ΓΏ\";\n"); test("let x = '\\xF0\\x9F\\x8D\\x95'", "let x = \"Γ°\u{9f}\u{8d}\u{95}\";\n"); - test("let x = '\\uD801\\uDC02\\uDC03\\uD804'", "let x = '\\uD801\\uDC02\\uDC03\\uD804';\n"); // lossy + test("let x = '\\uD801\\uDC02\\uDC03\\uD804'", "let x = \"𐐂\\udc03\\ud804\";\n"); // surrogates } #[test] diff --git a/crates/oxc_codegen/tests/integration/unit.rs b/crates/oxc_codegen/tests/integration/unit.rs index 
567949256d029..d66dfbf3bf81e 100644 --- a/crates/oxc_codegen/tests/integration/unit.rs +++ b/crates/oxc_codegen/tests/integration/unit.rs @@ -143,7 +143,7 @@ fn unicode_escape() { test("console.log('こんにけは');", "console.log(\"こんにけは\");\n"); test("console.log('μ•ˆλ…•ν•˜μ„Έμš”');", "console.log(\"μ•ˆλ…•ν•˜μ„Έμš”\");\n"); test("console.log('πŸ§‘β€πŸ€β€πŸ§‘');", "console.log(\"πŸ§‘β€πŸ€β€πŸ§‘\");\n"); - test("console.log(\"\\uD800\\uD801\")", "console.log(\"\\uD800\\uD801\");\n"); + test("console.log(\"\\uD800\\uD801\")", "console.log(\"\\ud800\\ud801\");\n"); } #[test]