diff --git a/Cargo.lock b/Cargo.lock index 40f0bc8d5fbd7..b6c5a3c50528d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1402,9 +1402,11 @@ dependencies = [ "bitflags 2.6.0", "daachorse", "insta", + "nonmax", "once_cell", "oxc_allocator", "oxc_ast", + "oxc_index", "oxc_mangler", "oxc_parser", "oxc_sourcemap", diff --git a/crates/oxc_codegen/Cargo.toml b/crates/oxc_codegen/Cargo.toml index 0602bd3f91419..8f249d9f3a6e3 100644 --- a/crates/oxc_codegen/Cargo.toml +++ b/crates/oxc_codegen/Cargo.toml @@ -26,8 +26,10 @@ oxc_allocator = { workspace = true } oxc_syntax = { workspace = true, features = ["to_js_string"] } oxc_sourcemap = { workspace = true } oxc_mangler = { workspace = true } +oxc_index = { workspace = true } bitflags = { workspace = true } +nonmax = { workspace = true } once_cell = { workspace = true } daachorse = { workspace = true } rustc-hash = { workspace = true } diff --git a/crates/oxc_codegen/src/sourcemap_builder.rs b/crates/oxc_codegen/src/sourcemap_builder.rs index b2d25640da2ba..84c16afef887b 100644 --- a/crates/oxc_codegen/src/sourcemap_builder.rs +++ b/crates/oxc_codegen/src/sourcemap_builder.rs @@ -1,5 +1,8 @@ use std::sync::Arc; +use nonmax::NonMaxU32; + +use oxc_index::{Idx, IndexVec}; use oxc_span::Span; use oxc_syntax::identifier::{LS, PS}; @@ -9,16 +12,51 @@ const LS_OR_PS_SECOND: u8 = 0x80; const LS_THIRD: u8 = 0xA8; const PS_THIRD: u8 = 0xA9; -/// Line offset table +/// Index into vec of `ColumnOffsets` +#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct ColumnOffsetsId(NonMaxU32); + +impl Idx for ColumnOffsetsId { + #[allow(clippy::cast_possible_truncation)] + fn from_usize(idx: usize) -> Self { + Self(NonMaxU32::new(idx as u32).unwrap()) + } + + fn index(self) -> usize { + self.0.get() as usize + } +} + +/// Line offset tables. /// /// Used for tracking lines and columns from byte offsets via binary search. 
/// /// Code is adapted from [esbuild](https://github.com/evanw/esbuild/blob/cc74e6042a9f573bf58e1e3f165ebda70af4ad3b/internal/js_printer/js_printer.go#L4806-L4808) +/// +/// Most lines of source code will not contain Unicode chars, so optimize storage for this common case. +/// +/// Where a line is entirely ASCII, translating byte offset to UTF-16 column is simple, +/// given the byte offset of start of line. A column lookup table isn't needed for that line. +/// In this case, the line's entry in `column_offsets_ids` is `None`. +/// For rare lines which do contain Unicode chars, we store column offsets in a `ColumnOffsets` which +/// is stored in a separate `IndexVec`. The line's entry in `column_offsets_ids` is the index of that line's `ColumnOffsets`. +/// Storing column offset info which is rarely used in a separate structure keeps the per-line data as small as possible. +/// We also store byte offsets in a separate `Vec` so binary search can go through it as fast as possible. +#[derive(Debug, Default)] +pub struct LineOffsetTables { + // Byte offset of start of each line. Indexed by line number. + byte_offset_to_start_of_lines: Vec<u32>, + // Column offset ID of each line. Indexed by line number. + // `ColumnOffsetsId` is index into `column_offsets` vec. + column_offsets_ids: Vec<Option<ColumnOffsetsId>>, + // Column offsets tables. Indexed by `ColumnOffsetsId`. 
+ column_offsets: IndexVec<ColumnOffsetsId, ColumnOffsets>, +} + #[derive(Debug)] -pub struct LineOffsetTable { - columns: Option<Vec<u32>>, +pub struct ColumnOffsets { byte_offset_to_first: u32, - byte_offset_to_start_of_line: u32, + columns: Box<[u32]>, } #[allow(clippy::struct_field_names)] @@ -27,7 +65,7 @@ pub struct SourcemapBuilder { original_source: Arc<str>, last_generated_update: usize, last_position: Option<u32>, - line_offset_tables: Vec<LineOffsetTable>, + line_offset_tables: LineOffsetTables, sourcemap_builder: oxc_sourcemap::SourceMapBuilder, generated_line: u32, generated_column: u32, @@ -40,7 +78,7 @@ impl Default for SourcemapBuilder { original_source: "".into(), last_generated_update: 0, last_position: None, - line_offset_tables: vec![], + line_offset_tables: LineOffsetTables::default(), sourcemap_builder: oxc_sourcemap::SourceMapBuilder::default(), generated_line: 0, generated_column: 0, @@ -95,19 +133,21 @@ impl SourcemapBuilder { #[allow(clippy::cast_possible_truncation)] fn search_original_line_and_column(&mut self, position: u32) -> (u32, u32) { - let result = self - .line_offset_tables - .partition_point(|table| table.byte_offset_to_start_of_line <= position) - as u32; + let result = self.line_offset_tables.byte_offset_to_start_of_lines.partition_point( + |&byte_offset_to_start_of_line| byte_offset_to_start_of_line <= position, + ); let original_line = if result > 0 { result - 1 } else { 0 }; - let line = &self.line_offset_tables[original_line as usize]; - let mut original_column = position - line.byte_offset_to_start_of_line; - if original_column >= line.byte_offset_to_first { - if let Some(cols) = &line.columns { - original_column = cols[(original_column - line.byte_offset_to_first) as usize]; + let byte_offset_to_start_of_line = + self.line_offset_tables.byte_offset_to_start_of_lines[original_line]; + let mut original_column = position - byte_offset_to_start_of_line; + if let Some(column_offsets_id) = self.line_offset_tables.column_offsets_ids[original_line] { + let column_offsets = 
&self.line_offset_tables.column_offsets[column_offsets_id]; + if original_column >= column_offsets.byte_offset_to_first { + original_column = column_offsets.columns + [(original_column - column_offsets.byte_offset_to_first) as usize]; } } - (original_line, original_column) + (original_line as u32, original_column) } #[allow(clippy::cast_possible_truncation)] @@ -170,8 +210,10 @@ impl SourcemapBuilder { self.last_generated_update = output.len(); } - fn generate_line_offset_tables(content: &str) -> Vec<LineOffsetTable> { - let mut tables = vec![]; + fn generate_line_offset_tables(content: &str) -> LineOffsetTables { + let mut byte_offset_to_start_of_lines = vec![]; + let mut column_offsets_ids = vec![]; + let mut column_offsets = IndexVec::new(); // Process content line-by-line. // For each line, start by assuming line will be entirely ASCII, and read byte-by-byte. @@ -181,13 +223,8 @@ impl SourcemapBuilder { // At end of line, go back to top of outer loop, and again assume ASCII for next line. let mut line_byte_offset = 0; 'lines: loop { - tables.push(LineOffsetTable { - columns: None, - // `usize::MAX` so `original_column >= line.byte_offset_to_first` check in - // `search_original_line_and_column` fails if line is all ASCII - byte_offset_to_first: u32::MAX, - byte_offset_to_start_of_line: line_byte_offset, - }); + byte_offset_to_start_of_lines.push(line_byte_offset); + column_offsets_ids.push(None); let remaining = &content.as_bytes()[line_byte_offset as usize..]; for (byte_offset_from_line_start, b) in remaining.iter().enumerate() { @@ -209,11 +246,11 @@ impl SourcemapBuilder { } _ => { // Unicode char found. - // Create `columns` Vec, and set `byte_offset_to_first`. - let table = tables.iter_mut().last().unwrap(); - table.byte_offset_to_first = byte_offset_from_line_start; - table.columns = Some(vec![]); - let columns = table.columns.as_mut().unwrap(); + // Set `column_offsets_id` for line and create `columns` Vec. 
+ *column_offsets_ids.last_mut().unwrap() = + Some(ColumnOffsetsId::from_usize(column_offsets.len())); + + let mut columns = vec![]; // Loop through rest of line char-by-char. // `chunk_byte_offset` in this loop is byte offset from start of this 1st @@ -256,6 +293,13 @@ impl SourcemapBuilder { // Line break found. // `chunk_byte_offset` is now the offset of *end* of the line break. line_byte_offset += chunk_byte_offset; + + // Record column offsets + column_offsets.push(ColumnOffsets { + byte_offset_to_first: byte_offset_from_line_start, + columns: columns.into_boxed_slice(), + }); + // Revert back to outer loop for next line continue 'lines; } @@ -263,6 +307,13 @@ impl SourcemapBuilder { // EOF. // One last column entry for EOF position. columns.push(column); + + // Record column offsets + column_offsets.push(ColumnOffsets { + byte_offset_to_first: byte_offset_from_line_start, + columns: columns.into_boxed_slice(), + }); + break 'lines; } }; @@ -277,7 +328,7 @@ impl SourcemapBuilder { break; } - tables + LineOffsetTables { byte_offset_to_start_of_lines, column_offsets_ids, column_offsets } } }