diff --git a/Cargo.lock b/Cargo.lock index 0a4cbb54f47ed..ffd4f9556ab5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1402,9 +1402,11 @@ dependencies = [ "bitflags 2.6.0", "daachorse", "insta", + "nonmax", "once_cell", "oxc_allocator", "oxc_ast", + "oxc_index", "oxc_mangler", "oxc_parser", "oxc_sourcemap", diff --git a/crates/oxc_codegen/Cargo.toml b/crates/oxc_codegen/Cargo.toml index 0602bd3f91419..8f249d9f3a6e3 100644 --- a/crates/oxc_codegen/Cargo.toml +++ b/crates/oxc_codegen/Cargo.toml @@ -26,8 +26,10 @@ oxc_allocator = { workspace = true } oxc_syntax = { workspace = true, features = ["to_js_string"] } oxc_sourcemap = { workspace = true } oxc_mangler = { workspace = true } +oxc_index = { workspace = true } bitflags = { workspace = true } +nonmax = { workspace = true } once_cell = { workspace = true } daachorse = { workspace = true } rustc-hash = { workspace = true } diff --git a/crates/oxc_codegen/src/sourcemap_builder.rs b/crates/oxc_codegen/src/sourcemap_builder.rs index b2d25640da2ba..f8142321ecf78 100644 --- a/crates/oxc_codegen/src/sourcemap_builder.rs +++ b/crates/oxc_codegen/src/sourcemap_builder.rs @@ -1,5 +1,8 @@ use std::sync::Arc; +use nonmax::NonMaxU32; + +use oxc_index::{Idx, IndexVec}; use oxc_span::Span; use oxc_syntax::identifier::{LS, PS}; @@ -9,16 +12,54 @@ const LS_OR_PS_SECOND: u8 = 0x80; const LS_THIRD: u8 = 0xA8; const PS_THIRD: u8 = 0xA9; -/// Line offset table +/// Index into vec of `ColumnOffsets` +#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct ColumnOffsetsId(NonMaxU32); + +impl Idx for ColumnOffsetsId { + #[allow(clippy::cast_possible_truncation)] + fn from_usize(idx: usize) -> Self { + assert!(idx < u32::MAX as usize); + // SAFETY: We just checked `idx` is a legal value for `NonMaxU32` + Self(unsafe { NonMaxU32::new_unchecked(idx as u32) }) + } + + fn index(self) -> usize { + self.0.get() as usize + } +} + +/// Line offset tables. /// /// Used for tracking lines and columns from byte offsets via binary search. /// /// Code is adapted from [esbuild](https://github.com/evanw/esbuild/blob/cc74e6042a9f573bf58e1e3f165ebda70af4ad3b/internal/js_printer/js_printer.go#L4806-L4808) +/// +/// Most lines of source code will not contain Unicode chars, so optimize storage for this common case. +/// +/// Each line is represented by a `Line`. +/// Where a line is entirely ASCII, translating byte offset to UTF-16 column is simple, +/// given the byte offset of start of line. A column lookup table isn't needed for that line. +/// In this case, `Line::column_offsets_id` is `None`. +/// For rare lines which do contain Unicode chars, we store column offsets in a `ColumnOffsets` which +/// is stored in a separate `IndexVec`. `Line::column_offsets_id` contains index for that line's `ColumnOffsets`. +/// Storing column offset info which is rarely used in a separate structure keeps `Line` as small as possible. +#[derive(Debug, Default)] +pub struct LineOffsetTables { + lines: Vec, + column_offsets: IndexVec, +} + #[derive(Debug)] -pub struct LineOffsetTable { - columns: Option>, - byte_offset_to_first: u32, +pub struct Line { byte_offset_to_start_of_line: u32, + column_offsets_id: Option, +} + +#[derive(Debug)] +pub struct ColumnOffsets { + byte_offset_to_first: u32, + columns: Box<[u32]>, } #[allow(clippy::struct_field_names)] @@ -27,7 +68,7 @@ pub struct SourcemapBuilder { original_source: Arc, last_generated_update: usize, last_position: Option, - line_offset_tables: Vec, + line_offset_tables: LineOffsetTables, sourcemap_builder: oxc_sourcemap::SourceMapBuilder, generated_line: u32, generated_column: u32, @@ -40,7 +81,7 @@ impl Default for SourcemapBuilder { original_source: "".into(), last_generated_update: 0, last_position: None, - line_offset_tables: vec![], + line_offset_tables: LineOffsetTables::default(), sourcemap_builder: oxc_sourcemap::SourceMapBuilder::default(), generated_line: 0, generated_column: 0, @@ -97,17 +138,19 @@ impl SourcemapBuilder { fn search_original_line_and_column(&mut self, position: u32) -> (u32, u32) { let result = self .line_offset_tables - .partition_point(|table| table.byte_offset_to_start_of_line <= position) - as u32; + .lines + .partition_point(|table| table.byte_offset_to_start_of_line <= position); let original_line = if result > 0 { result - 1 } else { 0 }; - let line = &self.line_offset_tables[original_line as usize]; + let line = &self.line_offset_tables.lines[original_line]; let mut original_column = position - line.byte_offset_to_start_of_line; - if original_column >= line.byte_offset_to_first { - if let Some(cols) = &line.columns { - original_column = cols[(original_column - line.byte_offset_to_first) as usize]; + if let Some(column_offsets_id) = line.column_offsets_id { + let column_offsets = &self.line_offset_tables.column_offsets[column_offsets_id]; + if original_column >= column_offsets.byte_offset_to_first { + original_column = column_offsets.columns + [(original_column - column_offsets.byte_offset_to_first) as usize]; } } - (original_line, original_column) + (original_line as u32, original_column) } #[allow(clippy::cast_possible_truncation)] @@ -170,8 +213,9 @@ impl SourcemapBuilder { self.last_generated_update = output.len(); } - fn generate_line_offset_tables(content: &str) -> Vec { - let mut tables = vec![]; + fn generate_line_offset_tables(content: &str) -> LineOffsetTables { + let mut lines = vec![]; + let mut column_offsets = IndexVec::new(); // Process content line-by-line. // For each line, start by assuming line will be entirely ASCII, and read byte-by-byte. @@ -181,12 +225,9 @@ impl SourcemapBuilder { // At end of line, go back to top of outer loop, and again assume ASCII for next line. let mut line_byte_offset = 0; 'lines: loop { - tables.push(LineOffsetTable { - columns: None, - // `usize::MAX` so `original_column >= line.byte_offset_to_first` check in - // `search_original_line_and_column` fails if line is all ASCII - byte_offset_to_first: u32::MAX, + lines.push(Line { byte_offset_to_start_of_line: line_byte_offset, + column_offsets_id: None, }); let remaining = &content.as_bytes()[line_byte_offset as usize..]; @@ -209,11 +250,12 @@ impl SourcemapBuilder { } _ => { // Unicode char found. - // Create `columns` Vec, and set `byte_offset_to_first`. - let table = tables.iter_mut().last().unwrap(); - table.byte_offset_to_first = byte_offset_from_line_start; - table.columns = Some(vec![]); - let columns = table.columns.as_mut().unwrap(); + // Set `column_offsets_id` for line and create `columns` Vec. + let line = lines.iter_mut().last().unwrap(); + line.column_offsets_id = + Some(ColumnOffsetsId::from_usize(column_offsets.len())); + + let mut columns = vec![]; // Loop through rest of line char-by-char. // `chunk_byte_offset` in this loop is byte offset from start of this 1st @@ -256,6 +298,13 @@ impl SourcemapBuilder { // Line break found. // `chunk_byte_offset` is now the offset of *end* of the line break. line_byte_offset += chunk_byte_offset; + + // Record column offsets + column_offsets.push(ColumnOffsets { + byte_offset_to_first: byte_offset_from_line_start, + columns: columns.into_boxed_slice(), + }); + // Revert back to outer loop for next line continue 'lines; } @@ -263,6 +312,13 @@ impl SourcemapBuilder { // EOF. // One last column entry for EOF position. columns.push(column); + + // Record column offsets + column_offsets.push(ColumnOffsets { + byte_offset_to_first: byte_offset_from_line_start, + columns: columns.into_boxed_slice(), + }); + break 'lines; } }; @@ -277,7 +333,7 @@ impl SourcemapBuilder { break; } - tables + LineOffsetTables { lines, column_offsets } } }