Skip to content

Commit c9c38a1

Browse files
committed
perf(parser): support peeking over bytes (#4304)
Closes #3291
1 parent 732f4e2 commit c9c38a1

File tree

11 files changed

+116
-76
lines changed

11 files changed

+116
-76
lines changed

crates/oxc_ast/src/ast_impl/literal.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,24 @@ impl TryFrom<char> for RegExpFlags {
108108
}
109109
}
110110

111+
impl TryFrom<u8> for RegExpFlags {
112+
type Error = u8;
113+
114+
fn try_from(value: u8) -> Result<Self, Self::Error> {
115+
match value {
116+
b'g' => Ok(Self::G),
117+
b'i' => Ok(Self::I),
118+
b'm' => Ok(Self::M),
119+
b's' => Ok(Self::S),
120+
b'u' => Ok(Self::U),
121+
b'y' => Ok(Self::Y),
122+
b'd' => Ok(Self::D),
123+
b'v' => Ok(Self::V),
124+
_ => Err(value),
125+
}
126+
}
127+
}
128+
111129
impl fmt::Display for RegExpFlags {
112130
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
113131
if self.contains(Self::G) {

crates/oxc_parser/src/lexer/byte_handlers.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -336,12 +336,12 @@ ascii_byte_handler!(PRD(lexer) {
336336
// /
337337
ascii_byte_handler!(SLH(lexer) {
338338
lexer.consume_char();
339-
match lexer.peek() {
340-
Some('/') => {
339+
match lexer.peek_byte() {
340+
Some(b'/') => {
341341
lexer.consume_char();
342342
lexer.skip_single_line_comment()
343343
}
344-
Some('*') => {
344+
Some(b'*') => {
345345
lexer.consume_char();
346346
lexer.skip_multi_line_comment()
347347
}
@@ -418,9 +418,9 @@ ascii_byte_handler!(QST(lexer) {
418418
} else {
419419
Kind::Question2
420420
}
421-
} else if lexer.peek() == Some('.') {
421+
} else if lexer.peek_byte() == Some(b'.') {
422422
// parse `?.1` as `?` `.1`
423-
if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) {
423+
if lexer.peek_char2().is_some_and(|c| c.is_ascii_digit()) {
424424
Kind::Question
425425
} else {
426426
lexer.consume_char();

crates/oxc_parser/src/lexer/identifier.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ impl<'a> Lexer<'a> {
9898
/// Any number of characters can have already been consumed from `self.source` prior to it.
9999
/// `self.source` should be positioned at start of Unicode character.
100100
fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
101-
let c = self.peek().unwrap();
101+
let c = self.peek_char().unwrap();
102102
if is_identifier_part_unicode(c) {
103103
self.consume_char();
104104
self.identifier_tail_after_unicode(start_pos)
@@ -115,7 +115,7 @@ impl<'a> Lexer<'a> {
115115
pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
116116
// Identifier contains a Unicode chars, so probably contains more.
117117
// So just iterate over chars now, instead of bytes.
118-
while let Some(c) = self.peek() {
118+
while let Some(c) = self.peek_char() {
119119
if is_identifier_part(c) {
120120
self.consume_char();
121121
} else if c == '\\' {
@@ -177,7 +177,7 @@ impl<'a> Lexer<'a> {
177177
// Consume chars until reach end of identifier or another escape
178178
let chunk_start = self.source.position();
179179
loop {
180-
let maybe_char = self.peek();
180+
let maybe_char = self.peek_char();
181181
if maybe_char.is_some_and(is_identifier_part) {
182182
self.consume_char();
183183
continue;
@@ -272,7 +272,7 @@ impl<'a> Lexer<'a> {
272272
fn private_identifier_not_ascii_id(&mut self) -> Kind {
273273
let b = self.source.peek_byte().unwrap();
274274
if !b.is_ascii() {
275-
let c = self.peek().unwrap();
275+
let c = self.peek_char().unwrap();
276276
if is_identifier_start_unicode(c) {
277277
let start_pos = self.source.position();
278278
self.consume_char();

crates/oxc_parser/src/lexer/jsx.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,12 @@ impl<'a> Lexer<'a> {
6161
/// `JSXFragment`
6262
/// { `JSXChildExpressionopt` }
6363
fn read_jsx_child(&mut self) -> Kind {
64-
match self.peek() {
65-
Some('<') => {
64+
match self.peek_byte() {
65+
Some(b'<') => {
6666
self.consume_char();
6767
Kind::LAngle
6868
}
69-
Some('{') => {
69+
Some(b'{') => {
7070
self.consume_char();
7171
Kind::LCurly
7272
}
@@ -122,7 +122,7 @@ impl<'a> Lexer<'a> {
122122
// Unicode chars are rare in identifiers, so cold branch to keep common path for ASCII
123123
// as fast as possible
124124
cold_branch(|| {
125-
while let Some(c) = self.peek() {
125+
while let Some(c) = self.peek_char() {
126126
if c == '-' || is_identifier_part(c) {
127127
self.consume_char();
128128
} else {

crates/oxc_parser/src/lexer/kind.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,11 +206,11 @@ impl Kind {
206206
)
207207
}
208208

209-
pub fn matches_number_char(self, c: char) -> bool {
209+
pub fn matches_number_char(self, c: u8) -> bool {
210210
match self {
211211
Decimal => c.is_ascii_digit(),
212-
Binary => matches!(c, '0'..='1'),
213-
Octal => matches!(c, '0'..='7'),
212+
Binary => matches!(c, b'0'..=b'1'),
213+
Octal => matches!(c, b'0'..=b'7'),
214214
Hex => c.is_ascii_hexdigit(),
215215
_ => unreachable!(),
216216
}

crates/oxc_parser/src/lexer/mod.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -251,15 +251,27 @@ impl<'a> Lexer<'a> {
251251
self.source.next_char().unwrap()
252252
}
253253

254+
/// Peek the next byte without advancing the position
255+
#[inline]
256+
fn peek_byte(&self) -> Option<u8> {
257+
self.source.peek_byte()
258+
}
259+
260+
/// Peek the next two bytes without advancing the position
261+
#[inline]
262+
fn peek_2_bytes(&self) -> Option<[u8; 2]> {
263+
self.source.peek_2_bytes()
264+
}
265+
254266
/// Peek the next char without advancing the position
255267
#[inline]
256-
fn peek(&self) -> Option<char> {
268+
fn peek_char(&self) -> Option<char> {
257269
self.source.peek_char()
258270
}
259271

260272
/// Peek the next next char without advancing the position
261273
#[inline]
262-
fn peek2(&self) -> Option<char> {
274+
fn peek_char2(&self) -> Option<char> {
263275
self.source.peek_char2()
264276
}
265277

@@ -284,7 +296,7 @@ impl<'a> Lexer<'a> {
284296
/// Return `IllegalCharacter` Error or `UnexpectedEnd` if EOF
285297
fn unexpected_err(&mut self) {
286298
let offset = self.current_offset();
287-
match self.peek() {
299+
match self.peek_char() {
288300
Some(c) => self.error(diagnostics::invalid_character(c, offset)),
289301
None => self.error(diagnostics::unexpected_end(offset)),
290302
}

crates/oxc_parser/src/lexer/numeric.rs

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,19 @@ use crate::diagnostics;
66
impl<'a> Lexer<'a> {
77
/// 12.9.3 Numeric Literals with `0` prefix
88
pub(super) fn read_zero(&mut self) -> Kind {
9-
match self.peek() {
10-
Some('b' | 'B') => self.read_non_decimal(Kind::Binary),
11-
Some('o' | 'O') => self.read_non_decimal(Kind::Octal),
12-
Some('x' | 'X') => self.read_non_decimal(Kind::Hex),
13-
Some('e' | 'E') => {
9+
match self.peek_byte() {
10+
Some(b'b' | b'B') => self.read_non_decimal(Kind::Binary),
11+
Some(b'o' | b'O') => self.read_non_decimal(Kind::Octal),
12+
Some(b'x' | b'X') => self.read_non_decimal(Kind::Hex),
13+
Some(b'e' | b'E') => {
1414
self.consume_char();
1515
self.read_decimal_exponent()
1616
}
17-
Some('.') => {
17+
Some(b'.') => {
1818
self.consume_char();
1919
self.decimal_literal_after_decimal_point_after_digits()
2020
}
21-
Some('n') => {
21+
Some(b'n') => {
2222
self.consume_char();
2323
self.check_after_numeric_literal(Kind::Decimal)
2424
}
@@ -42,23 +42,23 @@ impl<'a> Lexer<'a> {
4242
fn read_non_decimal(&mut self, kind: Kind) -> Kind {
4343
self.consume_char();
4444

45-
if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
45+
if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
4646
self.consume_char();
4747
} else {
4848
self.unexpected_err();
4949
return Kind::Undetermined;
5050
}
5151

52-
while let Some(c) = self.peek() {
52+
while let Some(c) = self.peek_byte() {
5353
match c {
54-
'_' => {
54+
b'_' => {
5555
self.consume_char();
5656
// NOTE: it looks invalid numeric tokens are still parsed.
5757
// This seems to be a waste. It also requires us to put this
5858
// call here instead of after we ensure the next character
5959
// is a number character
6060
self.token.set_has_separator();
61-
if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
61+
if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
6262
self.consume_char();
6363
} else {
6464
self.unexpected_err();
@@ -71,35 +71,33 @@ impl<'a> Lexer<'a> {
7171
_ => break,
7272
}
7373
}
74-
if self.peek() == Some('n') {
75-
self.consume_char();
76-
}
74+
self.next_ascii_char_eq(b'n');
7775
self.check_after_numeric_literal(kind)
7876
}
7977

8078
fn read_legacy_octal(&mut self) -> Kind {
8179
let mut kind = Kind::Octal;
8280
loop {
83-
match self.peek() {
84-
Some('0'..='7') => {
81+
match self.peek_byte() {
82+
Some(b'0'..=b'7') => {
8583
self.consume_char();
8684
}
87-
Some('8'..='9') => {
85+
Some(b'8'..=b'9') => {
8886
self.consume_char();
8987
kind = Kind::Decimal;
9088
}
9189
_ => break,
9290
}
9391
}
9492

95-
match self.peek() {
93+
match self.peek_byte() {
9694
// allow 08.5 and 09.5
97-
Some('.') if kind == Kind::Decimal => {
95+
Some(b'.') if kind == Kind::Decimal => {
9896
self.consume_char();
9997
self.decimal_literal_after_decimal_point_after_digits()
10098
}
10199
// allow 08e1 and 09e1
102-
Some('e') if kind == Kind::Decimal => {
100+
Some(b'e') if kind == Kind::Decimal => {
103101
self.consume_char();
104102
self.read_decimal_exponent()
105103
}
@@ -108,12 +106,12 @@ impl<'a> Lexer<'a> {
108106
}
109107

110108
fn read_decimal_exponent(&mut self) -> Kind {
111-
let kind = match self.peek() {
112-
Some('-') => {
109+
let kind = match self.peek_byte() {
110+
Some(b'-') => {
113111
self.consume_char();
114112
Kind::NegativeExponential
115113
}
116-
Some('+') => {
114+
Some(b'+') => {
117115
self.consume_char();
118116
Kind::PositiveExponential
119117
}
@@ -124,7 +122,7 @@ impl<'a> Lexer<'a> {
124122
}
125123

126124
fn read_decimal_digits(&mut self) {
127-
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
125+
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
128126
self.consume_char();
129127
} else {
130128
self.unexpected_err();
@@ -135,23 +133,23 @@ impl<'a> Lexer<'a> {
135133
}
136134

137135
fn read_decimal_digits_after_first_digit(&mut self) {
138-
while let Some(c) = self.peek() {
139-
match c {
140-
'_' => {
136+
while let Some(b) = self.peek_byte() {
137+
match b {
138+
b'_' => {
141139
self.consume_char();
142140
// NOTE: it looks invalid numeric tokens are still parsed.
143141
// This seems to be a waste. It also requires us to put this
144142
// call here instead of after we ensure the next character
145143
// is an ASCII digit
146144
self.token.set_has_separator();
147-
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
145+
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
148146
self.consume_char();
149147
} else {
150148
self.unexpected_err();
151149
return;
152150
}
153151
}
154-
'0'..='9' => {
152+
b'0'..=b'9' => {
155153
self.consume_char();
156154
}
157155
_ => break,
@@ -172,16 +170,14 @@ impl<'a> Lexer<'a> {
172170
}
173171

174172
fn optional_decimal_digits(&mut self) {
175-
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
173+
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
176174
self.consume_char();
177-
} else {
178-
return;
175+
self.read_decimal_digits_after_first_digit();
179176
}
180-
self.read_decimal_digits_after_first_digit();
181177
}
182178

183179
fn optional_exponent(&mut self) -> Option<Kind> {
184-
if matches!(self.peek(), Some('e' | 'E')) {
180+
if matches!(self.peek_byte(), Some(b'e' | b'E')) {
185181
self.consume_char();
186182
return Some(self.read_decimal_exponent());
187183
}
@@ -191,12 +187,12 @@ impl<'a> Lexer<'a> {
191187
fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
192188
let offset = self.offset();
193189
// The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
194-
let c = self.peek();
190+
let c = self.peek_char();
195191
if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) {
196192
return kind;
197193
}
198194
self.consume_char();
199-
while let Some(c) = self.peek() {
195+
while let Some(c) = self.peek_char() {
200196
if is_identifier_start(c) {
201197
self.consume_char();
202198
} else {

crates/oxc_parser/src/lexer/punctuation.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@ use super::{Kind, Lexer, Token};
33
impl<'a> Lexer<'a> {
44
/// Section 12.8 Punctuators
55
pub(super) fn read_dot(&mut self) -> Kind {
6-
if self.peek() == Some('.') && self.peek2() == Some('.') {
6+
if self.peek_2_bytes() == Some([b'.', b'.']) {
77
self.consume_char();
88
self.consume_char();
99
return Kind::Dot3;
1010
}
11-
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
11+
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
1212
self.decimal_literal_after_decimal_point()
1313
} else {
1414
Kind::Dot
@@ -25,7 +25,7 @@ impl<'a> Lexer<'a> {
2525
}
2626
} else if self.next_ascii_char_eq(b'=') {
2727
Some(Kind::LtEq)
28-
} else if self.peek() == Some('!')
28+
} else if self.peek_byte() == Some(b'!')
2929
// SingleLineHTMLOpenComment `<!--` in script mode
3030
&& self.source_type.is_script()
3131
&& self.remaining().starts_with("!--")

crates/oxc_parser/src/lexer/regex.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,16 @@ impl<'a> Lexer<'a> {
5858
let pattern_end = self.offset() - 1; // -1 to exclude `/`
5959
let mut flags = RegExpFlags::empty();
6060

61-
while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
61+
while let Some(ch @ (b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) =
62+
self.peek_byte()
63+
{
6264
self.consume_char();
6365
let Ok(flag) = RegExpFlags::try_from(ch) else {
64-
self.error(diagnostics::reg_exp_flag(ch, self.current_offset()));
66+
self.error(diagnostics::reg_exp_flag(ch as char, self.current_offset()));
6567
continue;
6668
};
6769
if flags.contains(flag) {
68-
self.error(diagnostics::reg_exp_flag_twice(ch, self.current_offset()));
70+
self.error(diagnostics::reg_exp_flag_twice(ch as char, self.current_offset()));
6971
continue;
7072
}
7173
flags |= flag;

0 commit comments

Comments
 (0)