swc_ecma_parser/lexer/
mod.rs

1//! ECMAScript lexer.
2
3use std::{borrow::Cow, char, iter::FusedIterator, rc::Rc};
4
5use either::Either::{self, Left, Right};
6use smartstring::{LazyCompact, SmartString};
7use swc_atoms::{
8    wtf8::{CodePoint, Wtf8, Wtf8Buf},
9    Atom, AtomStoreCell,
10};
11use swc_common::{
12    comments::{Comment, CommentKind, Comments},
13    input::{Input, StringInput},
14    BytePos, Span,
15};
16use swc_ecma_ast::{EsVersion, Ident};
17
18use self::table::{ByteHandler, BYTE_HANDLERS};
19use crate::{
20    byte_search,
21    error::{Error, SyntaxError},
22    input::Tokens,
23    lexer::{
24        char_ext::CharExt,
25        comments_buffer::{BufferedComment, BufferedCommentKind, CommentsBuffer},
26        jsx::xhtml,
27        number::{parse_integer, LazyInteger},
28        search::SafeByteMatchTable,
29        state::State,
30    },
31    safe_byte_match_table,
32    syntax::SyntaxFlags,
33    BigIntValue, Context, Syntax,
34};
35
36#[cfg(feature = "unstable")]
37pub(crate) mod capturing;
38mod char_ext;
39mod comments_buffer;
40mod jsx;
41mod number;
42pub(crate) mod search;
43mod state;
44mod table;
45pub(crate) mod token;
46mod whitespace;
47
48pub(crate) use state::TokenFlags;
49pub(crate) use token::{NextTokenAndSpan, Token, TokenAndSpan, TokenValue};
50
51// ===== Byte match tables for comment scanning =====
52// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS)
// In UTF-8 they are the three-byte sequences E2 80 A8 (LS) and E2 80 A9 (PS).
// The first byte is shared with other characters, so a match on it must be
// confirmed by checking the two bytes that follow.
53const LS_OR_PS_FIRST: u8 = 0xe2;
// Second and third UTF-8 bytes of LS (U+2028).
54const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xa8];
// Second and third UTF-8 bytes of PS (U+2029).
55const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xa9];
56
// Matches `\n`, `\r`, and the first byte of LS/PS. Used by the line-comment scan.
57static LINE_BREAK_TABLE: SafeByteMatchTable =
58    safe_byte_match_table!(|b| matches!(b, b'\n' | b'\r' | LS_OR_PS_FIRST));
59
// Same as `LINE_BREAK_TABLE` plus `*`, the possible start of a closing `*/`.
// Used by the block-comment scan.
60static BLOCK_COMMENT_SCAN_TABLE: SafeByteMatchTable =
61    safe_byte_match_table!(|b| { matches!(b, b'*' | b'\n' | b'\r' | LS_OR_PS_FIRST) });
62
// Match the closing quote, a backslash, `\n` or `\r`.
// NOTE(review): the users of these two tables are outside this chunk; presumably
// they drive string-literal scanning - confirm.
63static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
64    safe_byte_match_table!(|b| matches!(b, b'"' | b'\n' | b'\\' | b'\r'));
65static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
66    safe_byte_match_table!(|b| matches!(b, b'\'' | b'\n' | b'\\' | b'\r'));
67
// Matches every byte that is NOT an ASCII letter, ASCII digit, `_` or `$`.
68static NOT_ASCII_ID_CONTINUE_TABLE: SafeByteMatchTable =
69    safe_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'));
70
/// Combines a UTF-16 surrogate pair into the code point it encodes.
///
/// `high` is the leading surrogate (`0xD800..=0xDBFF`) and `low` the trailing
/// surrogate (`0xDC00..=0xDFFF`); the result lies in `0x10000..=0x10FFFF`.
/// See `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair`.
#[inline]
const fn pair_to_code_point(high: u32, low: u32) -> u32 {
    const LEAD_BASE: u32 = 0xd800;
    const TRAIL_BASE: u32 = 0xdc00;
    const ASTRAL_BASE: u32 = 0x10000;

    // Each leading surrogate selects a run of 0x400 code points.
    let lead_bits = (high - LEAD_BASE) * 0x400;
    lead_bits + low - TRAIL_BASE + ASTRAL_BASE
}
77
78/// A Unicode escape sequence.
79///
80/// `\u Hex4Digits`, `\u Hex4Digits \u Hex4Digits`, or `\u{ HexDigits }`.
81#[derive(Debug)]
82pub enum UnicodeEscape {
83    /// `\u Hex4Digits` or `\u{ HexDigits }`, which forms a valid Unicode code point.
84    /// Char cannot be in range 0xD800..=0xDFFF.
85    CodePoint(char),
86    /// `\u Hex4Digits \u Hex4Digits`, which forms a valid Unicode astral code point.
87    /// Char is in the range 0x10000..=0x10FFFF.
88    SurrogatePair(char),
89    /// `\u Hex4Digits` or `\u{ HexDigits }`, which forms an invalid Unicode code point.
90    /// Code unit is in the range 0xD800..=0xDFFF.
91    LoneSurrogate(u32),
92}
93
94impl From<UnicodeEscape> for CodePoint {
95    fn from(value: UnicodeEscape) -> Self {
96        match value {
97            UnicodeEscape::CodePoint(c) | UnicodeEscape::SurrogatePair(c) => {
98                CodePoint::from_char(c)
99            }
100            UnicodeEscape::LoneSurrogate(u) => unsafe { CodePoint::from_u32_unchecked(u) },
101        }
102    }
103}
104
/// Result alias for lexer operations; the error type is [`crate::error::Error`].
105pub type LexResult<T> = Result<T, crate::error::Error>;
106
/// Returns `s` with every `_` (numeric separator) removed.
///
/// `has_underscore` states whether `s` contains a `_`. When it is `false` the
/// input is returned borrowed, without scanning it. Debug builds assert that
/// the flag agrees with the actual contents of `s`.
fn remove_underscore(s: &str, has_underscore: bool) -> Cow<'_, str> {
    if !has_underscore {
        debug_assert!(!s.contains('_'));
        return Cow::Borrowed(s);
    }

    debug_assert!(s.contains('_'));
    let mut stripped = String::with_capacity(s.len());
    stripped.extend(s.chars().filter(|ch| *ch != '_'));
    Cow::Owned(stripped)
}
116
/// ECMAScript lexer state: the input cursor plus everything accumulated while
/// producing tokens (comments, errors, interned atoms).
117#[derive(Clone)]
118pub struct Lexer<'a> {
    /// Optional comment store supplied by the caller of `new`.
119    comments: Option<&'a dyn Comments>,
120    /// [Some] if comment parsing is enabled (i.e. `comments` is [Some]). Otherwise [None]
121    comments_buffer: Option<CommentsBuffer>,
122
    /// Current lexing context flags (e.g. `Context::Strict`, `Context::InType`).
123    pub ctx: Context,
    /// The source text being lexed, with its cursor.
124    input: StringInput<'a>,
    /// `input.last_pos()` captured at construction.
125    start_pos: BytePos,
126
127    state: State,
    /// Reset to empty at the start of every `read_token` call.
128    token_flags: TokenFlags,
129    pub(crate) syntax: SyntaxFlags,
130    pub(crate) target: EsVersion,
131
    /// Errors reported through `push_error`.
132    errors: Vec<Error>,
    // NOTE(review): presumably filled by `add_module_mode_error`, which is not
    // visible in this chunk - confirm.
133    module_errors: Vec<Error>,
134
    /// Atom store used by `atom` / `wtf8_atom` to intern strings.
135    atoms: Rc<AtomStoreCell>,
136}
137
// Marker impl: promises that iteration keeps yielding `None` once it has
// yielded `None`. NOTE(review): the `Iterator` impl itself is outside this
// chunk - confirm it upholds that.
138impl FusedIterator for Lexer<'_> {}
139
// Thin field accessors and input/atom-store delegates used by the rest of the lexer.
140impl<'a> Lexer<'a> {
    /// Shared access to the input cursor.
141    #[inline(always)]
142    fn input(&self) -> &StringInput<'a> {
143        &self.input
144    }
145
    /// Mutable access to the input cursor.
146    #[inline(always)]
147    fn input_mut(&mut self) -> &mut StringInput<'a> {
148        &mut self.input
149    }
150
    /// Appends `error` to the lexer's error list.
151    #[inline(always)]
152    fn push_error(&mut self, error: Error) {
153        self.errors.push(error);
154    }
155
    /// Shared access to the lexer state.
156    #[inline(always)]
157    fn state(&self) -> &State {
158        &self.state
159    }
160
    /// Mutable access to the lexer state.
161    #[inline(always)]
162    fn state_mut(&mut self) -> &mut State {
163        &mut self.state
164    }
165
    /// The comment store passed to `new`, if any.
166    #[inline(always)]
167    fn comments(&self) -> Option<&'a dyn swc_common::comments::Comments> {
168        self.comments
169    }
170
    /// The comment buffer; `None` when comment collection is disabled.
171    #[inline(always)]
172    fn comments_buffer(&self) -> Option<&CommentsBuffer> {
173        self.comments_buffer.as_ref()
174    }
175
    /// Mutable comment buffer; `None` when comment collection is disabled.
176    #[inline(always)]
177    fn comments_buffer_mut(&mut self) -> Option<&mut CommentsBuffer> {
178        self.comments_buffer.as_mut()
179    }
180
    /// Slices the input from `start` up to the current position.
    ///
    /// # Safety
    ///
    /// Forwards to `StringInput::slice_to_cur`; the caller must uphold that
    /// method's contract. NOTE(review): presumably `start` must be a valid
    /// position of this input that is not past the cursor - confirm.
181    #[inline(always)]
182    unsafe fn input_slice_to_cur(&mut self, start: BytePos) -> &'a str {
183        self.input.slice_to_cur(start)
184    }
185
    /// Slices the input between `start` and `end`.
    ///
    /// # Safety
    ///
    /// Forwards to `StringInput::slice`; the caller must uphold that method's
    /// contract. NOTE(review): presumably both positions must be valid
    /// positions of this input - confirm.
186    #[inline(always)]
187    unsafe fn input_slice(&mut self, start: BytePos, end: BytePos) -> &'a str {
188        self.input.slice(start, end)
189    }
190
    /// Forwards to `uncons_while` on the input with predicate `f` and returns
    /// the slice it yields.
191    #[inline(always)]
192    fn input_uncons_while(&mut self, f: impl FnMut(char) -> bool) -> &'a str {
193        self.input_mut().uncons_while(f)
194    }
195
    /// Interns `s` in the lexer's atom store.
196    #[inline(always)]
197    fn atom<'b>(&self, s: impl Into<std::borrow::Cow<'b, str>>) -> swc_atoms::Atom {
198        self.atoms.atom(s)
199    }
200
    /// Interns the WTF-8 string `s` in the lexer's atom store.
201    #[inline(always)]
202    fn wtf8_atom<'b>(&self, s: impl Into<std::borrow::Cow<'b, Wtf8>>) -> swc_atoms::Wtf8Atom {
203        self.atoms.wtf8_atom(s)
204    }
205}
206
207impl<'a> Lexer<'a> {
208    pub fn new(
209        syntax: Syntax,
210        target: EsVersion,
211        input: StringInput<'a>,
212        comments: Option<&'a dyn Comments>,
213    ) -> Self {
214        let start_pos = input.last_pos();
215
216        Lexer {
217            comments,
218            comments_buffer: comments.is_some().then(CommentsBuffer::new),
219            ctx: Default::default(),
220            input,
221            start_pos,
222            state: State::new(start_pos),
223            syntax: syntax.into_flags(),
224            target,
225            errors: Default::default(),
226            module_errors: Default::default(),
227            atoms: Default::default(),
228            token_flags: TokenFlags::empty(),
229        }
230    }
231
232    /// babel: `getTokenFromCode`
233    fn read_token(&mut self) -> LexResult<Token> {
234        self.token_flags = TokenFlags::empty();
235        let byte = match self.input.as_str().as_bytes().first() {
236            Some(&v) => v,
237            None => return Ok(Token::Eof),
238        };
239
240        let handler = unsafe { *(&BYTE_HANDLERS as *const ByteHandler).offset(byte as isize) };
241        handler(self)
242    }
243
244    fn read_token_plus_minus<const C: u8>(&mut self) -> LexResult<Token> {
245        let start = self.cur_pos();
246
247        unsafe {
248            // Safety: cur() is Some(c), if this method is called.
249            self.input.bump();
250        }
251
252        // '++', '--'
253        Ok(if self.input.cur() == Some(C as char) {
254            unsafe {
255                // Safety: cur() is Some(c)
256                self.input.bump();
257            }
258
259            // Handle -->
260            if self.state.had_line_break && C == b'-' && self.eat(b'>') {
261                self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule);
262                self.skip_line_comment(0);
263                self.skip_space::<true>();
264                return self.read_token();
265            }
266
267            if C == b'+' {
268                Token::PlusPlus
269            } else {
270                Token::MinusMinus
271            }
272        } else if self.input.eat_byte(b'=') {
273            if C == b'+' {
274                Token::PlusEq
275            } else {
276                Token::MinusEq
277            }
278        } else if C == b'+' {
279            Token::Plus
280        } else {
281            Token::Minus
282        })
283    }
284
285    fn read_token_bang_or_eq<const C: u8>(&mut self) -> LexResult<Token> {
286        let start = self.cur_pos();
287        let had_line_break_before_last = self.had_line_break_before_last();
288
289        unsafe {
290            // Safety: cur() is Some(c) if this method is called.
291            self.input.bump();
292        }
293
294        Ok(if self.input.eat_byte(b'=') {
295            // "=="
296
297            if self.input.eat_byte(b'=') {
298                if C == b'!' {
299                    Token::NotEqEq
300                } else {
301                    // =======
302                    //    ^
303                    if had_line_break_before_last && self.is_str("====") {
304                        self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185);
305                        self.skip_line_comment(4);
306                        self.skip_space::<true>();
307                        return self.read_token();
308                    }
309
310                    Token::EqEqEq
311                }
312            } else if C == b'!' {
313                Token::NotEq
314            } else {
315                Token::EqEq
316            }
317        } else if C == b'=' && self.input.eat_byte(b'>') {
318            // "=>"
319
320            Token::Arrow
321        } else if C == b'!' {
322            Token::Bang
323        } else {
324            Token::Eq
325        })
326    }
327}
328
329impl Lexer<'_> {
330    fn read_token_lt_gt<const C: u8>(&mut self) -> LexResult<Token> {
331        let had_line_break_before_last = self.had_line_break_before_last();
332        let start = self.cur_pos();
333        self.bump();
334
335        if self.syntax.typescript()
336            && self.ctx.contains(Context::InType)
337            && !self.ctx.contains(Context::ShouldNotLexLtOrGtAsType)
338        {
339            if C == b'<' {
340                return Ok(Token::Lt);
341            } else if C == b'>' {
342                return Ok(Token::Gt);
343            }
344        }
345
346        // XML style comment. `<!--`
347        if C == b'<' && self.is(b'!') && self.peek() == Some('-') && self.peek_ahead() == Some('-')
348        {
349            self.skip_line_comment(3);
350            self.skip_space::<true>();
351            self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule);
352
353            return self.read_token();
354        }
355
356        let mut op = if C == b'<' { Token::Lt } else { Token::Gt };
357
358        // '<<', '>>'
359        if self.cur() == Some(C as char) {
360            self.bump();
361            op = if C == b'<' {
362                Token::LShift
363            } else {
364                Token::RShift
365            };
366
367            //'>>>'
368            if C == b'>' && self.cur() == Some(C as char) {
369                self.bump();
370                op = Token::ZeroFillRShift;
371            }
372        }
373
374        let token = if self.eat(b'=') {
375            match op {
376                Token::Lt => Token::LtEq,
377                Token::Gt => Token::GtEq,
378                Token::LShift => Token::LShiftEq,
379                Token::RShift => Token::RShiftEq,
380                Token::ZeroFillRShift => Token::ZeroFillRShiftEq,
381                _ => unreachable!(),
382            }
383        } else {
384            op
385        };
386
387        // All conflict markers consist of the same character repeated seven times.
388        // If it is a <<<<<<< or >>>>>>> marker then it is also followed by a space.
389        // <<<<<<<
390        //   ^
391        // >>>>>>>
392        //    ^
393        if had_line_break_before_last
394            && match op {
395                Token::LShift if self.is_str("<<<<< ") => true,
396                Token::ZeroFillRShift if self.is_str(">>>> ") => true,
397                _ => false,
398            }
399        {
400            self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185);
401            self.skip_line_comment(5);
402            self.skip_space::<true>();
403            return self.read_token();
404        }
405
406        Ok(token)
407    }
408
409    fn read_token_back_quote(&mut self) -> LexResult<Token> {
410        let start = self.cur_pos();
411        self.scan_template_token(start, true)
412    }
413
414    fn scan_template_token(
415        &mut self,
416        start: BytePos,
417        started_with_backtick: bool,
418    ) -> LexResult<Token> {
419        debug_assert!(self.cur() == Some(if started_with_backtick { '`' } else { '}' }));
420        let mut cooked = Ok(Wtf8Buf::with_capacity(8));
421        self.bump(); // `}` or `\``
422        let mut cooked_slice_start = self.cur_pos();
423        let raw_slice_start = cooked_slice_start;
424        let raw_atom = |this: &mut Self| {
425            let last_pos = this.cur_pos();
426            let s = unsafe { this.input.slice(raw_slice_start, last_pos) };
427            this.atoms.atom(s)
428        };
429        macro_rules! consume_cooked {
430            () => {{
431                if let Ok(cooked) = &mut cooked {
432                    let last_pos = self.cur_pos();
433                    cooked.push_str(unsafe {
434                        // Safety: Both of start and last_pos are valid position because we got them
435                        // from `self.input`
436                        self.input.slice(cooked_slice_start, last_pos)
437                    });
438                }
439            }};
440        }
441
442        while let Some(c) = self.cur() {
443            if c == '`' {
444                consume_cooked!();
445                let cooked = cooked.map(|cooked| self.atoms.wtf8_atom(&*cooked));
446                let raw = raw_atom(self);
447                self.bump();
448                return Ok(if started_with_backtick {
449                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
450                    Token::NoSubstitutionTemplateLiteral
451                } else {
452                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
453                    Token::TemplateTail
454                });
455            } else if c == '$' && self.input.peek() == Some('{') {
456                consume_cooked!();
457                let cooked = cooked.map(|cooked| self.atoms.wtf8_atom(&*cooked));
458                let raw = raw_atom(self);
459                self.input.bump_bytes(2);
460                return Ok(if started_with_backtick {
461                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
462                    Token::TemplateHead
463                } else {
464                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
465                    Token::TemplateMiddle
466                });
467            } else if c == '\\' {
468                consume_cooked!();
469
470                match self.read_escaped_char(true) {
471                    Ok(Some(escaped)) => {
472                        if let Ok(ref mut cooked) = cooked {
473                            cooked.push(escaped);
474                        }
475                    }
476                    Ok(None) => {}
477                    Err(error) => {
478                        cooked = Err(error);
479                    }
480                }
481
482                cooked_slice_start = self.cur_pos();
483            } else if c.is_line_terminator() {
484                consume_cooked!();
485
486                let c = if c == '\r' && self.peek() == Some('\n') {
487                    self.bump(); // '\r'
488                    '\n'
489                } else {
490                    match c {
491                        '\n' => '\n',
492                        '\r' => '\n',
493                        '\u{2028}' => '\u{2028}',
494                        '\u{2029}' => '\u{2029}',
495                        _ => unreachable!(),
496                    }
497                };
498
499                self.bump();
500
501                if let Ok(ref mut cooked) = cooked {
502                    cooked.push_char(c);
503                }
504                cooked_slice_start = self.cur_pos();
505            } else {
506                self.bump();
507            }
508        }
509
510        self.error(start, SyntaxError::UnterminatedTpl)?
511    }
512}
513
514impl<'a> Lexer<'a> {
515    #[inline(always)]
516    #[allow(clippy::misnamed_getters)]
517    fn had_line_break_before_last(&self) -> bool {
518        self.state().had_line_break()
519    }
520
521    #[inline(always)]
522    fn span(&self, start: BytePos) -> Span {
523        let end = self.last_pos();
524        if cfg!(debug_assertions) && start > end {
525            unreachable!(
526                "assertion failed: (span.start <= span.end).
527 start = {}, end = {}",
528                start.0, end.0
529            )
530        }
531        Span { lo: start, hi: end }
532    }
533
534    #[inline(always)]
535    fn bump(&mut self) {
536        unsafe {
537            // Safety: Actually this is not safe but this is an internal method.
538            self.input_mut().bump()
539        }
540    }
541
542    #[inline(always)]
543    fn is(&self, c: u8) -> bool {
544        self.input().is_byte(c)
545    }
546
547    #[inline(always)]
548    fn is_str(&self, s: &str) -> bool {
549        self.input().is_str(s)
550    }
551
552    #[inline(always)]
553    fn eat(&mut self, c: u8) -> bool {
554        self.input_mut().eat_byte(c)
555    }
556
557    #[inline(always)]
558    fn cur(&self) -> Option<char> {
559        self.input().cur()
560    }
561
562    #[inline(always)]
563    fn peek(&self) -> Option<char> {
564        self.input().peek()
565    }
566
567    #[inline(always)]
568    fn peek_ahead(&self) -> Option<char> {
569        self.input().peek_ahead()
570    }
571
572    #[inline(always)]
573    fn cur_pos(&self) -> BytePos {
574        self.input().cur_pos()
575    }
576
577    #[inline(always)]
578    fn last_pos(&self) -> BytePos {
579        self.input().last_pos()
580    }
581
582    /// Shorthand for `let span = self.span(start); self.error_span(span)`
583    #[cold]
584    #[inline(never)]
585    fn error<T>(&self, start: BytePos, kind: SyntaxError) -> LexResult<T> {
586        let span = self.span(start);
587        self.error_span(span, kind)
588    }
589
590    #[cold]
591    #[inline(never)]
592    fn error_span<T>(&self, span: Span, kind: SyntaxError) -> LexResult<T> {
593        Err(crate::error::Error::new(span, kind))
594    }
595
596    #[cold]
597    #[inline(never)]
598    fn emit_error(&mut self, start: BytePos, kind: SyntaxError) {
599        let span = self.span(start);
600        self.emit_error_span(span, kind)
601    }
602
603    #[cold]
604    #[inline(never)]
605    fn emit_error_span(&mut self, span: Span, kind: SyntaxError) {
606        if self.ctx().contains(Context::IgnoreError) {
607            return;
608        }
609        tracing::warn!("Lexer error at {:?}", span);
610        let err = crate::error::Error::new(span, kind);
611        self.push_error(err);
612    }
613
614    #[cold]
615    #[inline(never)]
616    fn emit_strict_mode_error(&mut self, start: BytePos, kind: SyntaxError) {
617        let span = self.span(start);
618        if self.ctx().contains(Context::Strict) {
619            self.emit_error_span(span, kind);
620        } else {
621            let err = crate::error::Error::new(span, kind);
622            self.add_module_mode_error(err);
623        }
624    }
625
626    #[cold]
627    #[inline(never)]
628    fn emit_module_mode_error(&mut self, start: BytePos, kind: SyntaxError) {
629        let span = self.span(start);
630        let err = crate::error::Error::new(span, kind);
631        self.add_module_mode_error(err);
632    }
633
    /// Skips a single-line comment up to, but not including, its line
    /// terminator (or to the end of input).
    ///
    /// `start_skip` is the number of marker bytes at the current position that
    /// are excluded from the comment text (callers in this file pass 2 for
    /// `//`, and 0, 3, 4 or 5 for the legacy-comment and conflict-marker
    /// cases).
    ///
    /// When comment collection is enabled the comment is recorded: as pending
    /// for the next token if a line break preceded it or a trailing line
    /// comment is not allowed here, otherwise as a trailing comment at the
    /// previous token's end.
634    #[inline(never)]
635    fn skip_line_comment(&mut self, start_skip: usize) {
636        // Span start: the position on entry, before `start_skip` bytes are consumed.
637        let start = self.cur_pos();
638        self.input_mut().bump_bytes(start_skip);
        // Start of the comment text, right after the skipped marker bytes.
639        let slice_start = self.cur_pos();
640
641        // foo // comment for foo
642        // bar
643        //
644        // foo
645        // // comment for bar
646        // bar
647        //
648        let is_for_next =
649            self.state().had_line_break() || !self.state().can_have_trailing_line_comment();
650
651        // Fast search for line-terminator
652        byte_search! {
653            lexer: self,
654            table: LINE_BREAK_TABLE,
655            continue_if: (matched_byte, pos_offset) {
656                if matched_byte != LS_OR_PS_FIRST {
657                    // '\r' or '\n' - definitely a line terminator
658                    false
659                } else {
660                    // 0xE2 - could be LS/PS or some other Unicode character
661                    // Check the next 2 bytes to see if it's really LS/PS
662                    let current_slice = self.input().as_str();
663                    let byte_pos = pos_offset;
664                    if byte_pos + 2 < current_slice.len() {
665                        let bytes = current_slice.as_bytes();
666                        let next2 = [bytes[byte_pos + 1], bytes[byte_pos + 2]];
667                        if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
668                            // It's a real line terminator
669                            false
670                        } else {
671                            // Some other Unicode character starting with 0xE2
672                            true
673                        }
674                    } else {
675                        // Not enough bytes for full LS/PS sequence
676                        true
677                    }
678                }
679            },
680            handle_eof: {
681                // Reached EOF – entire remainder is comment
682                let end = self.input().end_pos();
683
684                if self.comments_buffer().is_some() {
685                    let s = unsafe { self.input_slice(slice_start, end) };
686                    let cmt = swc_common::comments::Comment {
687                        kind: swc_common::comments::CommentKind::Line,
688                        span: Span::new_with_checked(start, end),
689                        text: self.atom(s),
690                    };
691
692                    if is_for_next {
693                        self.comments_buffer_mut().unwrap().push_pending(cmt);
694                    } else {
695                        let pos = self.state().prev_hi();
696                        self.comments_buffer_mut().unwrap().push_comment(BufferedComment {
697                            kind: BufferedCommentKind::Trailing,
698                            pos,
699                            comment: cmt,
700                        });
701                    }
702                }
703
704                return;
705            }
706        };
707
708        // Current position is at the line terminator
709        let end = self.cur_pos();
710
711        // Create and process slice only if comments need to be stored
712        if self.comments_buffer().is_some() {
713            let s = unsafe {
714                // Safety: We know that the start and the end are valid
715                self.input_slice_to_cur(slice_start)
716            };
717            let cmt = swc_common::comments::Comment {
718                kind: swc_common::comments::CommentKind::Line,
719                span: Span::new_with_checked(start, end),
720                text: self.atom(s),
721            };
722
723            if is_for_next {
724                self.comments_buffer_mut().unwrap().push_pending(cmt);
725            } else {
726                let pos = self.state().prev_hi();
727                self.comments_buffer_mut()
728                    .unwrap()
729                    .push_comment(BufferedComment {
730                        kind: BufferedCommentKind::Trailing,
731                        pos,
732                        comment: cmt,
733                    });
734            }
735        }
736
        // Leave the cursor at the line terminator so the caller's whitespace
        // skipping can see the line break.
737        unsafe {
738            // Safety: We got end from self.input
739            self.input_mut().reset_to(end);
740        }
741    }
742
743    /// Expects current char to be '/' and next char to be '*'.
744    fn skip_block_comment(&mut self) {
745        let start = self.cur_pos();
746
747        debug_assert_eq!(self.cur(), Some('/'));
748        debug_assert_eq!(self.peek(), Some('*'));
749
750        // Consume initial "/*"
751        self.input_mut().bump_bytes(2);
752
753        // jsdoc
754        let slice_start = self.cur_pos();
755
756        let had_line_break_before_last = self.had_line_break_before_last();
757        let mut should_mark_had_line_break = false;
758
759        loop {
760            let matched_byte = byte_search! {
761                lexer: self,
762                table: BLOCK_COMMENT_SCAN_TABLE,
763                continue_if: (matched_byte, pos_offset) {
764                    if matched_byte == LS_OR_PS_FIRST {
765                        // 0xE2 - could be LS/PS or some other Unicode character
766                        let current_slice = self.input().as_str();
767                        let byte_pos = pos_offset;
768                        if byte_pos + 2 < current_slice.len() {
769                            let bytes = current_slice.as_bytes();
770                            let next2 = [bytes[byte_pos + 1], bytes[byte_pos + 2]];
771                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
772                                // It's a real line terminator - don't continue
773                                false
774                            } else {
775                                // Some other Unicode character starting with 0xE2
776                                true
777                            }
778                        } else {
779                            // Not enough bytes for full LS/PS sequence
780                            true
781                        }
782                    } else {
783                        // '*', '\r', or '\n' - don't continue
784                        false
785                    }
786                },
787                handle_eof: {
788                    if should_mark_had_line_break {
789                        self.state_mut().mark_had_line_break();
790                    }
791                    let end_pos = self.input().end_pos();
792                    let span = Span::new_with_checked(end_pos, end_pos);
793                    self.emit_error_span(span, SyntaxError::UnterminatedBlockComment);
794                    return;
795                }
796            };
797
798            match matched_byte {
799                b'*' => {
800                    if self.peek() == Some('/') {
801                        // Consume "*/"
802                        self.input_mut().bump_bytes(2);
803
804                        if should_mark_had_line_break {
805                            self.state_mut().mark_had_line_break();
806                        }
807
808                        let end = self.cur_pos();
809
810                        // Decide trailing / leading
811                        let mut is_for_next =
812                            had_line_break_before_last || !self.state().can_have_trailing_comment();
813
814                        // If next char is ';' without newline, treat as trailing
815                        if !had_line_break_before_last && self.input().is_byte(b';') {
816                            is_for_next = false;
817                        }
818
819                        if self.comments_buffer().is_some() {
820                            let src = unsafe {
821                                // Safety: We got slice_start and end from self.input so those are
822                                // valid.
823                                self.input_mut().slice(slice_start, end)
824                            };
825                            let s = &src[..src.len() - 2];
826                            let cmt = Comment {
827                                kind: CommentKind::Block,
828                                span: Span::new_with_checked(start, end),
829                                text: self.atom(s),
830                            };
831
832                            if is_for_next {
833                                self.comments_buffer_mut().unwrap().push_pending(cmt);
834                            } else {
835                                let pos = self.state().prev_hi();
836                                self.comments_buffer_mut()
837                                    .unwrap()
838                                    .push_comment(BufferedComment {
839                                        kind: BufferedCommentKind::Trailing,
840                                        pos,
841                                        comment: cmt,
842                                    });
843                            }
844                        }
845
846                        return;
847                    } else {
848                        // Just a lone '*', consume it and continue.
849                        self.bump();
850                    }
851                }
852                b'\n' => {
853                    should_mark_had_line_break = true;
854                    self.bump();
855                }
856                b'\r' => {
857                    should_mark_had_line_break = true;
858                    self.bump();
859                    if self.peek() == Some('\n') {
860                        self.bump();
861                    }
862                }
863                _ => {
864                    // Unicode line terminator (LS/PS) or other character
865                    if let Some('\u{2028}' | '\u{2029}') = self.cur() {
866                        should_mark_had_line_break = true;
867                    }
868                    self.bump();
869                }
870            }
871        }
872    }
873
874    /// Skip comments or whitespaces.
875    ///
876    /// See https://tc39.github.io/ecma262/#sec-white-space
877    #[inline(never)]
878    fn skip_space<const LEX_COMMENTS: bool>(&mut self) {
879        loop {
880            let (offset, newline) = {
881                let mut skip = self::whitespace::SkipWhitespace {
882                    input: self.input().as_str(),
883                    newline: false,
884                    offset: 0,
885                };
886
887                skip.scan();
888
889                (skip.offset, skip.newline)
890            };
891
892            self.input_mut().bump_bytes(offset as usize);
893            if newline {
894                self.state_mut().mark_had_line_break();
895            }
896
897            if LEX_COMMENTS && self.input().is_byte(b'/') {
898                if let Some(c) = self.peek() {
899                    if c == '/' {
900                        self.skip_line_comment(2);
901                        continue;
902                    } else if c == '*' {
903                        self.skip_block_comment();
904                        continue;
905                    }
906                }
907            }
908
909            break;
910        }
911    }
912
913    /// Ensure that ident cannot directly follow numbers.
914    fn ensure_not_ident(&mut self) -> LexResult<()> {
915        match self.cur() {
916            Some(c) if c.is_ident_start() => {
917                let span = pos_span(self.cur_pos());
918                self.error_span(span, SyntaxError::IdentAfterNum)?
919            }
920            _ => Ok(()),
921        }
922    }
923
924    fn make_legacy_octal(&mut self, start: BytePos, val: f64) -> LexResult<f64> {
925        self.ensure_not_ident()?;
926        if self.syntax().typescript() && self.target() >= EsVersion::Es5 {
927            self.emit_error(start, SyntaxError::TS1085);
928        }
929        self.emit_strict_mode_error(start, SyntaxError::LegacyOctal);
930        Ok(val)
931    }
932
933    /// `op`- |total, radix, value| -> (total * radix + value, continue)
934    fn read_digits<F, Ret, const RADIX: u8>(
935        &mut self,
936        mut op: F,
937        allow_num_separator: bool,
938        has_underscore: &mut bool,
939    ) -> LexResult<Ret>
940    where
941        F: FnMut(Ret, u8, u32) -> LexResult<(Ret, bool)>,
942        Ret: Copy + Default,
943    {
944        debug_assert!(
945            RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16,
946            "radix for read_int should be one of 2, 8, 10, 16, but got {RADIX}"
947        );
948
949        if cfg!(feature = "debug") {
950            tracing::trace!("read_digits(radix = {}), cur = {:?}", RADIX, self.cur());
951        }
952
953        let start = self.cur_pos();
954        let mut total: Ret = Default::default();
955        let mut prev = None;
956
957        while let Some(c) = self.cur() {
958            if c == '_' {
959                *has_underscore = true;
960                if allow_num_separator {
961                    let is_allowed = |c: Option<char>| {
962                        let Some(c) = c else {
963                            return false;
964                        };
965                        c.is_digit(RADIX as _)
966                    };
967                    let is_forbidden = |c: Option<char>| {
968                        let Some(c) = c else {
969                            return false;
970                        };
971
972                        if RADIX == 16 {
973                            matches!(c, '.' | 'X' | '_' | 'x')
974                        } else {
975                            matches!(c, '.' | 'B' | 'E' | 'O' | '_' | 'b' | 'e' | 'o')
976                        }
977                    };
978
979                    let next = self.input().peek();
980
981                    if !is_allowed(next) || is_forbidden(prev) || is_forbidden(next) {
982                        self.emit_error(
983                            start,
984                            SyntaxError::NumericSeparatorIsAllowedOnlyBetweenTwoDigits,
985                        );
986                    }
987
988                    // Ignore this _ character
989                    unsafe {
990                        // Safety: cur() returns Some(c) where c is a valid char
991                        self.input_mut().bump();
992                    }
993
994                    continue;
995                }
996            }
997
998            // e.g. (val for a) = 10  where radix = 16
999            let val = if let Some(val) = c.to_digit(RADIX as _) {
1000                val
1001            } else {
1002                return Ok(total);
1003            };
1004
1005            self.bump();
1006
1007            let (t, cont) = op(total, RADIX, val)?;
1008
1009            total = t;
1010
1011            if !cont {
1012                return Ok(total);
1013            }
1014
1015            prev = Some(c);
1016        }
1017
1018        Ok(total)
1019    }
1020
1021    /// This can read long integers like
1022    /// "13612536612375123612312312312312312312312".
1023    ///
1024    /// - Returned `bool` is `true` is there was `8` or `9`.
1025    fn read_number_no_dot_as_str<const RADIX: u8>(&mut self) -> LexResult<LazyInteger> {
1026        debug_assert!(
1027            RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16,
1028            "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {RADIX}"
1029        );
1030        let start = self.cur_pos();
1031
1032        let mut not_octal = false;
1033        let mut read_any = false;
1034        let mut has_underscore = false;
1035
1036        self.read_digits::<_, (), RADIX>(
1037            |_, _, v| {
1038                read_any = true;
1039
1040                if v == 8 || v == 9 {
1041                    not_octal = true;
1042                }
1043
1044                Ok(((), true))
1045            },
1046            true,
1047            &mut has_underscore,
1048        )?;
1049
1050        if !read_any {
1051            self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?;
1052        }
1053
1054        Ok(LazyInteger {
1055            start,
1056            end: self.cur_pos(),
1057            not_octal,
1058            has_underscore,
1059        })
1060    }
1061
    /// Reads an integer, octal integer, or floating-point number
    ///
    /// - `START_WITH_DOT`: the current char is the `.` of a literal such as
    ///   `.5`; there is no integer part.
    /// - `START_WITH_ZERO`: the current char is `0`, so the literal may turn
    ///   out to be a legacy octal (`017`) or legacy decimal (`089`) literal.
    ///
    /// Returns `Left((value, raw))` for a number and `Right((bigint, raw))`
    /// for a BigInt literal (`n` suffix), where `raw` is the source text of
    /// the literal.
    ///
    /// Returns an error if the exponent marker is the last character of the
    /// input, if a required digit is missing, or if an identifier start
    /// directly follows the literal.
    fn read_number<const START_WITH_DOT: bool, const START_WITH_ZERO: bool>(
        &mut self,
    ) -> LexResult<Either<(f64, Atom), (Box<BigIntValue>, Atom)>> {
        debug_assert!(!(START_WITH_DOT && START_WITH_ZERO));
        debug_assert!(self.cur().is_some());

        let start = self.cur_pos();
        let mut has_underscore = false;

        let lazy_integer = if START_WITH_DOT {
            // first char is '.'
            debug_assert!(
                self.cur().is_some_and(|c| c == '.'),
                "read_number<START_WITH_DOT = true> expects current char to be '.'"
            );
            // Empty integer part: `start == end`.
            LazyInteger {
                start,
                end: start,
                not_octal: true,
                has_underscore: false,
            }
        } else {
            debug_assert!(!START_WITH_DOT);
            debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == '0');

            // Use read_number_no_dot to support long numbers.
            let lazy_integer = self.read_number_no_dot_as_str::<10>()?;
            let s = unsafe {
                // Safety: We got both start and end position from `self.input`
                self.input_slice_to_cur(lazy_integer.start)
            };

            // legacy octal number is not allowed in bigint.
            // (With a leading zero, the `n` suffix is only consumed when the
            // integer part is exactly one byte long, i.e. `0n`.)
            if (!START_WITH_ZERO || lazy_integer.end - lazy_integer.start == BytePos(1))
                && self.eat(b'n')
            {
                let raw = unsafe {
                    // Safety: We got both start and end position from `self.input`
                    self.input_slice_to_cur(start)
                };
                let bigint_value = num_bigint::BigInt::parse_bytes(s.as_bytes(), 10).unwrap();
                return Ok(Either::Right((Box::new(bigint_value), self.atom(raw))));
            }

            if START_WITH_ZERO {
                // TODO: I guess it would be okay if I don't use -ffast-math
                // (or something like that), but needs review.
                if s.as_bytes().iter().all(|&c| c == b'0') {
                    // If only one zero is used, it's decimal.
                    // And if multiple zero is used, it's octal.
                    //
                    // e.g. `0` is decimal (so it can be part of float)
                    //
                    // e.g. `000` is octal
                    if start.0 != self.last_pos().0 - 1 {
                        let raw = unsafe {
                            // Safety: We got both start and end position from `self.input`
                            self.input_slice_to_cur(start)
                        };
                        let raw = self.atom(raw);
                        return self
                            .make_legacy_octal(start, 0f64)
                            .map(|value| Either::Left((value, raw)));
                    }
                } else if lazy_integer.not_octal {
                    // if it contains '8' or '9', it's decimal.
                    self.emit_strict_mode_error(start, SyntaxError::LegacyDecimal);
                } else {
                    // It's Legacy octal, and we should reinterpret value.
                    let s = remove_underscore(s, lazy_integer.has_underscore);
                    let val = parse_integer::<8>(&s);
                    let raw = unsafe {
                        // Safety: We got both start and end position from `self.input`
                        self.input_slice_to_cur(start)
                    };
                    let raw = self.atom(raw);
                    return self
                        .make_legacy_octal(start, val)
                        .map(|value| Either::Left((value, raw)));
                }
            }

            lazy_integer
        };

        has_underscore |= lazy_integer.has_underscore;
        // At this point, number cannot be an octal literal.

        let has_dot = self.cur() == Some('.');
        //  `0.a`, `08.a`, `102.a` are invalid.
        //
        // `.1.a`, `.1e-4.a` are valid,
        if has_dot {
            self.bump();

            // equal: if START_WITH_DOT { debug_assert!(xxxx) }
            debug_assert!(!START_WITH_DOT || self.cur().is_some_and(|cur| cur.is_ascii_digit()));

            // Read numbers after dot
            // (only consumed here; the value is parsed from the raw text below)
            self.read_digits::<_, (), 10>(|_, _, _| Ok(((), true)), true, &mut has_underscore)?;
        }

        let has_e = self.cur().is_some_and(|c| c == 'e' || c == 'E');
        // Handle 'e' and 'E'
        //
        // .5e1 = 5
        // 1e2 = 100
        // 1e+2 = 100
        // 1e-2 = 0.01
        if has_e {
            self.bump(); // `e`/`E`

            let next = match self.cur() {
                Some(next) => next,
                None => {
                    let pos = self.cur_pos();
                    self.error(pos, SyntaxError::NumLitTerminatedWithExp)?
                }
            };

            if next == '+' || next == '-' {
                self.bump(); // remove '+', '-'
            }

            // The exponent needs at least one digit; this errors otherwise.
            let lazy_integer = self.read_number_no_dot_as_str::<10>()?;
            has_underscore |= lazy_integer.has_underscore;
        }

        let val = if has_dot || has_e {
            // Fraction and/or exponent present: parse the whole literal text
            // (minus separators) as a float.
            let raw = unsafe {
                // Safety: We got both start and end position from `self.input`
                self.input_slice_to_cur(start)
            };

            let raw = remove_underscore(raw, has_underscore);
            raw.parse().expect("failed to parse float literal")
        } else {
            // Plain integer: evaluate just the integer part.
            let s = unsafe { self.input_slice(lazy_integer.start, lazy_integer.end) };
            let s = remove_underscore(s, has_underscore);
            parse_integer::<10>(&s)
        };

        self.ensure_not_ident()?;

        let raw_str = unsafe {
            // Safety: We got both start and end position from `self.input`
            self.input_slice_to_cur(start)
        };
        Ok(Either::Left((val, raw_str.into())))
    }
1213
1214    fn read_int_u32<const RADIX: u8>(&mut self, len: u8) -> LexResult<Option<u32>> {
1215        let start = self.state().start();
1216
1217        let mut count = 0;
1218        let v = self.read_digits::<_, Option<u32>, RADIX>(
1219            |opt: Option<u32>, radix, val| {
1220                count += 1;
1221
1222                let total = opt
1223                    .unwrap_or_default()
1224                    .checked_mul(radix as u32)
1225                    .and_then(|v| v.checked_add(val))
1226                    .ok_or_else(|| {
1227                        let span = Span::new_with_checked(start, start);
1228                        crate::error::Error::new(span, SyntaxError::InvalidUnicodeEscape)
1229                    })?;
1230
1231                Ok((Some(total), count != len))
1232            },
1233            true,
1234            &mut false,
1235        )?;
1236        if len != 0 && count != len {
1237            Ok(None)
1238        } else {
1239            Ok(v)
1240        }
1241    }
1242
1243    /// Returns `Left(value)` or `Right(BigInt)`
1244    fn read_radix_number<const RADIX: u8>(
1245        &mut self,
1246    ) -> LexResult<Either<(f64, Atom), (Box<BigIntValue>, Atom)>> {
1247        debug_assert!(
1248            RADIX == 2 || RADIX == 8 || RADIX == 16,
1249            "radix should be one of 2, 8, 16, but got {RADIX}"
1250        );
1251        let start = self.cur_pos();
1252
1253        debug_assert_eq!(self.cur(), Some('0'));
1254        self.bump();
1255
1256        debug_assert!(self
1257            .cur()
1258            .is_some_and(|c| matches!(c, 'b' | 'B' | 'o' | 'O' | 'x' | 'X')));
1259        self.bump();
1260
1261        let lazy_integer = self.read_number_no_dot_as_str::<RADIX>()?;
1262        let has_underscore = lazy_integer.has_underscore;
1263
1264        let s = unsafe {
1265            // Safety: We got both start and end position from `self.input`
1266            self.input_slice_to_cur(lazy_integer.start)
1267        };
1268        if self.eat(b'n') {
1269            let raw = unsafe {
1270                // Safety: We got both start and end position from `self.input`
1271                self.input_slice_to_cur(start)
1272            };
1273
1274            let bigint_value = num_bigint::BigInt::parse_bytes(s.as_bytes(), RADIX as _).unwrap();
1275            return Ok(Either::Right((Box::new(bigint_value), self.atom(raw))));
1276        }
1277        let s = remove_underscore(s, has_underscore);
1278        let val = parse_integer::<RADIX>(&s);
1279
1280        self.ensure_not_ident()?;
1281
1282        let raw = unsafe {
1283            // Safety: We got both start and end position from `self.input`
1284            self.input_slice_to_cur(start)
1285        };
1286
1287        Ok(Either::Left((val, self.atom(raw))))
1288    }
1289
1290    /// Consume pending comments.
1291    ///
1292    /// This is called when the input is exhausted.
1293    #[cold]
1294    #[inline(never)]
1295    fn consume_pending_comments(&mut self) {
1296        if let Some(comments) = self.comments() {
1297            let last = self.state().prev_hi();
1298            let start_pos = self.start_pos();
1299            let comments_buffer = self.comments_buffer_mut().unwrap();
1300
1301            // if the file had no tokens and no shebang, then treat any
1302            // comments in the leading comments buffer as leading.
1303            // Otherwise treat them as trailing.
1304            let kind = if last == start_pos {
1305                BufferedCommentKind::Leading
1306            } else {
1307                BufferedCommentKind::Trailing
1308            };
1309            // move the pending to the leading or trailing
1310            comments_buffer.pending_to_comment(kind, last);
1311
1312            // now fill the user's passed in comments
1313            for comment in comments_buffer.take_comments() {
1314                match comment.kind {
1315                    BufferedCommentKind::Leading => {
1316                        comments.add_leading(comment.pos, comment.comment);
1317                    }
1318                    BufferedCommentKind::Trailing => {
1319                        comments.add_trailing(comment.pos, comment.comment);
1320                    }
1321                }
1322            }
1323        }
1324    }
1325
1326    fn read_jsx_entity(&mut self) -> LexResult<(char, String)> {
1327        debug_assert!(self.syntax().jsx());
1328
1329        fn from_code(s: &str, radix: u32) -> LexResult<char> {
1330            // TODO(kdy1): unwrap -> Err
1331            let c = char::from_u32(
1332                u32::from_str_radix(s, radix).expect("failed to parse string as number"),
1333            )
1334            .expect("failed to parse number as char");
1335
1336            Ok(c)
1337        }
1338
1339        fn is_hex(s: &str) -> bool {
1340            s.chars().all(|c| c.is_ascii_hexdigit())
1341        }
1342
1343        fn is_dec(s: &str) -> bool {
1344            s.chars().all(|c| c.is_ascii_digit())
1345        }
1346
1347        let mut s = SmartString::<LazyCompact>::default();
1348
1349        debug_assert!(self.input().cur().is_some_and(|c| c == '&'));
1350        self.bump();
1351
1352        let start_pos = self.input().cur_pos();
1353
1354        for _ in 0..10 {
1355            let c = match self.input().cur() {
1356                Some(c) => c,
1357                None => break,
1358            };
1359            self.bump();
1360
1361            if c == ';' {
1362                if let Some(stripped) = s.strip_prefix('#') {
1363                    if stripped.starts_with('x') {
1364                        if is_hex(&s[2..]) {
1365                            let value = from_code(&s[2..], 16)?;
1366
1367                            return Ok((value, format!("&{s};")));
1368                        }
1369                    } else if is_dec(stripped) {
1370                        let value = from_code(stripped, 10)?;
1371
1372                        return Ok((value, format!("&{s};")));
1373                    }
1374                } else if let Some(entity) = xhtml(&s) {
1375                    return Ok((entity, format!("&{s};")));
1376                }
1377
1378                break;
1379            }
1380
1381            s.push(c)
1382        }
1383
1384        unsafe {
1385            // Safety: start_pos is a valid position because we got it from self.input
1386            self.input_mut().reset_to(start_pos);
1387        }
1388
1389        Ok(('&', "&".to_string()))
1390    }
1391
1392    fn read_jsx_new_line(&mut self, normalize_crlf: bool) -> LexResult<Either<&'static str, char>> {
1393        debug_assert!(self.syntax().jsx());
1394        let ch = self.input().cur().unwrap();
1395        self.bump();
1396
1397        let out = if ch == '\r' && self.input().cur() == Some('\n') {
1398            self.bump(); // `\n`
1399            Either::Left(if normalize_crlf { "\n" } else { "\r\n" })
1400        } else {
1401            Either::Right(ch)
1402        };
1403        Ok(out)
1404    }
1405
1406    fn read_jsx_str(&mut self, quote: char) -> LexResult<Token> {
1407        debug_assert!(self.syntax().jsx());
1408        let start = self.input().cur_pos();
1409        unsafe {
1410            // Safety: cur() was Some(quote)
1411            self.input_mut().bump(); // `quote`
1412        }
1413        let mut out = String::new();
1414        let mut chunk_start = self.input().cur_pos();
1415        loop {
1416            let ch = match self.input().cur() {
1417                Some(c) => c,
1418                None => {
1419                    self.emit_error(start, SyntaxError::UnterminatedStrLit);
1420                    break;
1421                }
1422            };
1423            let cur_pos = self.input().cur_pos();
1424            if ch == '\\' {
1425                let value = unsafe {
1426                    // Safety: We already checked for the range
1427                    self.input_slice_to_cur(chunk_start)
1428                };
1429
1430                out.push_str(value);
1431                out.push('\\');
1432
1433                self.bump();
1434
1435                chunk_start = self.input().cur_pos();
1436
1437                continue;
1438            }
1439
1440            if ch == quote {
1441                break;
1442            }
1443
1444            if ch == '&' {
1445                let value = unsafe {
1446                    // Safety: We already checked for the range
1447                    self.input_slice_to_cur(chunk_start)
1448                };
1449
1450                out.push_str(value);
1451
1452                let jsx_entity = self.read_jsx_entity()?;
1453
1454                out.push(jsx_entity.0);
1455
1456                chunk_start = self.input().cur_pos();
1457            } else if ch.is_line_terminator() {
1458                let value = unsafe {
1459                    // Safety: We already checked for the range
1460                    self.input_slice_to_cur(chunk_start)
1461                };
1462
1463                out.push_str(value);
1464
1465                match self.read_jsx_new_line(false)? {
1466                    Either::Left(s) => {
1467                        out.push_str(s);
1468                    }
1469                    Either::Right(c) => {
1470                        out.push(c);
1471                    }
1472                }
1473
1474                chunk_start = cur_pos + BytePos(ch.len_utf8() as _);
1475            } else {
1476                unsafe {
1477                    // Safety: cur() was Some(ch)
1478                    self.input_mut().bump();
1479                }
1480            }
1481        }
1482        let s = unsafe {
1483            // Safety: We already checked for the range
1484            self.input_slice_to_cur(chunk_start)
1485        };
1486        let value = if out.is_empty() {
1487            // Fast path: We don't need to allocate
1488            self.atom(s)
1489        } else {
1490            out.push_str(s);
1491            self.atom(out)
1492        };
1493
1494        // it might be at the end of the file when
1495        // the string literal is unterminated
1496        if self.input().peek_ahead().is_some() {
1497            self.bump();
1498        }
1499
1500        let raw = unsafe {
1501            // Safety: Both of `start` and `end` are generated from `cur_pos()`
1502            self.input_slice_to_cur(start)
1503        };
1504        let raw = self.atom(raw);
1505        Ok(Token::str(value.into(), raw, self))
1506    }
1507
1508    // Modified based on <https://github.com/oxc-project/oxc/blob/f0e1510b44efdb1b0d9a09f950181b0e4c435abe/crates/oxc_parser/src/lexer/unicode.rs#L237>
1509    /// Unicode code unit (`\uXXXX`).
1510    ///
1511    /// The opening `\u` must already have been consumed before calling this
1512    /// method.
1513    ///
1514    /// See background info on surrogate pairs:
1515    ///   * `https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae`
1516    ///   * `https://mathiasbynens.be/notes/javascript-identifiers-es6`
1517    fn read_unicode_code_unit(&mut self) -> LexResult<Option<UnicodeEscape>> {
1518        const MIN_HIGH: u32 = 0xd800;
1519        const MAX_HIGH: u32 = 0xdbff;
1520        const MIN_LOW: u32 = 0xdc00;
1521        const MAX_LOW: u32 = 0xdfff;
1522
1523        let Some(high) = self.read_int_u32::<16>(4)? else {
1524            return Ok(None);
1525        };
1526        if let Some(ch) = char::from_u32(high) {
1527            return Ok(Some(UnicodeEscape::CodePoint(ch)));
1528        }
1529
1530        // The first code unit of a surrogate pair is always in the range from 0xD800 to
1531        // 0xDBFF, and is called a high surrogate or a lead surrogate.
1532        // Note: `high` must be >= `MIN_HIGH`, otherwise `char::from_u32` would have
1533        // returned `Some`, and already exited.
1534        debug_assert!(high >= MIN_HIGH);
1535        let is_pair = high <= MAX_HIGH
1536            && self.input().cur() == Some('\\')
1537            && self.input().peek() == Some('u');
1538        if !is_pair {
1539            return Ok(Some(UnicodeEscape::LoneSurrogate(high)));
1540        }
1541
1542        let before_second = self.input().cur_pos();
1543
1544        // Bump `\u`
1545        self.input_mut().bump_bytes(2);
1546
1547        let Some(low) = self.read_int_u32::<16>(4)? else {
1548            return Ok(None);
1549        };
1550
1551        // The second code unit of a surrogate pair is always in the range from 0xDC00
1552        // to 0xDFFF, and is called a low surrogate or a trail surrogate.
1553        // If this isn't a valid pair, rewind to before the 2nd, and return the first
1554        // only. The 2nd could be the first part of a valid pair.
1555        if !(MIN_LOW..=MAX_LOW).contains(&low) {
1556            unsafe {
1557                // Safety: state is valid position because we got it from cur_pos()
1558                self.input_mut().reset_to(before_second);
1559            }
1560            return Ok(Some(UnicodeEscape::LoneSurrogate(high)));
1561        }
1562
1563        let code_point = pair_to_code_point(high, low);
1564        // SAFETY: `high` and `low` have been checked to be in ranges which always yield
1565        // a `code_point` which is a valid `char`
1566        let ch = unsafe { char::from_u32_unchecked(code_point) };
1567        Ok(Some(UnicodeEscape::SurrogatePair(ch)))
1568    }
1569
1570    fn read_unicode_escape(&mut self) -> LexResult<UnicodeEscape> {
1571        debug_assert_eq!(self.cur(), Some('u'));
1572
1573        let mut is_curly = false;
1574
1575        self.bump(); // 'u'
1576
1577        if self.eat(b'{') {
1578            is_curly = true;
1579        }
1580
1581        let state = self.input().cur_pos();
1582        let c = match self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) {
1583            Ok(Some(val)) => {
1584                if 0x0010_ffff >= val {
1585                    char::from_u32(val)
1586                } else {
1587                    let start = self.cur_pos();
1588
1589                    self.error(
1590                        start,
1591                        SyntaxError::BadCharacterEscapeSequence {
1592                            expected: if is_curly {
1593                                "1-6 hex characters in the range 0 to 10FFFF."
1594                            } else {
1595                                "4 hex characters"
1596                            },
1597                        },
1598                    )?
1599                }
1600            }
1601            _ => {
1602                let start = self.cur_pos();
1603
1604                self.error(
1605                    start,
1606                    SyntaxError::BadCharacterEscapeSequence {
1607                        expected: if is_curly {
1608                            "1-6 hex characters"
1609                        } else {
1610                            "4 hex characters"
1611                        },
1612                    },
1613                )?
1614            }
1615        };
1616
1617        match c {
1618            Some(c) => {
1619                if is_curly && !self.eat(b'}') {
1620                    self.error(state, SyntaxError::InvalidUnicodeEscape)?
1621                }
1622
1623                Ok(UnicodeEscape::CodePoint(c))
1624            }
1625            _ => {
1626                unsafe {
1627                    // Safety: state is valid position because we got it from cur_pos()
1628                    self.input_mut().reset_to(state);
1629                }
1630
1631                let Some(value) = self.read_unicode_code_unit()? else {
1632                    self.error(
1633                        state,
1634                        SyntaxError::BadCharacterEscapeSequence {
1635                            expected: if is_curly {
1636                                "1-6 hex characters"
1637                            } else {
1638                                "4 hex characters"
1639                            },
1640                        },
1641                    )?
1642                };
1643
1644                if is_curly && !self.eat(b'}') {
1645                    self.error(state, SyntaxError::InvalidUnicodeEscape)?
1646                }
1647
1648                Ok(value)
1649            }
1650        }
1651    }
1652
1653    #[cold]
1654    fn read_shebang(&mut self) -> LexResult<Option<Atom>> {
1655        if self.input().cur() != Some('#') || self.input().peek() != Some('!') {
1656            return Ok(None);
1657        }
1658        self.bump(); // `#`
1659        self.bump(); // `!`
1660        let s = self.input_uncons_while(|c| !c.is_line_terminator());
1661        Ok(Some(self.atom(s)))
1662    }
1663
    /// Read an escaped character for string literal.
    ///
    /// In template literal, we should preserve raw string.
    ///
    /// Expects the current char to be the backslash that starts the escape.
    ///
    /// Returns `Ok(None)` for a line continuation (a backslash followed by a
    /// line terminator), which contributes no character. Otherwise returns
    /// the code point the escape stands for.
    ///
    /// When `in_template` is `true`, a legacy octal escape is an error;
    /// otherwise it is only reported as a strict-mode error.
    fn read_escaped_char(&mut self, in_template: bool) -> LexResult<Option<CodePoint>> {
        debug_assert_eq!(self.cur(), Some('\\'));

        let start = self.cur_pos();

        self.bump(); // '\'

        let c = match self.cur() {
            Some(c) => c,
            None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?,
        };

        // Arms that evaluate to a char leave the escape character itself
        // unconsumed; it is bumped after the match. Every other arm consumes
        // its input and returns directly.
        let c = match c {
            '\\' => '\\',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'b' => '\u{0008}',
            'v' => '\u{000b}',
            'f' => '\u{000c}',
            '\r' => {
                self.bump(); // remove '\r'

                // `\r\n` counts as a single line terminator.
                self.eat(b'\n');

                return Ok(None);
            }
            '\n' | '\u{2028}' | '\u{2029}' => {
                self.bump();

                return Ok(None);
            }

            // read hexadecimal escape sequences
            'x' => {
                self.bump(); // 'x'

                match self.read_int_u32::<16>(2)? {
                    Some(val) => return Ok(CodePoint::from_u32(val)),
                    None => self.error(
                        start,
                        SyntaxError::BadCharacterEscapeSequence {
                            expected: "2 hex characters",
                        },
                    )?,
                }
            }

            // read unicode escape sequences
            'u' => match self.read_unicode_escape() {
                Ok(value) => {
                    return Ok(Some(value.into()));
                }
                // Re-report the error at the start of the whole escape.
                Err(err) => self.error(start, err.into_kind())?,
            },

            // octal escape sequences
            '0'..='7' => {
                self.bump();

                let first_c = if c == '0' {
                    match self.cur() {
                        Some(next) if next.is_digit(8) => c,
                        // \0 is not an octal literal nor decimal literal.
                        _ => return Ok(Some(CodePoint::from_char('\u{0000}'))),
                    }
                } else {
                    c
                };

                // TODO: Show template instead of strict mode
                if in_template {
                    self.error(start, SyntaxError::LegacyOctal)?
                }

                self.emit_strict_mode_error(start, SyntaxError::LegacyOctal);

                let mut value: u8 = first_c.to_digit(8).unwrap() as u8;

                // Consumes one more octal digit into `value`, or returns the
                // value accumulated so far if the next char is not an octal
                // digit. With `$check == true` the digit is only consumed if
                // the result still fits in a `u8`.
                macro_rules! one {
                    ($check:expr) => {{
                        let cur = self.cur();

                        match cur.and_then(|c| c.to_digit(8)) {
                            Some(v) => {
                                value = if $check {
                                    let new_val = value
                                        .checked_mul(8)
                                        .and_then(|value| value.checked_add(v as u8));
                                    match new_val {
                                        Some(val) => val,
                                        None => return Ok(CodePoint::from_u32(value as u32)),
                                    }
                                } else {
                                    value * 8 + v as u8
                                };

                                self.bump();
                            }
                            _ => return Ok(CodePoint::from_u32(value as u32)),
                        }
                    }};
                }

                // Second digit: two octal digits always fit in a `u8`.
                one!(false);
                // Third digit: may overflow a `u8`, hence the checked variant.
                one!(true);

                return Ok(CodePoint::from_u32(value as u32));
            }
            // Any other character stands for itself.
            _ => c,
        };

        unsafe {
            // Safety: cur() is Some(c) if this method is called.
            self.input_mut().bump();
        }

        Ok(CodePoint::from_u32(c as u32))
    }
1786
1787    /// Expects current char to be '/'
1788    fn read_regexp(&mut self, start: BytePos) -> LexResult<Token> {
1789        unsafe {
1790            // Safety: start is valid position, and cur() is Some('/')
1791            self.input_mut().reset_to(start);
1792        }
1793
1794        debug_assert_eq!(self.cur(), Some('/'));
1795
1796        let start = self.cur_pos();
1797
1798        self.bump(); // bump '/'
1799
1800        let slice_start = self.cur_pos();
1801
1802        let (mut escaped, mut in_class) = (false, false);
1803
1804        while let Some(c) = self.cur() {
1805            // This is ported from babel.
1806            // Seems like regexp literal cannot contain linebreak.
1807            if c.is_line_terminator() {
1808                let span = self.span(start);
1809
1810                return Err(crate::error::Error::new(
1811                    span,
1812                    SyntaxError::UnterminatedRegExp,
1813                ));
1814            }
1815
1816            if escaped {
1817                escaped = false;
1818            } else {
1819                match c {
1820                    '[' => in_class = true,
1821                    ']' if in_class => in_class = false,
1822                    // Terminates content part of regex literal
1823                    '/' if !in_class => break,
1824                    _ => {}
1825                }
1826
1827                escaped = c == '\\';
1828            }
1829
1830            self.bump();
1831        }
1832
1833        let content = {
1834            let s = unsafe { self.input_slice_to_cur(slice_start) };
1835            self.atom(s)
1836        };
1837
1838        // input is terminated without following `/`
1839        if !self.is(b'/') {
1840            let span = self.span(start);
1841
1842            return Err(crate::error::Error::new(
1843                span,
1844                SyntaxError::UnterminatedRegExp,
1845            ));
1846        }
1847
1848        self.bump(); // '/'
1849
1850        // Spec says "It is a Syntax Error if IdentifierPart contains a Unicode escape
1851        // sequence." TODO: check for escape
1852
1853        // Need to use `read_word` because '\uXXXX' sequences are allowed
1854        // here (don't ask).
1855        // let flags_start = self.cur_pos();
1856        let flags = {
1857            match self.cur() {
1858                Some(c) if c.is_ident_start() => self
1859                    .read_word_as_str_with()
1860                    .map(|(s, _)| Some(self.atom(s))),
1861                _ => Ok(None),
1862            }
1863        }?
1864        .unwrap_or_default();
1865
1866        Ok(Token::regexp(content, flags, self))
1867    }
1868
1869    /// This method is optimized for texts without escape sequences.
1870    fn read_word_as_str_with(&mut self) -> LexResult<(Cow<'a, str>, bool)> {
1871        debug_assert!(self.cur().is_some());
1872        let slice_start = self.cur_pos();
1873
1874        // Fast path: try to scan ASCII identifier using byte_search
1875        if let Some(c) = self.input().cur_as_ascii() {
1876            if Ident::is_valid_ascii_start(c) {
1877                // Advance past first byte
1878                self.bump();
1879
1880                // Use byte_search to quickly scan to end of ASCII identifier
1881                let next_byte = byte_search! {
1882                    lexer: self,
1883                    table: NOT_ASCII_ID_CONTINUE_TABLE,
1884                    handle_eof: {
1885                        // Reached EOF, entire remainder is identifier
1886                        let s = unsafe {
1887                            // Safety: slice_start and end are valid position because we got them from
1888                            // `self.input`
1889                            self.input_slice_to_cur(slice_start)
1890                        };
1891
1892                        return Ok((Cow::Borrowed(s), false));
1893                    },
1894                };
1895
1896                // Check if we hit end of identifier or need to fall back to slow path
1897                if !next_byte.is_ascii() {
1898                    // Hit Unicode character, fall back to slow path from current position
1899                    return self.read_word_as_str_with_slow_path(slice_start);
1900                } else if next_byte == b'\\' {
1901                    // Hit escape sequence, fall back to slow path from current position
1902                    return self.read_word_as_str_with_slow_path(slice_start);
1903                } else {
1904                    // Hit end of identifier (non-continue ASCII char)
1905                    let s = unsafe {
1906                        // Safety: slice_start and end are valid position because we got them from
1907                        // `self.input`
1908                        self.input_slice_to_cur(slice_start)
1909                    };
1910
1911                    return Ok((Cow::Borrowed(s), false));
1912                }
1913            }
1914        }
1915
1916        // Fall back to slow path for non-ASCII start or complex cases
1917        self.read_word_as_str_with_slow_path(slice_start)
1918    }
1919
1920    /// Slow path for identifier parsing that handles Unicode and escapes
1921    #[cold]
1922    fn read_word_as_str_with_slow_path(
1923        &mut self,
1924        mut slice_start: BytePos,
1925    ) -> LexResult<(Cow<'a, str>, bool)> {
1926        let mut first = true;
1927        let mut has_escape = false;
1928
1929        let mut buf = String::with_capacity(16);
1930        loop {
1931            if let Some(c) = self.input().cur_as_ascii() {
1932                if Ident::is_valid_ascii_continue(c) {
1933                    self.bump();
1934                    continue;
1935                } else if first && Ident::is_valid_ascii_start(c) {
1936                    self.bump();
1937                    first = false;
1938                    continue;
1939                }
1940
1941                // unicode escape
1942                if c == b'\\' {
1943                    first = false;
1944                    has_escape = true;
1945                    let start = self.cur_pos();
1946                    self.bump();
1947
1948                    if !self.is(b'u') {
1949                        self.error_span(pos_span(start), SyntaxError::ExpectedUnicodeEscape)?
1950                    }
1951
1952                    {
1953                        let end = self.input().cur_pos();
1954                        let s = unsafe {
1955                            // Safety: start and end are valid position because we got them from
1956                            // `self.input`
1957                            self.input_slice(slice_start, start)
1958                        };
1959                        buf.push_str(s);
1960                        unsafe {
1961                            // Safety: We got end from `self.input`
1962                            self.input_mut().reset_to(end);
1963                        }
1964                    }
1965
1966                    let value = self.read_unicode_escape()?;
1967
1968                    match value {
1969                        UnicodeEscape::CodePoint(ch) => {
1970                            let valid = if first {
1971                                ch.is_ident_start()
1972                            } else {
1973                                ch.is_ident_part()
1974                            };
1975                            if !valid {
1976                                self.emit_error(start, SyntaxError::InvalidIdentChar);
1977                            }
1978                            buf.push(ch);
1979                        }
1980                        UnicodeEscape::SurrogatePair(ch) => {
1981                            buf.push(ch);
1982                            self.emit_error(start, SyntaxError::InvalidIdentChar);
1983                        }
1984                        UnicodeEscape::LoneSurrogate(code_point) => {
1985                            buf.push_str(format!("\\u{code_point:04X}").as_str());
1986                            self.emit_error(start, SyntaxError::InvalidIdentChar);
1987                        }
1988                    };
1989
1990                    slice_start = self.cur_pos();
1991                    continue;
1992                }
1993
1994                // ASCII but not a valid identifier
1995                break;
1996            } else if let Some(c) = self.input().cur() {
1997                if Ident::is_valid_non_ascii_continue(c) {
1998                    self.bump();
1999                    continue;
2000                } else if first && Ident::is_valid_non_ascii_start(c) {
2001                    self.bump();
2002                    first = false;
2003                    continue;
2004                }
2005            }
2006
2007            break;
2008        }
2009
2010        let end = self.cur_pos();
2011        let s = unsafe {
2012            // Safety: slice_start and end are valid position because we got them from
2013            // `self.input`
2014            self.input_slice(slice_start, end)
2015        };
2016        let value = if !has_escape {
2017            // Fast path: raw slice is enough if there's no escape.
2018            Cow::Borrowed(s)
2019        } else {
2020            buf.push_str(s);
2021            Cow::Owned(buf)
2022        };
2023
2024        Ok((value, has_escape))
2025    }
2026
2027    /// `#`
2028    fn read_token_number_sign(&mut self) -> LexResult<Token> {
2029        debug_assert!(self.cur().is_some_and(|c| c == '#'));
2030
2031        self.bump(); // '#'
2032
2033        // `#` can also be a part of shebangs, however they should have been
2034        // handled by `read_shebang()`
2035        debug_assert!(
2036            !self.input().is_at_start() || self.cur() != Some('!'),
2037            "#! should have already been handled by read_shebang()"
2038        );
2039        Ok(Token::Hash)
2040    }
2041
2042    /// Read a token given `.`.
2043    ///
2044    /// This is extracted as a method to reduce size of `read_token`.
2045    fn read_token_dot(&mut self) -> LexResult<Token> {
2046        debug_assert!(self.cur().is_some_and(|c| c == '.'));
2047        // Check for eof
2048        let next = match self.input().peek() {
2049            Some(next) => next,
2050            None => {
2051                self.bump(); // '.'
2052                return Ok(Token::Dot);
2053            }
2054        };
2055        if next.is_ascii_digit() {
2056            return self.read_number::<true, false>().map(|v| match v {
2057                Left((value, raw)) => Token::num(value, raw, self),
2058                Right(_) => unreachable!("read_number should not return bigint for leading dot"),
2059            });
2060        }
2061
2062        self.bump(); // 1st `.`
2063
2064        if next == '.' && self.input().peek() == Some('.') {
2065            self.bump(); // 2nd `.`
2066            self.bump(); // 3rd `.`
2067
2068            return Ok(Token::DotDotDot);
2069        }
2070
2071        Ok(Token::Dot)
2072    }
2073
2074    /// Read a token given `?`.
2075    ///
2076    /// This is extracted as a method to reduce size of `read_token`.
2077    fn read_token_question_mark(&mut self) -> LexResult<Token> {
2078        debug_assert!(self.cur().is_some_and(|c| c == '?'));
2079        self.bump();
2080        if self.input_mut().eat_byte(b'?') {
2081            if self.input_mut().eat_byte(b'=') {
2082                Ok(Token::NullishEq)
2083            } else {
2084                Ok(Token::NullishCoalescing)
2085            }
2086        } else {
2087            Ok(Token::QuestionMark)
2088        }
2089    }
2090
2091    /// Read a token given `:`.
2092    ///
2093    /// This is extracted as a method to reduce size of `read_token`.
2094    fn read_token_colon(&mut self) -> LexResult<Token> {
2095        debug_assert!(self.cur().is_some_and(|c| c == ':'));
2096        self.bump(); // ':'
2097        Ok(Token::Colon)
2098    }
2099
2100    /// Read a token given `0`.
2101    ///
2102    /// This is extracted as a method to reduce size of `read_token`.
2103    fn read_token_zero(&mut self) -> LexResult<Token> {
2104        debug_assert_eq!(self.cur(), Some('0'));
2105        let next = self.input().peek();
2106
2107        let bigint = match next {
2108            Some('x') | Some('X') => self.read_radix_number::<16>(),
2109            Some('o') | Some('O') => self.read_radix_number::<8>(),
2110            Some('b') | Some('B') => self.read_radix_number::<2>(),
2111            _ => {
2112                return self.read_number::<false, true>().map(|v| match v {
2113                    Left((value, raw)) => Token::num(value, raw, self),
2114                    Right((value, raw)) => Token::bigint(value, raw, self),
2115                });
2116            }
2117        };
2118
2119        bigint.map(|v| match v {
2120            Left((value, raw)) => Token::num(value, raw, self),
2121            Right((value, raw)) => Token::bigint(value, raw, self),
2122        })
2123    }
2124
2125    /// Read a token given `|` or `&`.
2126    ///
2127    /// This is extracted as a method to reduce size of `read_token`.
2128    fn read_token_logical<const C: u8>(&mut self) -> LexResult<Token> {
2129        debug_assert!(C == b'|' || C == b'&');
2130        let is_bit_and = C == b'&';
2131        let had_line_break_before_last = self.had_line_break_before_last();
2132        let start = self.cur_pos();
2133
2134        unsafe {
2135            // Safety: cur() is Some(c as char)
2136            self.input_mut().bump();
2137        }
2138        let token = if is_bit_and {
2139            Token::Ampersand
2140        } else {
2141            Token::Pipe
2142        };
2143
2144        // '|=', '&='
2145        if self.input_mut().eat_byte(b'=') {
2146            return Ok(if is_bit_and {
2147                Token::BitAndEq
2148            } else {
2149                debug_assert!(token == Token::Pipe);
2150                Token::BitOrEq
2151            });
2152        }
2153
2154        // '||', '&&'
2155        if self.input().cur() == Some(C as char) {
2156            unsafe {
2157                // Safety: cur() is Some(c)
2158                self.input_mut().bump();
2159            }
2160
2161            if self.input().cur() == Some('=') {
2162                unsafe {
2163                    // Safety: cur() is Some('=')
2164                    self.input_mut().bump();
2165                }
2166
2167                return Ok(if is_bit_and {
2168                    Token::LogicalAndEq
2169                } else {
2170                    debug_assert!(token == Token::Pipe);
2171                    Token::LogicalOrEq
2172                });
2173            }
2174
2175            // |||||||
2176            //   ^
2177            if had_line_break_before_last && !is_bit_and && self.is_str("||||| ") {
2178                let span = fixed_len_span(start, 7);
2179                self.emit_error_span(span, SyntaxError::TS1185);
2180                self.skip_line_comment(5);
2181                self.skip_space::<true>();
2182                return self.error_span(span, SyntaxError::TS1185);
2183            }
2184
2185            return Ok(if is_bit_and {
2186                Token::LogicalAnd
2187            } else {
2188                debug_assert!(token == Token::Pipe);
2189                Token::LogicalOr
2190            });
2191        }
2192
2193        Ok(token)
2194    }
2195
2196    /// Read a token given `*` or `%`.
2197    ///
2198    /// This is extracted as a method to reduce size of `read_token`.
2199    fn read_token_mul_mod<const IS_MUL: bool>(&mut self) -> LexResult<Token> {
2200        debug_assert!(self.cur().is_some_and(|c| c == '*' || c == '%'));
2201        self.bump();
2202        let token = if IS_MUL {
2203            if self.input_mut().eat_byte(b'*') {
2204                // `**`
2205                Token::Exp
2206            } else {
2207                Token::Asterisk
2208            }
2209        } else {
2210            Token::Percent
2211        };
2212
2213        Ok(if self.input_mut().eat_byte(b'=') {
2214            if token == Token::Asterisk {
2215                Token::MulEq
2216            } else if token == Token::Percent {
2217                Token::ModEq
2218            } else {
2219                debug_assert!(token == Token::Exp);
2220                Token::ExpEq
2221            }
2222        } else {
2223            token
2224        })
2225    }
2226
2227    fn read_slash(&mut self) -> LexResult<Token> {
2228        debug_assert_eq!(self.cur(), Some('/'));
2229        self.bump(); // '/'
2230        Ok(if self.eat(b'=') {
2231            Token::DivEq
2232        } else {
2233            Token::Slash
2234        })
2235    }
2236
2237    /// This can be used if there's no keyword starting with the first
2238    /// character.
2239    fn read_ident_unknown(&mut self) -> LexResult<Token> {
2240        debug_assert!(self.cur().is_some());
2241
2242        let (s, has_escape) = self.read_word_as_str_with()?;
2243        let atom = self.atom(s);
2244        let word = Token::unknown_ident(atom, self);
2245
2246        if has_escape {
2247            self.update_token_flags(|flags| *flags |= TokenFlags::UNICODE);
2248        }
2249
2250        Ok(word)
2251    }
2252
2253    /// See https://tc39.github.io/ecma262/#sec-literals-string-literals
2254    // TODO: merge `read_str_lit` and `read_jsx_str`
2255    fn read_str_lit(&mut self) -> LexResult<Token> {
2256        debug_assert!(self.cur() == Some('\'') || self.cur() == Some('"'));
2257        let start = self.cur_pos();
2258        let quote = self.cur().unwrap() as u8;
2259
2260        self.bump(); // '"' or '\''
2261
2262        let mut slice_start = self.input().cur_pos();
2263
2264        let mut buf: Option<Wtf8Buf> = None;
2265
2266        loop {
2267            let table = if quote == b'"' {
2268                &DOUBLE_QUOTE_STRING_END_TABLE
2269            } else {
2270                &SINGLE_QUOTE_STRING_END_TABLE
2271            };
2272
2273            let fast_path_result = byte_search! {
2274                lexer: self,
2275                table: table,
2276                handle_eof: {
2277                    let value_end = self.cur_pos();
2278                    let s = unsafe {
2279                            // Safety: slice_start and value_end are valid position because we
2280                            // got them from `self.input`
2281                        self.input_slice(slice_start, value_end)
2282                    };
2283
2284                    self.emit_error(start, SyntaxError::UnterminatedStrLit);
2285
2286                    let end = self.cur_pos();
2287                    let raw = unsafe { self.input_slice(start, end) };
2288                    return Ok(Token::str(self.wtf8_atom(Wtf8::from_str(s)), self.atom(raw), self));
2289                },
2290            };
2291            // dbg!(char::from_u32(fast_path_result as u32));
2292
2293            match fast_path_result {
2294                b'"' | b'\'' if fast_path_result == quote => {
2295                    let value_end = self.cur_pos();
2296
2297                    let value = if let Some(buf) = buf.as_mut() {
2298                        // `buf` only exist when there has escape.
2299                        debug_assert!(unsafe { self.input_slice(start, value_end).contains('\\') });
2300                        let s = unsafe {
2301                            // Safety: slice_start and value_end are valid position because we
2302                            // got them from `self.input`
2303                            self.input_slice(slice_start, value_end)
2304                        };
2305                        buf.push_str(s);
2306                        self.wtf8_atom(&**buf)
2307                    } else {
2308                        let s = unsafe { self.input_slice(slice_start, value_end) };
2309                        self.wtf8_atom(Wtf8::from_str(s))
2310                    };
2311
2312                    unsafe {
2313                        // Safety: cur is quote
2314                        self.input_mut().bump();
2315                    }
2316
2317                    let end = self.cur_pos();
2318                    let raw = unsafe {
2319                        // Safety: start and end are valid position because we got them from
2320                        // `self.input`
2321                        self.input_slice(start, end)
2322                    };
2323                    let raw = self.atom(raw);
2324                    return Ok(Token::str(value, raw, self));
2325                }
2326                b'\\' => {
2327                    let end = self.cur_pos();
2328                    let s = unsafe {
2329                        // Safety: start and end are valid position because we got them from
2330                        // `self.input`
2331                        self.input_slice(slice_start, end)
2332                    };
2333
2334                    if buf.is_none() {
2335                        buf = Some(Wtf8Buf::from_str(s));
2336                    } else {
2337                        buf.as_mut().unwrap().push_str(s);
2338                    }
2339
2340                    if let Some(escaped) = self.read_escaped_char(false)? {
2341                        buf.as_mut().unwrap().push(escaped);
2342                    }
2343
2344                    slice_start = self.cur_pos();
2345                    continue;
2346                }
2347                b'\n' | b'\r' => {
2348                    let end = self.cur_pos();
2349                    let s = unsafe {
2350                        // Safety: start and end are valid position because we got them from
2351                        // `self.input`
2352                        self.input_slice(slice_start, end)
2353                    };
2354
2355                    self.emit_error(start, SyntaxError::UnterminatedStrLit);
2356
2357                    let end = self.cur_pos();
2358
2359                    let raw = unsafe {
2360                        // Safety: start and end are valid position because we got them from
2361                        // `self.input`
2362                        self.input_slice(start, end)
2363                    };
2364                    return Ok(Token::str(
2365                        self.wtf8_atom(Wtf8::from_str(s)),
2366                        self.atom(raw),
2367                        self,
2368                    ));
2369                }
2370                _ => self.bump(),
2371            }
2372        }
2373    }
2374
2375    fn read_keyword_with(&mut self, convert: &dyn Fn(&str) -> Option<Token>) -> LexResult<Token> {
2376        debug_assert!(self.cur().is_some());
2377
2378        let start = self.cur_pos();
2379        let (s, has_escape) = self.read_keyword_as_str_with()?;
2380        if let Some(word) = convert(s.as_ref()) {
2381            // Note: ctx is store in lexer because of this error.
2382            // 'await' and 'yield' may have semantic of reserved word, which means lexer
2383            // should know context or parser should handle this error. Our approach to this
2384            // problem is former one.
2385            if has_escape && word.is_reserved(self.ctx()) {
2386                self.error(
2387                    start,
2388                    SyntaxError::EscapeInReservedWord { word: Atom::new(s) },
2389                )
2390            } else {
2391                Ok(word)
2392            }
2393        } else {
2394            let atom = self.atom(s);
2395            Ok(Token::unknown_ident(atom, self))
2396        }
2397    }
2398
2399    /// This is a performant version of [Lexer::read_word_as_str_with] for
2400    /// reading keywords. We should make sure the first byte is a valid
2401    /// ASCII.
2402    fn read_keyword_as_str_with(&mut self) -> LexResult<(Cow<'a, str>, bool)> {
2403        let slice_start = self.cur_pos();
2404
2405        // Fast path: try to scan ASCII identifier using byte_search
2406        // Performance optimization: check if first char disqualifies as keyword
2407        // Advance past first byte
2408        self.bump();
2409
2410        // Use byte_search to quickly scan to end of ASCII identifier
2411        let next_byte = byte_search! {
2412            lexer: self,
2413            table: NOT_ASCII_ID_CONTINUE_TABLE,
2414            handle_eof: {
2415                // Reached EOF, entire remainder is identifier
2416                let s = unsafe {
2417                    // Safety: slice_start and end are valid position because we got them from
2418                    // `self.input`
2419                    self.input_slice_to_cur(slice_start)
2420                };
2421
2422                return Ok((Cow::Borrowed(s), false));
2423            },
2424        };
2425
2426        // Check if we hit end of identifier or need to fall back to slow path
2427        if !next_byte.is_ascii() || next_byte == b'\\' {
2428            // Hit Unicode character or escape sequence, fall back to slow path from current
2429            // position
2430            self.read_word_as_str_with_slow_path(slice_start)
2431        } else {
2432            // Hit end of identifier (non-continue ASCII char)
2433            let s = unsafe {
2434                // Safety: slice_start and end are valid position because we got them from
2435                // `self.input`
2436                self.input_slice_to_cur(slice_start)
2437            };
2438
2439            Ok((Cow::Borrowed(s), false))
2440        }
2441    }
2442}
2443
2444fn pos_span(p: BytePos) -> Span {
2445    Span::new_with_checked(p, p)
2446}
2447
2448fn fixed_len_span(p: BytePos, len: u32) -> Span {
2449    Span::new_with_checked(p, p + BytePos(len))
2450}
swc_ecma_parser/lexer/mod.rs

swc_ecma_parser/lexer/
mod.rs