swc_ecma_lexer/common/lexer/
mod.rs

1use std::borrow::Cow;
2
3use char::{Char, CharExt};
4use either::Either::{self, Left, Right};
5use num_bigint::BigInt as BigIntValue;
6use smartstring::{LazyCompact, SmartString};
7use state::State;
8use swc_atoms::Atom;
9use swc_common::{
10    comments::{Comment, CommentKind},
11    input::{Input, StringInput},
12    BytePos, Span,
13};
14use swc_ecma_ast::{EsVersion, Ident};
15
16use self::jsx::xhtml;
17use super::{context::Context, input::Tokens};
18use crate::{
19    common::lexer::{
20        comments_buffer::{BufferedComment, BufferedCommentKind, CommentsBufferTrait},
21        number::{parse_integer, LazyInteger},
22    },
23    error::SyntaxError,
24    lexer::TokenFlags,
25};
26
27pub mod char;
28pub mod comments_buffer;
29mod jsx;
30pub mod number;
31mod search;
32pub mod state;
33pub mod token;
34pub mod whitespace;
35
36use token::TokenFactory;
37
38// Byte-search utilities
39use self::search::SafeByteMatchTable;
40use crate::{byte_search, safe_byte_match_table};
41
// ===== Byte match tables =====
//
// Each table is built by `safe_byte_match_table!` from a predicate over a
// single byte; it is then handed to `byte_search!` to find the next byte of
// interest.
//
// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS).
// In UTF-8 these are the three-byte sequences `E2 80 A8` and `E2 80 A9`, so a
// table can only flag the shared first byte; the two trailing bytes have to be
// compared separately by the code using the table.

/// First UTF-8 byte shared by LS (U+2028) and PS (U+2029).
const LS_OR_PS_FIRST: u8 = 0xe2;
/// Second and third UTF-8 bytes of LS (U+2028).
const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xa8];
/// Second and third UTF-8 bytes of PS (U+2029).
const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xa9];

/// Matches `\n`, `\r` and the possible first byte of LS/PS.
static LINE_BREAK_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'\n' | b'\r' | LS_OR_PS_FIRST));

/// Matches `*` plus every byte matched by [`LINE_BREAK_TABLE`].
static BLOCK_COMMENT_SCAN_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| { matches!(b, b'*' | b'\n' | b'\r' | LS_OR_PS_FIRST) });

/// Matches `"`, `\n`, `\` and `\r`.
static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'"' | b'\n' | b'\\' | b'\r'));
/// Matches `'`, `\n`, `\` and `\r`.
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'\'' | b'\n' | b'\\' | b'\r'));

/// Matches every byte that is NOT an ASCII letter, ASCII digit, `_` or `$`
/// (this includes all non-ASCII bytes).
static NOT_ASCII_ID_CONTINUE_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'));

/// Matches `$`, `` ` ``, `\` and `\r`.
static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\\' | b'\r'));
64
/// Result type used by the lexer routines; the error side is always
/// [`crate::error::Error`].
pub type LexResult<T> = Result<T, crate::error::Error>;
66
/// Returns `s` with every `_` removed.
///
/// `has_underscore` is the caller's knowledge of whether `s` contains a `_`:
/// when it is `false` the input is handed back borrowed without being
/// scanned, otherwise a new string is allocated. In debug builds the flag is
/// checked against the actual contents of `s`.
fn remove_underscore(s: &str, has_underscore: bool) -> Cow<'_, str> {
    if !has_underscore {
        debug_assert!(!s.contains('_'));
        return Cow::Borrowed(s);
    }

    debug_assert!(s.contains('_'));
    Cow::Owned(s.replace('_', ""))
}
76
77pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
    /// Per-lexer state; the default methods query it for things such as
    /// `had_line_break()` and `prev_hi()`.
    type State: self::state::State;
    /// Factory for the token type produced by this lexer.
    type Token: token::TokenFactory<'a, TokenAndSpan, Self, Lexer = Self>;
    /// Buffer that collects comments while lexing.
    type CommentsBuffer: CommentsBufferTrait;

    /// Shared access to the underlying source input.
    fn input(&self) -> &StringInput<'a>;
    /// Mutable access to the underlying source input (used to advance it).
    fn input_mut(&mut self) -> &mut StringInput<'a>;
    /// Shared access to the lexer state.
    fn state(&self) -> &Self::State;
    /// Mutable access to the lexer state.
    fn state_mut(&mut self) -> &mut Self::State;
    /// The comments sink, if any.
    fn comments(&self) -> Option<&'a dyn swc_common::comments::Comments>;
    /// The comments buffer, if any. The comment-skipping methods only build
    /// comment text when this returns `Some`.
    fn comments_buffer(&self) -> Option<&Self::CommentsBuffer>;
    /// Mutable counterpart of [`Self::comments_buffer`].
    fn comments_buffer_mut(&mut self) -> Option<&mut Self::CommentsBuffer>;
    /// Returns the source text between `start` and `end`.
    ///
    /// # Safety
    ///
    /// We know that the start and the end are valid
    unsafe fn input_slice(&mut self, start: BytePos, end: BytePos) -> &'a str;
    /// NOTE(review): presumably consumes characters while `f` returns `true`
    /// and returns the consumed text; not exercised in the visible part of
    /// this file, confirm against the implementation.
    fn input_uncons_while(&mut self, f: impl FnMut(char) -> bool) -> &'a str;
    /// Creates an [`Atom`] from `s`.
    fn atom<'b>(&self, s: impl Into<Cow<'b, str>>) -> swc_atoms::Atom;
    /// Records a (non-fatal) lexer error; called by [`Self::emit_error_span`].
    fn push_error(&mut self, error: crate::error::Error);
96
    /// Returns the state's `had_line_break()` flag.
    #[inline(always)]
    // The method name intentionally differs from the state getter it forwards
    // to, which is what `clippy::misnamed_getters` would complain about.
    #[allow(clippy::misnamed_getters)]
    fn had_line_break_before_last(&self) -> bool {
        self.state().had_line_break()
    }
102
103    #[inline(always)]
104    fn span(&self, start: BytePos) -> Span {
105        let end = self.last_pos();
106        if cfg!(debug_assertions) && start > end {
107            unreachable!(
108                "assertion failed: (span.start <= span.end).
109 start = {}, end = {}",
110                start.0, end.0
111            )
112        }
113        Span { lo: start, hi: end }
114    }
115
    /// Advances the input by calling its `unsafe` `bump`.
    ///
    /// NOTE(review): the precondition of the underlying `bump` is not checked
    /// here. The callers visible in this file only invoke it after having
    /// observed a current character; confirm that holds for all callers.
    #[inline(always)]
    fn bump(&mut self) {
        unsafe {
            // Safety: Actually this is not safe but this is an internal method.
            self.input_mut().bump()
        }
    }
123
    /// Returns the result of the input's `is_byte(c)` check, without
    /// consuming anything.
    #[inline(always)]
    fn is(&self, c: u8) -> bool {
        self.input().is_byte(c)
    }
128
    /// Returns the result of the input's `is_str(s)` check, without consuming
    /// anything.
    #[inline(always)]
    fn is_str(&self, s: &str) -> bool {
        self.input().is_str(s)
    }
133
    /// Forwards to the input's `eat_byte(c)` and returns its result.
    ///
    /// Callers in this file treat `true` as "the byte `c` was at the current
    /// position and has been consumed".
    #[inline(always)]
    fn eat(&mut self, c: u8) -> bool {
        self.input_mut().eat_byte(c)
    }
138
    /// Returns the character at the current input position, or `None` when
    /// the input has none (the digit-reading loops stop on `None`).
    #[inline(always)]
    fn cur(&self) -> Option<char> {
        self.input().cur()
    }
143
    /// Returns the character following the current one, if any.
    ///
    /// This is one position past [`Self::cur`]: e.g. with the input at `*`,
    /// `peek()` is what tells `*/` apart from a lone `*`.
    #[inline(always)]
    fn peek(&self) -> Option<char> {
        self.input().peek()
    }
148
    /// Forwards to the input's `peek_ahead()`.
    ///
    /// NOTE(review): presumably one character further than [`Self::peek`];
    /// not exercised in the visible part of this file, confirm.
    #[inline(always)]
    fn peek_ahead(&self) -> Option<char> {
        self.input().peek_ahead()
    }
153
    /// Returns the input's current byte position.
    #[inline(always)]
    fn cur_pos(&self) -> BytePos {
        self.input().cur_pos()
    }
158
    /// Returns the input's `last_pos()`; [`Self::span`] uses it as the end of
    /// the spans it builds.
    #[inline(always)]
    fn last_pos(&self) -> BytePos {
        self.input().last_pos()
    }
163
164    /// Shorthand for `let span = self.span(start); self.error_span(span)`
165    #[cold]
166    #[inline(never)]
167    fn error<T>(&self, start: BytePos, kind: SyntaxError) -> LexResult<T> {
168        let span = self.span(start);
169        self.error_span(span, kind)
170    }
171
172    #[cold]
173    #[inline(never)]
174    fn error_span<T>(&self, span: Span, kind: SyntaxError) -> LexResult<T> {
175        Err(crate::error::Error::new(span, kind))
176    }
177
178    #[cold]
179    #[inline(never)]
180    fn emit_error(&mut self, start: BytePos, kind: SyntaxError) {
181        let span = self.span(start);
182        self.emit_error_span(span, kind)
183    }
184
185    #[cold]
186    #[inline(never)]
187    fn emit_error_span(&mut self, span: Span, kind: SyntaxError) {
188        if self.ctx().contains(Context::IgnoreError) {
189            return;
190        }
191        tracing::warn!("Lexer error at {:?}", span);
192        let err = crate::error::Error::new(span, kind);
193        self.push_error(err);
194    }
195
196    #[cold]
197    #[inline(never)]
198    fn emit_strict_mode_error(&mut self, start: BytePos, kind: SyntaxError) {
199        let span = self.span(start);
200        if self.ctx().contains(Context::Strict) {
201            self.emit_error_span(span, kind);
202        } else {
203            let err = crate::error::Error::new(span, kind);
204            self.add_module_mode_error(err);
205        }
206    }
207
208    #[cold]
209    #[inline(never)]
210    fn emit_module_mode_error(&mut self, start: BytePos, kind: SyntaxError) {
211        let span = self.span(start);
212        let err = crate::error::Error::new(span, kind);
213        self.add_module_mode_error(err);
214    }
215
    /// Skips a line comment that begins at the current position and, when a
    /// comments buffer is present, records it.
    ///
    /// `start_skip` is the byte length of the comment marker (the caller in
    /// this file passes `2` for `//`). The marker is part of the comment's
    /// span but not of its text. The comment ends at the next `\n`, `\r`,
    /// LS (U+2028) or PS (U+2029), or at end of input; the terminator itself
    /// is not consumed.
    ///
    /// The comment is stored as pending (to be attached to what follows) when
    /// a line break was already seen or the state reports that a trailing
    /// line comment is not possible; otherwise it is stored as a trailing
    /// comment at `prev_hi()`.
    #[inline(never)]
    fn skip_line_comment(&mut self, start_skip: usize) {
        // Position of the comment's first byte, i.e. *before* the initial
        // `//` (or similar) marker is skipped. Used as the span start.
        let start = self.cur_pos();
        self.input_mut().bump_bytes(start_skip);
        // Position right after the marker; the comment text starts here.
        let slice_start = self.cur_pos();

        // foo // comment for foo
        // bar
        //
        // foo
        // // comment for bar
        // bar
        //
        let is_for_next =
            self.state().had_line_break() || !self.state().can_have_trailing_line_comment();

        // Fast search for line-terminator
        byte_search! {
            lexer: self,
            table: LINE_BREAK_TABLE,
            continue_if: (matched_byte, pos_offset) {
                if matched_byte != LS_OR_PS_FIRST {
                    // '\r' or '\n' - definitely a line terminator
                    false
                } else {
                    // 0xE2 - could be LS/PS or some other Unicode character
                    // Check the next 2 bytes to see if it's really LS/PS
                    let current_slice = self.input().as_str();
                    let byte_pos = pos_offset;
                    // Both trailing bytes (byte_pos + 1 and byte_pos + 2) must
                    // be inside the slice.
                    if byte_pos + 2 < current_slice.len() {
                        let bytes = current_slice.as_bytes();
                        let next2 = [bytes[byte_pos + 1], bytes[byte_pos + 2]];
                        if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
                            // It's a real line terminator
                            false
                        } else {
                            // Some other Unicode character starting with 0xE2
                            true
                        }
                    } else {
                        // Not enough bytes for full LS/PS sequence
                        true
                    }
                }
            },
            handle_eof: {
                // Reached EOF – entire remainder is comment
                let end = self.input().end_pos();

                if self.comments_buffer().is_some() {
                    // Safety: `slice_start` was obtained from the input and
                    // `end` is the input's end position.
                    let s = unsafe { self.input_slice(slice_start, end) };
                    let cmt = swc_common::comments::Comment {
                        kind: swc_common::comments::CommentKind::Line,
                        span: Span::new_with_checked(start, end),
                        text: self.atom(s),
                    };

                    if is_for_next {
                        self.comments_buffer_mut().unwrap().push_pending(cmt);
                    } else {
                        let pos = self.state().prev_hi();
                        self.comments_buffer_mut().unwrap().push_comment(BufferedComment {
                            kind: BufferedCommentKind::Trailing,
                            pos,
                            comment: cmt,
                        });
                    }
                }

                // NOTE(review): unlike the path below, no `reset_to` happens
                // here; presumably `byte_search!` already leaves the input at
                // EOF before running this handler - confirm in the macro.
                return;
            }
        };

        // Current position is at the line terminator
        let end = self.cur_pos();

        // Create and process slice only if comments need to be stored
        if self.comments_buffer().is_some() {
            let s = unsafe {
                // Safety: We know that the start and the end are valid
                self.input_slice(slice_start, end)
            };
            let cmt = swc_common::comments::Comment {
                kind: swc_common::comments::CommentKind::Line,
                span: Span::new_with_checked(start, end),
                text: self.atom(s),
            };

            if is_for_next {
                self.comments_buffer_mut().unwrap().push_pending(cmt);
            } else {
                let pos = self.state().prev_hi();
                self.comments_buffer_mut()
                    .unwrap()
                    .push_comment(BufferedComment {
                        kind: BufferedCommentKind::Trailing,
                        pos,
                        comment: cmt,
                    });
            }
        }

        unsafe {
            // Safety: We got end from self.input
            self.input_mut().reset_to(end);
        }
    }
324
325    /// Expects current char to be '/' and next char to be '*'.
326    fn skip_block_comment(&mut self) {
327        let start = self.cur_pos();
328
329        debug_assert_eq!(self.cur(), Some('/'));
330        debug_assert_eq!(self.peek(), Some('*'));
331
332        // Consume initial "/*"
333        self.input_mut().bump_bytes(2);
334
335        // jsdoc
336        let slice_start = self.cur_pos();
337
338        let had_line_break_before_last = self.had_line_break_before_last();
339        let mut should_mark_had_line_break = false;
340
341        loop {
342            let matched_byte = byte_search! {
343                lexer: self,
344                table: BLOCK_COMMENT_SCAN_TABLE,
345                continue_if: (matched_byte, pos_offset) {
346                    if matched_byte == LS_OR_PS_FIRST {
347                        // 0xE2 - could be LS/PS or some other Unicode character
348                        let current_slice = self.input().as_str();
349                        let byte_pos = pos_offset;
350                        if byte_pos + 2 < current_slice.len() {
351                            let bytes = current_slice.as_bytes();
352                            let next2 = [bytes[byte_pos + 1], bytes[byte_pos + 2]];
353                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
354                                // It's a real line terminator - don't continue
355                                false
356                            } else {
357                                // Some other Unicode character starting with 0xE2
358                                true
359                            }
360                        } else {
361                            // Not enough bytes for full LS/PS sequence
362                            true
363                        }
364                    } else {
365                        // '*', '\r', or '\n' - don't continue
366                        false
367                    }
368                },
369                handle_eof: {
370                    if should_mark_had_line_break {
371                        self.state_mut().mark_had_line_break();
372                    }
373                    let end_pos = self.input().end_pos();
374                    let span = Span::new_with_checked(end_pos, end_pos);
375                    self.emit_error_span(span, SyntaxError::UnterminatedBlockComment);
376                    return;
377                }
378            };
379
380            match matched_byte {
381                b'*' => {
382                    if self.peek() == Some('/') {
383                        // Consume "*/"
384                        self.input_mut().bump_bytes(2);
385
386                        if should_mark_had_line_break {
387                            self.state_mut().mark_had_line_break();
388                        }
389
390                        let end = self.cur_pos();
391
392                        // Decide trailing / leading
393                        let mut is_for_next =
394                            had_line_break_before_last || !self.state().can_have_trailing_comment();
395
396                        // If next char is ';' without newline, treat as trailing
397                        if !had_line_break_before_last && self.input().is_byte(b';') {
398                            is_for_next = false;
399                        }
400
401                        if self.comments_buffer().is_some() {
402                            let src = unsafe {
403                                // Safety: We got slice_start and end from self.input so those are
404                                // valid.
405                                self.input_mut().slice(slice_start, end)
406                            };
407                            let s = &src[..src.len() - 2];
408                            let cmt = Comment {
409                                kind: CommentKind::Block,
410                                span: Span::new_with_checked(start, end),
411                                text: self.atom(s),
412                            };
413
414                            if is_for_next {
415                                self.comments_buffer_mut().unwrap().push_pending(cmt);
416                            } else {
417                                let pos = self.state().prev_hi();
418                                self.comments_buffer_mut()
419                                    .unwrap()
420                                    .push_comment(BufferedComment {
421                                        kind: BufferedCommentKind::Trailing,
422                                        pos,
423                                        comment: cmt,
424                                    });
425                            }
426                        }
427
428                        return;
429                    } else {
430                        // Just a lone '*', consume it and continue.
431                        self.bump();
432                    }
433                }
434                b'\n' => {
435                    should_mark_had_line_break = true;
436                    self.bump();
437                }
438                b'\r' => {
439                    should_mark_had_line_break = true;
440                    self.bump();
441                    if self.peek() == Some('\n') {
442                        self.bump();
443                    }
444                }
445                _ => {
446                    // Unicode line terminator (LS/PS) or other character
447                    if let Some('\u{2028}' | '\u{2029}') = self.cur() {
448                        should_mark_had_line_break = true;
449                    }
450                    self.bump();
451                }
452            }
453        }
454    }
455
456    /// Skip comments or whitespaces.
457    ///
458    /// See https://tc39.github.io/ecma262/#sec-white-space
459    #[inline(never)]
460    fn skip_space<const LEX_COMMENTS: bool>(&mut self) {
461        loop {
462            let (offset, newline) = {
463                let mut skip = self::whitespace::SkipWhitespace {
464                    input: self.input().as_str(),
465                    newline: false,
466                    offset: 0,
467                };
468
469                skip.scan();
470
471                (skip.offset, skip.newline)
472            };
473
474            self.input_mut().bump_bytes(offset as usize);
475            if newline {
476                self.state_mut().mark_had_line_break();
477            }
478
479            if LEX_COMMENTS && self.input().is_byte(b'/') {
480                if let Some(c) = self.peek() {
481                    if c == '/' {
482                        self.skip_line_comment(2);
483                        continue;
484                    } else if c == '*' {
485                        self.skip_block_comment();
486                        continue;
487                    }
488                }
489            }
490
491            break;
492        }
493    }
494
495    /// Ensure that ident cannot directly follow numbers.
496    fn ensure_not_ident(&mut self) -> LexResult<()> {
497        match self.cur() {
498            Some(c) if c.is_ident_start() => {
499                let span = pos_span(self.cur_pos());
500                self.error_span(span, SyntaxError::IdentAfterNum)?
501            }
502            _ => Ok(()),
503        }
504    }
505
506    fn make_legacy_octal(&mut self, start: BytePos, val: f64) -> LexResult<f64> {
507        self.ensure_not_ident()?;
508        if self.syntax().typescript() && self.target() >= EsVersion::Es5 {
509            self.emit_error(start, SyntaxError::TS1085);
510        }
511        self.emit_strict_mode_error(start, SyntaxError::LegacyOctal);
512        Ok(val)
513    }
514
515    /// `op`- |total, radix, value| -> (total * radix + value, continue)
516    fn read_digits<F, Ret, const RADIX: u8>(
517        &mut self,
518        mut op: F,
519        allow_num_separator: bool,
520        has_underscore: &mut bool,
521    ) -> LexResult<Ret>
522    where
523        F: FnMut(Ret, u8, u32) -> LexResult<(Ret, bool)>,
524        Ret: Copy + Default,
525    {
526        debug_assert!(
527            RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16,
528            "radix for read_int should be one of 2, 8, 10, 16, but got {RADIX}"
529        );
530
531        if cfg!(feature = "debug") {
532            tracing::trace!("read_digits(radix = {}), cur = {:?}", RADIX, self.cur());
533        }
534
535        let start = self.cur_pos();
536        let mut total: Ret = Default::default();
537        let mut prev = None;
538
539        while let Some(c) = self.cur() {
540            if c == '_' {
541                *has_underscore = true;
542                if allow_num_separator {
543                    let is_allowed = |c: Option<char>| {
544                        let Some(c) = c else {
545                            return false;
546                        };
547                        c.is_digit(RADIX as _)
548                    };
549                    let is_forbidden = |c: Option<char>| {
550                        let Some(c) = c else {
551                            return false;
552                        };
553
554                        if RADIX == 16 {
555                            matches!(c, '.' | 'X' | '_' | 'x')
556                        } else {
557                            matches!(c, '.' | 'B' | 'E' | 'O' | '_' | 'b' | 'e' | 'o')
558                        }
559                    };
560
561                    let next = self.input().peek();
562
563                    if !is_allowed(next) || is_forbidden(prev) || is_forbidden(next) {
564                        self.emit_error(
565                            start,
566                            SyntaxError::NumericSeparatorIsAllowedOnlyBetweenTwoDigits,
567                        );
568                    }
569
570                    // Ignore this _ character
571                    unsafe {
572                        // Safety: cur() returns Some(c) where c is a valid char
573                        self.input_mut().bump();
574                    }
575
576                    continue;
577                }
578            }
579
580            // e.g. (val for a) = 10  where radix = 16
581            let val = if let Some(val) = c.to_digit(RADIX as _) {
582                val
583            } else {
584                return Ok(total);
585            };
586
587            self.bump();
588
589            let (t, cont) = op(total, RADIX, val)?;
590
591            total = t;
592
593            if !cont {
594                return Ok(total);
595            }
596
597            prev = Some(c);
598        }
599
600        Ok(total)
601    }
602
603    /// This can read long integers like
604    /// "13612536612375123612312312312312312312312".
605    ///
606    /// - Returned `bool` is `true` is there was `8` or `9`.
607    fn read_number_no_dot_as_str<const RADIX: u8>(&mut self) -> LexResult<LazyInteger> {
608        debug_assert!(
609            RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16,
610            "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {RADIX}"
611        );
612        let start = self.cur_pos();
613
614        let mut not_octal = false;
615        let mut read_any = false;
616        let mut has_underscore = false;
617
618        self.read_digits::<_, (), RADIX>(
619            |_, _, v| {
620                read_any = true;
621
622                if v == 8 || v == 9 {
623                    not_octal = true;
624                }
625
626                Ok(((), true))
627            },
628            true,
629            &mut has_underscore,
630        )?;
631
632        if !read_any {
633            self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?;
634        }
635
636        Ok(LazyInteger {
637            start,
638            end: self.cur_pos(),
639            not_octal,
640            has_underscore,
641        })
642    }
643
644    /// Reads an integer, octal integer, or floating-point number
645    fn read_number<const START_WITH_DOT: bool, const START_WITH_ZERO: bool>(
646        &mut self,
647    ) -> LexResult<Either<(f64, Atom), (Box<BigIntValue>, Atom)>> {
648        debug_assert!(!(START_WITH_DOT && START_WITH_ZERO));
649        debug_assert!(self.cur().is_some());
650
651        let start = self.cur_pos();
652        let mut has_underscore = false;
653
654        let lazy_integer = if START_WITH_DOT {
655            // first char is '.'
656            debug_assert!(
657                self.cur().is_some_and(|c| c == '.'),
658                "read_number<START_WITH_DOT = true> expects current char to be '.'"
659            );
660            LazyInteger {
661                start,
662                end: start,
663                not_octal: true,
664                has_underscore: false,
665            }
666        } else {
667            debug_assert!(!START_WITH_DOT);
668            debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == '0');
669
670            // Use read_number_no_dot to support long numbers.
671            let lazy_integer = self.read_number_no_dot_as_str::<10>()?;
672            let s = unsafe {
673                // Safety: We got both start and end position from `self.input`
674                self.input_slice(lazy_integer.start, lazy_integer.end)
675            };
676
677            // legacy octal number is not allowed in bigint.
678            if (!START_WITH_ZERO || lazy_integer.end - lazy_integer.start == BytePos(1))
679                && self.eat(b'n')
680            {
681                let end = self.cur_pos();
682                let raw = unsafe {
683                    // Safety: We got both start and end position from `self.input`
684                    self.input_slice(start, end)
685                };
686                let bigint_value = num_bigint::BigInt::parse_bytes(s.as_bytes(), 10).unwrap();
687                return Ok(Either::Right((Box::new(bigint_value), self.atom(raw))));
688            }
689
690            if START_WITH_ZERO {
691                // TODO: I guess it would be okay if I don't use -ffast-math
692                // (or something like that), but needs review.
693                if s.as_bytes().iter().all(|&c| c == b'0') {
694                    // If only one zero is used, it's decimal.
695                    // And if multiple zero is used, it's octal.
696                    //
697                    // e.g. `0` is decimal (so it can be part of float)
698                    //
699                    // e.g. `000` is octal
700                    if start.0 != self.last_pos().0 - 1 {
701                        let end = self.cur_pos();
702                        let raw = unsafe {
703                            // Safety: We got both start and end position from `self.input`
704                            self.input_slice(start, end)
705                        };
706                        let raw = self.atom(raw);
707                        return self
708                            .make_legacy_octal(start, 0f64)
709                            .map(|value| Either::Left((value, raw)));
710                    }
711                } else if lazy_integer.not_octal {
712                    // if it contains '8' or '9', it's decimal.
713                    self.emit_strict_mode_error(start, SyntaxError::LegacyDecimal);
714                } else {
715                    // It's Legacy octal, and we should reinterpret value.
716                    let s = remove_underscore(s, lazy_integer.has_underscore);
717                    let val = parse_integer::<8>(&s);
718                    let end = self.cur_pos();
719                    let raw = unsafe {
720                        // Safety: We got both start and end position from `self.input`
721                        self.input_slice(start, end)
722                    };
723                    let raw = self.atom(raw);
724                    return self
725                        .make_legacy_octal(start, val)
726                        .map(|value| Either::Left((value, raw)));
727                }
728            }
729
730            lazy_integer
731        };
732
733        has_underscore |= lazy_integer.has_underscore;
734        // At this point, number cannot be an octal literal.
735
736        let has_dot = self.cur() == Some('.');
737        //  `0.a`, `08.a`, `102.a` are invalid.
738        //
739        // `.1.a`, `.1e-4.a` are valid,
740        if has_dot {
741            self.bump();
742
743            // equal: if START_WITH_DOT { debug_assert!(xxxx) }
744            debug_assert!(!START_WITH_DOT || self.cur().is_some_and(|cur| cur.is_ascii_digit()));
745
746            // Read numbers after dot
747            self.read_digits::<_, (), 10>(|_, _, _| Ok(((), true)), true, &mut has_underscore)?;
748        }
749
750        let has_e = self.cur().is_some_and(|c| c == 'e' || c == 'E');
751        // Handle 'e' and 'E'
752        //
753        // .5e1 = 5
754        // 1e2 = 100
755        // 1e+2 = 100
756        // 1e-2 = 0.01
757        if has_e {
758            self.bump(); // `e`/`E`
759
760            let next = match self.cur() {
761                Some(next) => next,
762                None => {
763                    let pos = self.cur_pos();
764                    self.error(pos, SyntaxError::NumLitTerminatedWithExp)?
765                }
766            };
767
768            if next == '+' || next == '-' {
769                self.bump(); // remove '+', '-'
770            }
771
772            let lazy_integer = self.read_number_no_dot_as_str::<10>()?;
773            has_underscore |= lazy_integer.has_underscore;
774        }
775
776        let val = if has_dot || has_e {
777            let end = self.cur_pos();
778            let raw = unsafe {
779                // Safety: We got both start and end position from `self.input`
780                self.input_slice(start, end)
781            };
782
783            let raw = remove_underscore(raw, has_underscore);
784            raw.parse().expect("failed to parse float literal")
785        } else {
786            let s = unsafe { self.input_slice(lazy_integer.start, lazy_integer.end) };
787            let s = remove_underscore(s, has_underscore);
788            parse_integer::<10>(&s)
789        };
790
791        self.ensure_not_ident()?;
792
793        let end = self.cur_pos();
794        let raw_str = unsafe {
795            // Safety: We got both start and end position from `self.input`
796            self.input_slice(start, end)
797        };
798        Ok(Either::Left((val, raw_str.into())))
799    }
800
801    fn read_int_u32<const RADIX: u8>(&mut self, len: u8) -> LexResult<Option<u32>> {
802        let start = self.state().start();
803
804        let mut count = 0;
805        let v = self.read_digits::<_, Option<u32>, RADIX>(
806            |opt: Option<u32>, radix, val| {
807                count += 1;
808
809                let total = opt
810                    .unwrap_or_default()
811                    .checked_mul(radix as u32)
812                    .and_then(|v| v.checked_add(val))
813                    .ok_or_else(|| {
814                        let span = Span::new_with_checked(start, start);
815                        crate::error::Error::new(span, SyntaxError::InvalidUnicodeEscape)
816                    })?;
817
818                Ok((Some(total), count != len))
819            },
820            true,
821            &mut false,
822        )?;
823        if len != 0 && count != len {
824            Ok(None)
825        } else {
826            Ok(v)
827        }
828    }
829
830    /// Returns `Left(value)` or `Right(BigInt)`
831    fn read_radix_number<const RADIX: u8>(
832        &mut self,
833    ) -> LexResult<Either<(f64, Atom), (Box<BigIntValue>, Atom)>> {
834        debug_assert!(
835            RADIX == 2 || RADIX == 8 || RADIX == 16,
836            "radix should be one of 2, 8, 16, but got {RADIX}"
837        );
838        let start = self.cur_pos();
839
840        debug_assert_eq!(self.cur(), Some('0'));
841        self.bump();
842
843        debug_assert!(self
844            .cur()
845            .is_some_and(|c| matches!(c, 'b' | 'B' | 'o' | 'O' | 'x' | 'X')));
846        self.bump();
847
848        let lazy_integer = self.read_number_no_dot_as_str::<RADIX>()?;
849        let has_underscore = lazy_integer.has_underscore;
850
851        let s = unsafe {
852            // Safety: We got both start and end position from `self.input`
853            self.input_slice(lazy_integer.start, lazy_integer.end)
854        };
855        if self.eat(b'n') {
856            let end = self.cur_pos();
857            let raw = unsafe {
858                // Safety: We got both start and end position from `self.input`
859                self.input_slice(start, end)
860            };
861
862            let bigint_value = num_bigint::BigInt::parse_bytes(s.as_bytes(), RADIX as _).unwrap();
863            return Ok(Either::Right((Box::new(bigint_value), self.atom(raw))));
864        }
865        let s = remove_underscore(s, has_underscore);
866        let val = parse_integer::<RADIX>(&s);
867
868        self.ensure_not_ident()?;
869
870        let end = self.cur_pos();
871        let raw = unsafe {
872            // Safety: We got both start and end position from `self.input`
873            self.input_slice(start, end)
874        };
875
876        Ok(Either::Left((val, self.atom(raw))))
877    }
878
879    /// Consume pending comments.
880    ///
881    /// This is called when the input is exhausted.
882    #[cold]
883    #[inline(never)]
884    fn consume_pending_comments(&mut self) {
885        if let Some(comments) = self.comments() {
886            let last = self.state().prev_hi();
887            let start_pos = self.start_pos();
888            let comments_buffer = self.comments_buffer_mut().unwrap();
889
890            // if the file had no tokens and no shebang, then treat any
891            // comments in the leading comments buffer as leading.
892            // Otherwise treat them as trailing.
893            let kind = if last == start_pos {
894                BufferedCommentKind::Leading
895            } else {
896                BufferedCommentKind::Trailing
897            };
898            // move the pending to the leading or trailing
899            comments_buffer.pending_to_comment(kind, last);
900
901            // now fill the user's passed in comments
902            for comment in comments_buffer.take_comments() {
903                match comment.kind {
904                    BufferedCommentKind::Leading => {
905                        comments.add_leading(comment.pos, comment.comment);
906                    }
907                    BufferedCommentKind::Trailing => {
908                        comments.add_trailing(comment.pos, comment.comment);
909                    }
910                }
911            }
912        }
913    }
914
915    /// Read a JSX identifier (valid tag or attribute name).
916    ///
917    /// Optimized version since JSX identifiers can"t contain
918    /// escape characters and so can be read as single slice.
919    /// Also assumes that first character was already checked
920    /// by isIdentifierStart in readToken.
921    fn read_jsx_word(&mut self) -> LexResult<Self::Token> {
922        debug_assert!(self.syntax().jsx());
923        debug_assert!(self.input().cur().is_some_and(|c| c.is_ident_start()));
924
925        let mut first = true;
926        let slice = self.input_uncons_while(|c| {
927            if first {
928                first = false;
929                c.is_ident_start()
930            } else {
931                c.is_ident_part() || c == '-'
932            }
933        });
934
935        Ok(Self::Token::jsx_name(slice, self))
936    }
937
938    fn read_jsx_entity(&mut self) -> LexResult<(char, String)> {
939        debug_assert!(self.syntax().jsx());
940
941        fn from_code(s: &str, radix: u32) -> LexResult<char> {
942            // TODO(kdy1): unwrap -> Err
943            let c = char::from_u32(
944                u32::from_str_radix(s, radix).expect("failed to parse string as number"),
945            )
946            .expect("failed to parse number as char");
947
948            Ok(c)
949        }
950
951        fn is_hex(s: &str) -> bool {
952            s.chars().all(|c| c.is_ascii_hexdigit())
953        }
954
955        fn is_dec(s: &str) -> bool {
956            s.chars().all(|c| c.is_ascii_digit())
957        }
958
959        let mut s = SmartString::<LazyCompact>::default();
960
961        debug_assert!(self.input().cur().is_some_and(|c| c == '&'));
962        self.bump();
963
964        let start_pos = self.input().cur_pos();
965
966        for _ in 0..10 {
967            let c = match self.input().cur() {
968                Some(c) => c,
969                None => break,
970            };
971            self.bump();
972
973            if c == ';' {
974                if let Some(stripped) = s.strip_prefix('#') {
975                    if stripped.starts_with('x') {
976                        if is_hex(&s[2..]) {
977                            let value = from_code(&s[2..], 16)?;
978
979                            return Ok((value, format!("&{s};")));
980                        }
981                    } else if is_dec(stripped) {
982                        let value = from_code(stripped, 10)?;
983
984                        return Ok((value, format!("&{s};")));
985                    }
986                } else if let Some(entity) = xhtml(&s) {
987                    return Ok((entity, format!("&{s};")));
988                }
989
990                break;
991            }
992
993            s.push(c)
994        }
995
996        unsafe {
997            // Safety: start_pos is a valid position because we got it from self.input
998            self.input_mut().reset_to(start_pos);
999        }
1000
1001        Ok(('&', "&".to_string()))
1002    }
1003
1004    fn read_jsx_new_line(&mut self, normalize_crlf: bool) -> LexResult<Either<&'static str, char>> {
1005        debug_assert!(self.syntax().jsx());
1006        let ch = self.input().cur().unwrap();
1007        self.bump();
1008
1009        let out = if ch == '\r' && self.input().cur() == Some('\n') {
1010            self.bump(); // `\n`
1011            Either::Left(if normalize_crlf { "\n" } else { "\r\n" })
1012        } else {
1013            Either::Right(ch)
1014        };
1015        Ok(out)
1016    }
1017
1018    fn read_jsx_str(&mut self, quote: char) -> LexResult<Self::Token> {
1019        debug_assert!(self.syntax().jsx());
1020        let start = self.input().cur_pos();
1021        unsafe {
1022            // Safety: cur() was Some(quote)
1023            self.input_mut().bump(); // `quote`
1024        }
1025        let mut out = String::new();
1026        let mut chunk_start = self.input().cur_pos();
1027        loop {
1028            let ch = match self.input().cur() {
1029                Some(c) => c,
1030                None => {
1031                    self.emit_error(start, SyntaxError::UnterminatedStrLit);
1032                    break;
1033                }
1034            };
1035            let cur_pos = self.input().cur_pos();
1036            if ch == '\\' {
1037                let value = unsafe {
1038                    // Safety: We already checked for the range
1039                    self.input_slice(chunk_start, cur_pos)
1040                };
1041
1042                out.push_str(value);
1043                out.push('\\');
1044
1045                self.bump();
1046
1047                chunk_start = self.input().cur_pos();
1048
1049                continue;
1050            }
1051
1052            if ch == quote {
1053                break;
1054            }
1055
1056            if ch == '&' {
1057                let value = unsafe {
1058                    // Safety: We already checked for the range
1059                    self.input_slice(chunk_start, cur_pos)
1060                };
1061
1062                out.push_str(value);
1063
1064                let jsx_entity = self.read_jsx_entity()?;
1065
1066                out.push(jsx_entity.0);
1067
1068                chunk_start = self.input().cur_pos();
1069            } else if ch.is_line_terminator() {
1070                let value = unsafe {
1071                    // Safety: We already checked for the range
1072                    self.input_slice(chunk_start, cur_pos)
1073                };
1074
1075                out.push_str(value);
1076
1077                match self.read_jsx_new_line(false)? {
1078                    Either::Left(s) => {
1079                        out.push_str(s);
1080                    }
1081                    Either::Right(c) => {
1082                        out.push(c);
1083                    }
1084                }
1085
1086                chunk_start = cur_pos + BytePos(ch.len_utf8() as _);
1087            } else {
1088                unsafe {
1089                    // Safety: cur() was Some(ch)
1090                    self.input_mut().bump();
1091                }
1092            }
1093        }
1094        let cur_pos = self.input().cur_pos();
1095        let s = unsafe {
1096            // Safety: We already checked for the range
1097            self.input_slice(chunk_start, cur_pos)
1098        };
1099        let value = if out.is_empty() {
1100            // Fast path: We don't need to allocate
1101            self.atom(s)
1102        } else {
1103            out.push_str(s);
1104            self.atom(out)
1105        };
1106
1107        // it might be at the end of the file when
1108        // the string literal is unterminated
1109        if self.input().peek_ahead().is_some() {
1110            self.bump();
1111        }
1112
1113        let end = self.input().cur_pos();
1114        let raw = unsafe {
1115            // Safety: Both of `start` and `end` are generated from `cur_pos()`
1116            self.input_slice(start, end)
1117        };
1118        let raw = self.atom(raw);
1119        Ok(Self::Token::str(value, raw, self))
1120    }
1121
1122    fn read_unicode_escape(&mut self) -> LexResult<Vec<Char>> {
1123        debug_assert_eq!(self.cur(), Some('u'));
1124
1125        let mut chars = Vec::with_capacity(4);
1126        let mut is_curly = false;
1127
1128        self.bump(); // 'u'
1129
1130        if self.eat(b'{') {
1131            is_curly = true;
1132        }
1133
1134        let state = self.input().cur_pos();
1135        let c = match self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) {
1136            Ok(Some(val)) => {
1137                if 0x0010_ffff >= val {
1138                    char::from_u32(val)
1139                } else {
1140                    let start = self.cur_pos();
1141
1142                    self.error(
1143                        start,
1144                        SyntaxError::BadCharacterEscapeSequence {
1145                            expected: if is_curly {
1146                                "1-6 hex characters in the range 0 to 10FFFF."
1147                            } else {
1148                                "4 hex characters"
1149                            },
1150                        },
1151                    )?
1152                }
1153            }
1154            _ => {
1155                let start = self.cur_pos();
1156
1157                self.error(
1158                    start,
1159                    SyntaxError::BadCharacterEscapeSequence {
1160                        expected: if is_curly {
1161                            "1-6 hex characters"
1162                        } else {
1163                            "4 hex characters"
1164                        },
1165                    },
1166                )?
1167            }
1168        };
1169
1170        match c {
1171            Some(c) => {
1172                chars.push(c.into());
1173            }
1174            _ => {
1175                unsafe {
1176                    // Safety: state is valid position because we got it from cur_pos()
1177                    self.input_mut().reset_to(state);
1178                }
1179
1180                chars.push(Char::from('\\'));
1181                chars.push(Char::from('u'));
1182
1183                if is_curly {
1184                    chars.push(Char::from('{'));
1185
1186                    for _ in 0..6 {
1187                        if let Some(c) = self.input().cur() {
1188                            if c == '}' {
1189                                break;
1190                            }
1191
1192                            self.bump();
1193
1194                            chars.push(Char::from(c));
1195                        } else {
1196                            break;
1197                        }
1198                    }
1199
1200                    chars.push(Char::from('}'));
1201                } else {
1202                    for _ in 0..4 {
1203                        if let Some(c) = self.input().cur() {
1204                            self.bump();
1205
1206                            chars.push(Char::from(c));
1207                        }
1208                    }
1209                }
1210            }
1211        }
1212
1213        if is_curly && !self.eat(b'}') {
1214            self.error(state, SyntaxError::InvalidUnicodeEscape)?
1215        }
1216
1217        Ok(chars)
1218    }
1219
1220    #[cold]
1221    fn read_shebang(&mut self) -> LexResult<Option<Atom>> {
1222        if self.input().cur() != Some('#') || self.input().peek() != Some('!') {
1223            return Ok(None);
1224        }
1225        self.bump(); // `#`
1226        self.bump(); // `!`
1227        let s = self.input_uncons_while(|c| !c.is_line_terminator());
1228        Ok(Some(self.atom(s)))
1229    }
1230
1231    fn read_tmpl_token(&mut self, start_of_tpl: BytePos) -> LexResult<Self::Token> {
1232        let start = self.cur_pos();
1233
1234        let mut cooked = Ok(String::new());
1235        let mut cooked_slice_start = start;
1236        let raw_slice_start = start;
1237
1238        macro_rules! consume_cooked {
1239            () => {{
1240                if let Ok(cooked) = &mut cooked {
1241                    let last_pos = self.cur_pos();
1242                    cooked.push_str(unsafe {
1243                        // Safety: Both of start and last_pos are valid position because we got them
1244                        // from `self.input`
1245                        self.input_slice(cooked_slice_start, last_pos)
1246                    });
1247                }
1248            }};
1249        }
1250
1251        // Handle edge case for immediate template end
1252        if start == self.cur_pos() && self.state().last_was_tpl_element() {
1253            if let Some(c) = self.cur() {
1254                if c == '$' && self.peek() == Some('{') {
1255                    self.bump(); // '$'
1256                    self.bump(); // '{'
1257                    return Ok(Self::Token::DOLLAR_LBRACE);
1258                } else if c == '`' {
1259                    self.bump(); // '`'
1260                    return Ok(Self::Token::BACKQUOTE);
1261                }
1262            }
1263        }
1264
1265        // Fast path: use byte_search to scan for template literal terminators
1266        loop {
1267            let matched_byte = byte_search! {
1268                lexer: self,
1269                table: TEMPLATE_LITERAL_TABLE,
1270                handle_eof: {
1271                    // EOF reached - unterminated template
1272                    self.error(start_of_tpl, SyntaxError::UnterminatedTpl)?
1273                }
1274            };
1275
1276            match matched_byte {
1277                b'$' => {
1278                    // Check if this is ${
1279                    if self.peek() == Some('{') {
1280                        // Found template substitution
1281                        let cooked = if cooked_slice_start == raw_slice_start {
1282                            let last_pos = self.cur_pos();
1283                            let s = unsafe {
1284                                // Safety: Both of start and last_pos are valid position because we
1285                                // got them from `self.input`
1286                                self.input_slice(cooked_slice_start, last_pos)
1287                            };
1288                            Ok(self.atom(s))
1289                        } else {
1290                            consume_cooked!();
1291                            cooked.map(|s| self.atom(s))
1292                        };
1293
1294                        let end = self.input().cur_pos();
1295                        let raw = unsafe {
1296                            // Safety: Both of start and last_pos are valid position because we got
1297                            // them from `self.input`
1298                            self.input_slice(raw_slice_start, end)
1299                        };
1300                        let raw = self.atom(raw);
1301                        return Ok(Self::Token::template(cooked, raw, self));
1302                    } else {
1303                        // Just a regular $ character, continue scanning
1304                        self.bump();
1305                        continue;
1306                    }
1307                }
1308                b'`' => {
1309                    // Found template end
1310                    let cooked = if cooked_slice_start == raw_slice_start {
1311                        let last_pos = self.cur_pos();
1312                        let s = unsafe { self.input_slice(cooked_slice_start, last_pos) };
1313                        Ok(self.atom(s))
1314                    } else {
1315                        consume_cooked!();
1316                        cooked.map(|s| self.atom(s))
1317                    };
1318
1319                    let end = self.input().cur_pos();
1320                    let raw = unsafe { self.input_slice(raw_slice_start, end) };
1321                    let raw = self.atom(raw);
1322                    return Ok(Self::Token::template(cooked, raw, self));
1323                }
1324                b'\r' => {
1325                    // Handle carriage return line terminator
1326                    self.state_mut().mark_had_line_break();
1327                    consume_cooked!();
1328
1329                    // Handle carriage return - consume \r and optionally \n, normalize to \n
1330                    self.bump(); // '\r'
1331                    if self.peek() == Some('\n') {
1332                        self.bump(); // '\n'
1333                    }
1334
1335                    if let Ok(ref mut cooked) = cooked {
1336                        cooked.push('\n');
1337                    }
1338                    cooked_slice_start = self.cur_pos();
1339                }
1340                b'\\' => {
1341                    // Handle escape sequence - fall back to slow path for this part
1342                    consume_cooked!();
1343
1344                    match self.read_escaped_char(true) {
1345                        Ok(Some(chars)) => {
1346                            if let Ok(ref mut cooked) = cooked {
1347                                for c in chars {
1348                                    cooked.extend(c);
1349                                }
1350                            }
1351                        }
1352                        Ok(None) => {}
1353                        Err(error) => {
1354                            cooked = Err(error);
1355                        }
1356                    }
1357
1358                    cooked_slice_start = self.cur_pos();
1359                }
1360                _ => unreachable!(),
1361            }
1362        }
1363    }
1364
    /// Read an escaped character for string literal.
    ///
    /// In template literal, we should preserve raw string.
    ///
    /// The cursor must sit on the backslash. `in_template` turns a legacy octal
    /// escape into a hard `SyntaxError::LegacyOctal` error; otherwise it is only
    /// reported through `emit_strict_mode_error`.
    ///
    /// # Returns
    ///
    /// * `Ok(None)` for a line continuation (backslash followed by `\r`, `\r\n`,
    ///   `\n`, U+2028 or U+2029), which contributes no character.
    /// * `Ok(Some(chars))` with the decoded character(s) otherwise. A character
    ///   with no special meaning after the backslash stands for itself.
    ///
    /// # Errors
    ///
    /// * `SyntaxError::InvalidStrEscape` when the input ends after the backslash.
    /// * `SyntaxError::BadCharacterEscapeSequence` for a malformed `\x` escape.
    /// * The error kind reported by `read_unicode_escape` for a malformed `\u`
    ///   escape, re-reported at the position of the backslash.
    fn read_escaped_char(&mut self, in_template: bool) -> LexResult<Option<Vec<Char>>> {
        debug_assert_eq!(self.cur(), Some('\\'));

        let start = self.cur_pos();

        self.bump(); // '\'

        let c = match self.cur() {
            Some(c) => c,
            None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?,
        };

        // Arms that fully handle their escape return directly; the others yield
        // the decoded character and leave consuming the escape letter to the
        // common code after the match.
        let c = match c {
            '\\' => '\\',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'b' => '\u{0008}',
            'v' => '\u{000b}',
            'f' => '\u{000c}',
            '\r' => {
                self.bump(); // remove '\r'

                // `\r\n` counts as a single line terminator.
                self.eat(b'\n');

                return Ok(None);
            }
            '\n' | '\u{2028}' | '\u{2029}' => {
                self.bump();

                return Ok(None);
            }

            // read hexadecimal escape sequences
            'x' => {
                self.bump(); // 'x'

                match self.read_int_u32::<16>(2)? {
                    Some(val) => return Ok(Some(vec![Char::from(val)])),
                    None => self.error(
                        start,
                        SyntaxError::BadCharacterEscapeSequence {
                            expected: "2 hex characters",
                        },
                    )?,
                }
            }

            // read unicode escape sequences
            'u' => match self.read_unicode_escape() {
                Ok(chars) => return Ok(Some(chars)),
                Err(err) => self.error(start, err.into_kind())?,
            },

            // octal escape sequences
            '0'..='7' => {
                self.bump();

                let first_c = if c == '0' {
                    match self.cur() {
                        Some(next) if next.is_digit(8) => c,
                        // \0 is not an octal literal nor decimal literal.
                        _ => return Ok(Some(vec!['\u{0000}'.into()])),
                    }
                } else {
                    c
                };

                // TODO: Show template instead of strict mode
                if in_template {
                    self.error(start, SyntaxError::LegacyOctal)?
                }

                self.emit_strict_mode_error(start, SyntaxError::LegacyOctal);

                let mut value: u8 = first_c.to_digit(8).unwrap() as u8;

                // Folds one more octal digit into `value` and consumes it. If the
                // current character is not an octal digit, or (with `$check`) the
                // result would not fit in a `u8`, returns the value accumulated so
                // far and leaves the character unconsumed.
                macro_rules! one {
                    ($check:expr) => {{
                        let cur = self.cur();

                        match cur.and_then(|c| c.to_digit(8)) {
                            Some(v) => {
                                value = if $check {
                                    let new_val = value
                                        .checked_mul(8)
                                        .and_then(|value| value.checked_add(v as u8));
                                    match new_val {
                                        Some(val) => val,
                                        None => return Ok(Some(vec![Char::from(value as char)])),
                                    }
                                } else {
                                    value * 8 + v as u8
                                };

                                self.bump();
                            }
                            _ => return Ok(Some(vec![Char::from(value as u32)])),
                        }
                    }};
                }

                // Two digits (at most 0o77) always fit in a `u8`; only the third
                // digit needs the overflow check.
                one!(false);
                one!(true);

                return Ok(Some(vec![Char::from(value as char)]));
            }
            _ => c,
        };

        unsafe {
            // Safety: cur() is Some(c) if this method is called.
            self.input_mut().bump();
        }

        Ok(Some(vec![c.into()]))
    }
1485
1486    /// Expects current char to be '/'
1487    fn read_regexp(&mut self, start: BytePos) -> LexResult<Self::Token> {
1488        unsafe {
1489            // Safety: start is valid position, and cur() is Some('/')
1490            self.input_mut().reset_to(start);
1491        }
1492
1493        debug_assert_eq!(self.cur(), Some('/'));
1494
1495        let start = self.cur_pos();
1496
1497        self.bump(); // bump '/'
1498
1499        let slice_start = self.cur_pos();
1500
1501        let (mut escaped, mut in_class) = (false, false);
1502
1503        while let Some(c) = self.cur() {
1504            // This is ported from babel.
1505            // Seems like regexp literal cannot contain linebreak.
1506            if c.is_line_terminator() {
1507                let span = self.span(start);
1508
1509                return Err(crate::error::Error::new(
1510                    span,
1511                    SyntaxError::UnterminatedRegExp,
1512                ));
1513            }
1514
1515            if escaped {
1516                escaped = false;
1517            } else {
1518                match c {
1519                    '[' => in_class = true,
1520                    ']' if in_class => in_class = false,
1521                    // Terminates content part of regex literal
1522                    '/' if !in_class => break,
1523                    _ => {}
1524                }
1525
1526                escaped = c == '\\';
1527            }
1528
1529            self.bump();
1530        }
1531
1532        let content = {
1533            let end = self.cur_pos();
1534            let s = unsafe { self.input_slice(slice_start, end) };
1535            self.atom(s)
1536        };
1537
1538        // input is terminated without following `/`
1539        if !self.is(b'/') {
1540            let span = self.span(start);
1541
1542            return Err(crate::error::Error::new(
1543                span,
1544                SyntaxError::UnterminatedRegExp,
1545            ));
1546        }
1547
1548        self.bump(); // '/'
1549
1550        // Spec says "It is a Syntax Error if IdentifierPart contains a Unicode escape
1551        // sequence." TODO: check for escape
1552
1553        // Need to use `read_word` because '\uXXXX' sequences are allowed
1554        // here (don't ask).
1555        // let flags_start = self.cur_pos();
1556        let flags = {
1557            match self.cur() {
1558                Some(c) if c.is_ident_start() => self
1559                    .read_word_as_str_with()
1560                    .map(|(s, _)| Some(self.atom(s))),
1561                _ => Ok(None),
1562            }
1563        }?
1564        .unwrap_or_default();
1565
1566        Ok(Self::Token::regexp(content, flags, self))
1567    }
1568
1569    /// This method is optimized for texts without escape sequences.
1570    fn read_word_as_str_with(&mut self) -> LexResult<(Cow<'a, str>, bool)> {
1571        debug_assert!(self.cur().is_some());
1572        let slice_start = self.cur_pos();
1573
1574        // Fast path: try to scan ASCII identifier using byte_search
1575        if let Some(c) = self.input().cur_as_ascii() {
1576            if Ident::is_valid_ascii_start(c) {
1577                // Advance past first byte
1578                self.bump();
1579
1580                // Use byte_search to quickly scan to end of ASCII identifier
1581                let next_byte = byte_search! {
1582                    lexer: self,
1583                    table: NOT_ASCII_ID_CONTINUE_TABLE,
1584                    handle_eof: {
1585                        // Reached EOF, entire remainder is identifier
1586                        let end = self.cur_pos();
1587                        let s = unsafe {
1588                            // Safety: slice_start and end are valid position because we got them from
1589                            // `self.input`
1590                            self.input_slice(slice_start, end)
1591                        };
1592
1593                        return Ok((Cow::Borrowed(s), false));
1594                    },
1595                };
1596
1597                // Check if we hit end of identifier or need to fall back to slow path
1598                if !next_byte.is_ascii() {
1599                    // Hit Unicode character, fall back to slow path from current position
1600                    return self.read_word_as_str_with_slow_path(slice_start);
1601                } else if next_byte == b'\\' {
1602                    // Hit escape sequence, fall back to slow path from current position
1603                    return self.read_word_as_str_with_slow_path(slice_start);
1604                } else {
1605                    // Hit end of identifier (non-continue ASCII char)
1606                    let end = self.cur_pos();
1607                    let s = unsafe {
1608                        // Safety: slice_start and end are valid position because we got them from
1609                        // `self.input`
1610                        self.input_slice(slice_start, end)
1611                    };
1612
1613                    return Ok((Cow::Borrowed(s), false));
1614                }
1615            }
1616        }
1617
1618        // Fall back to slow path for non-ASCII start or complex cases
1619        self.read_word_as_str_with_slow_path(slice_start)
1620    }
1621
1622    /// Slow path for identifier parsing that handles Unicode and escapes
1623    #[cold]
1624    fn read_word_as_str_with_slow_path(
1625        &mut self,
1626        mut slice_start: BytePos,
1627    ) -> LexResult<(Cow<'a, str>, bool)> {
1628        let mut first = true;
1629        let mut has_escape = false;
1630
1631        let mut buf = String::with_capacity(16);
1632        loop {
1633            if let Some(c) = self.input().cur_as_ascii() {
1634                if Ident::is_valid_ascii_continue(c) {
1635                    self.bump();
1636                    continue;
1637                } else if first && Ident::is_valid_ascii_start(c) {
1638                    self.bump();
1639                    first = false;
1640                    continue;
1641                }
1642
1643                // unicode escape
1644                if c == b'\\' {
1645                    first = false;
1646                    has_escape = true;
1647                    let start = self.cur_pos();
1648                    self.bump();
1649
1650                    if !self.is(b'u') {
1651                        self.error_span(pos_span(start), SyntaxError::ExpectedUnicodeEscape)?
1652                    }
1653
1654                    {
1655                        let end = self.input().cur_pos();
1656                        let s = unsafe {
1657                            // Safety: start and end are valid position because we got them from
1658                            // `self.input`
1659                            self.input_slice(slice_start, start)
1660                        };
1661                        buf.push_str(s);
1662                        unsafe {
1663                            // Safety: We got end from `self.input`
1664                            self.input_mut().reset_to(end);
1665                        }
1666                    }
1667
1668                    let chars = self.read_unicode_escape()?;
1669
1670                    if let Some(c) = chars.first() {
1671                        let valid = if first {
1672                            c.is_ident_start()
1673                        } else {
1674                            c.is_ident_part()
1675                        };
1676
1677                        if !valid {
1678                            self.emit_error(start, SyntaxError::InvalidIdentChar);
1679                        }
1680                    }
1681
1682                    for c in chars {
1683                        buf.extend(c);
1684                    }
1685
1686                    slice_start = self.cur_pos();
1687                    continue;
1688                }
1689
1690                // ASCII but not a valid identifier
1691                break;
1692            } else if let Some(c) = self.input().cur() {
1693                if Ident::is_valid_non_ascii_continue(c) {
1694                    self.bump();
1695                    continue;
1696                } else if first && Ident::is_valid_non_ascii_start(c) {
1697                    self.bump();
1698                    first = false;
1699                    continue;
1700                }
1701            }
1702
1703            break;
1704        }
1705
1706        let end = self.cur_pos();
1707        let s = unsafe {
1708            // Safety: slice_start and end are valid position because we got them from
1709            // `self.input`
1710            self.input_slice(slice_start, end)
1711        };
1712        let value = if !has_escape {
1713            // Fast path: raw slice is enough if there's no escape.
1714            Cow::Borrowed(s)
1715        } else {
1716            buf.push_str(s);
1717            Cow::Owned(buf)
1718        };
1719
1720        Ok((value, has_escape))
1721    }
1722
1723    /// `#`
1724    fn read_token_number_sign(&mut self) -> LexResult<Self::Token> {
1725        debug_assert!(self.cur().is_some_and(|c| c == '#'));
1726
1727        self.bump(); // '#'
1728
1729        // `#` can also be a part of shebangs, however they should have been
1730        // handled by `read_shebang()`
1731        debug_assert!(
1732            !self.input().is_at_start() || self.cur() != Some('!'),
1733            "#! should have already been handled by read_shebang()"
1734        );
1735        Ok(Self::Token::HASH)
1736    }
1737
1738    /// Read a token given `.`.
1739    ///
1740    /// This is extracted as a method to reduce size of `read_token`.
1741    #[inline(never)]
1742    fn read_token_dot(&mut self) -> LexResult<Self::Token> {
1743        debug_assert!(self.cur().is_some_and(|c| c == '.'));
1744        // Check for eof
1745        let next = match self.input().peek() {
1746            Some(next) => next,
1747            None => {
1748                self.bump(); // '.'
1749                return Ok(Self::Token::DOT);
1750            }
1751        };
1752        if next.is_ascii_digit() {
1753            return self.read_number::<true, false>().map(|v| match v {
1754                Left((value, raw)) => Self::Token::num(value, raw, self),
1755                Right(_) => unreachable!("read_number should not return bigint for leading dot"),
1756            });
1757        }
1758
1759        self.bump(); // 1st `.`
1760
1761        if next == '.' && self.input().peek() == Some('.') {
1762            self.bump(); // 2nd `.`
1763            self.bump(); // 3rd `.`
1764
1765            return Ok(Self::Token::DOTDOTDOT);
1766        }
1767
1768        Ok(Self::Token::DOT)
1769    }
1770
1771    /// Read a token given `?`.
1772    ///
1773    /// This is extracted as a method to reduce size of `read_token`.
1774    #[inline(never)]
1775    fn read_token_question_mark(&mut self) -> LexResult<Self::Token> {
1776        debug_assert!(self.cur().is_some_and(|c| c == '?'));
1777        self.bump();
1778        if self.input_mut().eat_byte(b'?') {
1779            if self.input_mut().eat_byte(b'=') {
1780                Ok(Self::Token::NULLISH_ASSIGN)
1781            } else {
1782                Ok(Self::Token::NULLISH_COALESCING)
1783            }
1784        } else {
1785            Ok(Self::Token::QUESTION)
1786        }
1787    }
1788
1789    /// Read a token given `:`.
1790    ///
1791    /// This is extracted as a method to reduce size of `read_token`.
1792    #[inline(never)]
1793    fn read_token_colon(&mut self) -> LexResult<Self::Token> {
1794        debug_assert!(self.cur().is_some_and(|c| c == ':'));
1795        self.bump(); // ':'
1796        Ok(Self::Token::COLON)
1797    }
1798
1799    /// Read a token given `0`.
1800    ///
1801    /// This is extracted as a method to reduce size of `read_token`.
1802    #[inline(never)]
1803    fn read_token_zero(&mut self) -> LexResult<Self::Token> {
1804        debug_assert_eq!(self.cur(), Some('0'));
1805        let next = self.input().peek();
1806
1807        let bigint = match next {
1808            Some('x') | Some('X') => self.read_radix_number::<16>(),
1809            Some('o') | Some('O') => self.read_radix_number::<8>(),
1810            Some('b') | Some('B') => self.read_radix_number::<2>(),
1811            _ => {
1812                return self.read_number::<false, true>().map(|v| match v {
1813                    Left((value, raw)) => Self::Token::num(value, raw, self),
1814                    Right((value, raw)) => Self::Token::bigint(value, raw, self),
1815                });
1816            }
1817        };
1818
1819        bigint.map(|v| match v {
1820            Left((value, raw)) => Self::Token::num(value, raw, self),
1821            Right((value, raw)) => Self::Token::bigint(value, raw, self),
1822        })
1823    }
1824
1825    /// Read a token given `|` or `&`.
1826    ///
1827    /// This is extracted as a method to reduce size of `read_token`.
1828    #[inline(never)]
1829    fn read_token_logical<const C: u8>(&mut self) -> LexResult<Self::Token> {
1830        debug_assert!(C == b'|' || C == b'&');
1831        let is_bit_and = C == b'&';
1832        let had_line_break_before_last = self.had_line_break_before_last();
1833        let start = self.cur_pos();
1834
1835        unsafe {
1836            // Safety: cur() is Some(c as char)
1837            self.input_mut().bump();
1838        }
1839        let token = if is_bit_and {
1840            Self::Token::BIT_AND
1841        } else {
1842            Self::Token::BIT_OR
1843        };
1844
1845        // '|=', '&='
1846        if self.input_mut().eat_byte(b'=') {
1847            return Ok(if is_bit_and {
1848                Self::Token::BIT_AND_EQ
1849            } else {
1850                debug_assert!(token.is_bit_or());
1851                Self::Token::BIT_OR_EQ
1852            });
1853        }
1854
1855        // '||', '&&'
1856        if self.input().cur() == Some(C as char) {
1857            unsafe {
1858                // Safety: cur() is Some(c)
1859                self.input_mut().bump();
1860            }
1861
1862            if self.input().cur() == Some('=') {
1863                unsafe {
1864                    // Safety: cur() is Some('=')
1865                    self.input_mut().bump();
1866                }
1867
1868                return Ok(if is_bit_and {
1869                    Self::Token::LOGICAL_AND_EQ
1870                } else {
1871                    debug_assert!(token.is_bit_or());
1872                    Self::Token::LOGICAL_OR_EQ
1873                });
1874            }
1875
1876            // |||||||
1877            //   ^
1878            if had_line_break_before_last && !is_bit_and && self.is_str("||||| ") {
1879                let span = fixed_len_span(start, 7);
1880                self.emit_error_span(span, SyntaxError::TS1185);
1881                self.skip_line_comment(5);
1882                self.skip_space::<true>();
1883                return self.error_span(span, SyntaxError::TS1185);
1884            }
1885
1886            return Ok(if is_bit_and {
1887                Self::Token::LOGICAL_AND
1888            } else {
1889                debug_assert!(token.is_bit_or());
1890                Self::Token::LOGICAL_OR
1891            });
1892        }
1893
1894        Ok(token)
1895    }
1896
1897    /// Read a token given `*` or `%`.
1898    ///
1899    /// This is extracted as a method to reduce size of `read_token`.
1900    #[inline(never)]
1901    fn read_token_mul_mod(&mut self, is_mul: bool) -> LexResult<Self::Token> {
1902        debug_assert!(self.cur().is_some_and(|c| c == '*' || c == '%'));
1903        self.bump();
1904        let token = if is_mul {
1905            if self.input_mut().eat_byte(b'*') {
1906                // `**`
1907                Self::Token::EXP
1908            } else {
1909                Self::Token::MUL
1910            }
1911        } else {
1912            Self::Token::MOD
1913        };
1914
1915        Ok(if self.input_mut().eat_byte(b'=') {
1916            if token.is_star() {
1917                Self::Token::MUL_EQ
1918            } else if token.is_mod() {
1919                Self::Token::MOD_EQ
1920            } else {
1921                debug_assert!(token.is_exp());
1922                Self::Token::EXP_EQ
1923            }
1924        } else {
1925            token
1926        })
1927    }
1928
1929    #[inline(never)]
1930    fn read_slash(&mut self) -> LexResult<Self::Token> {
1931        debug_assert_eq!(self.cur(), Some('/'));
1932        self.bump(); // '/'
1933        Ok(if self.eat(b'=') {
1934            Self::Token::DIV_EQ
1935        } else {
1936            Self::Token::DIV
1937        })
1938    }
1939
1940    /// This can be used if there's no keyword starting with the first
1941    /// character.
1942    fn read_ident_unknown(&mut self) -> LexResult<Self::Token> {
1943        debug_assert!(self.cur().is_some());
1944
1945        let (s, has_escape) = self.read_word_as_str_with()?;
1946        let atom = self.atom(s);
1947        let word = Self::Token::unknown_ident(atom, self);
1948
1949        if has_escape {
1950            self.update_token_flags(|flags| *flags |= TokenFlags::UNICODE);
1951        }
1952
1953        Ok(word)
1954    }
1955
1956    /// See https://tc39.github.io/ecma262/#sec-literals-string-literals
1957    // TODO: merge `read_str_lit` and `read_jsx_str`
1958    fn read_str_lit(&mut self) -> LexResult<Self::Token> {
1959        debug_assert!(self.cur() == Some('\'') || self.cur() == Some('"'));
1960        let start = self.cur_pos();
1961        let quote = self.cur().unwrap() as u8;
1962
1963        self.bump(); // '"' or '\''
1964
1965        let mut slice_start = self.input().cur_pos();
1966
1967        let mut buf: Option<String> = None;
1968
1969        loop {
1970            let table = if quote == b'"' {
1971                &DOUBLE_QUOTE_STRING_END_TABLE
1972            } else {
1973                &SINGLE_QUOTE_STRING_END_TABLE
1974            };
1975
1976            let fast_path_result = byte_search! {
1977                lexer: self,
1978                table: table,
1979                handle_eof: {
1980                    let value_end = self.cur_pos();
1981                    let s = unsafe {
1982                            // Safety: slice_start and value_end are valid position because we
1983                            // got them from `self.input`
1984                        self.input_slice(slice_start, value_end)
1985                    };
1986
1987                    self.emit_error(start, SyntaxError::UnterminatedStrLit);
1988
1989                    let end = self.cur_pos();
1990                    let raw = unsafe { self.input_slice(start, end) };
1991                    return Ok(Self::Token::str(self.atom(s), self.atom(raw), self));
1992                },
1993            };
1994
1995            match fast_path_result {
1996                b'"' | b'\'' if fast_path_result == quote => {
1997                    let value_end = self.cur_pos();
1998
1999                    let value = if let Some(buf) = buf.as_mut() {
2000                        // `buf` only exist when there has escape.
2001                        debug_assert!(unsafe { self.input_slice(start, value_end).contains('\\') });
2002                        let s = unsafe {
2003                            // Safety: slice_start and value_end are valid position because we
2004                            // got them from `self.input`
2005                            self.input_slice(slice_start, value_end)
2006                        };
2007                        buf.push_str(s);
2008                        self.atom(&*buf)
2009                    } else {
2010                        let s = unsafe { self.input_slice(slice_start, value_end) };
2011                        self.atom(s)
2012                    };
2013
2014                    unsafe {
2015                        // Safety: cur is quote
2016                        self.input_mut().bump();
2017                    }
2018
2019                    let end = self.cur_pos();
2020                    let raw = unsafe {
2021                        // Safety: start and end are valid position because we got them from
2022                        // `self.input`
2023                        self.input_slice(start, end)
2024                    };
2025                    let raw = self.atom(raw);
2026                    return Ok(Self::Token::str(value, raw, self));
2027                }
2028                b'\\' => {
2029                    let end = self.cur_pos();
2030                    let s = unsafe {
2031                        // Safety: start and end are valid position because we got them from
2032                        // `self.input`
2033                        self.input_slice(slice_start, end)
2034                    };
2035
2036                    if buf.is_none() {
2037                        buf = Some(s.to_string());
2038                    } else {
2039                        buf.as_mut().unwrap().push_str(s);
2040                    }
2041
2042                    if let Some(chars) = self.read_escaped_char(false)? {
2043                        for c in chars {
2044                            buf.as_mut().unwrap().extend(c);
2045                        }
2046                    }
2047
2048                    slice_start = self.cur_pos();
2049                    continue;
2050                }
2051                b'\n' | b'\r' => {
2052                    let end = self.cur_pos();
2053                    let s = unsafe {
2054                        // Safety: start and end are valid position because we got them from
2055                        // `self.input`
2056                        self.input_slice(slice_start, end)
2057                    };
2058
2059                    self.emit_error(start, SyntaxError::UnterminatedStrLit);
2060
2061                    let end = self.cur_pos();
2062
2063                    let raw = unsafe {
2064                        // Safety: start and end are valid position because we got them from
2065                        // `self.input`
2066                        self.input_slice(start, end)
2067                    };
2068                    return Ok(Self::Token::str(self.atom(s), self.atom(raw), self));
2069                }
2070                _ => self.bump(),
2071            }
2072        }
2073    }
2074
2075    fn read_keyword_with(
2076        &mut self,
2077        convert: &dyn Fn(&str) -> Option<Self::Token>,
2078    ) -> LexResult<Self::Token> {
2079        debug_assert!(self.cur().is_some());
2080
2081        let start = self.cur_pos();
2082        let (s, has_escape) = self.read_keyword_as_str_with()?;
2083        if let Some(word) = convert(s.as_ref()) {
2084            // Note: ctx is store in lexer because of this error.
2085            // 'await' and 'yield' may have semantic of reserved word, which means lexer
2086            // should know context or parser should handle this error. Our approach to this
2087            // problem is former one.
2088            if has_escape && word.is_reserved(self.ctx()) {
2089                self.error(
2090                    start,
2091                    SyntaxError::EscapeInReservedWord { word: Atom::new(s) },
2092                )
2093            } else {
2094                Ok(word)
2095            }
2096        } else {
2097            let atom = self.atom(s);
2098            Ok(Self::Token::unknown_ident(atom, self))
2099        }
2100    }
2101
2102    /// This is a performant version of [Lexer::read_word_as_str_with] for
2103    /// reading keywords. We should make sure the first byte is a valid
2104    /// ASCII.
2105    fn read_keyword_as_str_with(&mut self) -> LexResult<(Cow<'a, str>, bool)> {
2106        let slice_start = self.cur_pos();
2107
2108        // Fast path: try to scan ASCII identifier using byte_search
2109        // Performance optimization: check if first char disqualifies as keyword
2110        // Advance past first byte
2111        self.bump();
2112
2113        // Use byte_search to quickly scan to end of ASCII identifier
2114        let next_byte = byte_search! {
2115            lexer: self,
2116            table: NOT_ASCII_ID_CONTINUE_TABLE,
2117            handle_eof: {
2118                // Reached EOF, entire remainder is identifier
2119                let end = self.cur_pos();
2120                let s = unsafe {
2121                    // Safety: slice_start and end are valid position because we got them from
2122                    // `self.input`
2123                    self.input_slice(slice_start, end)
2124                };
2125
2126                return Ok((Cow::Borrowed(s), false));
2127            },
2128        };
2129
2130        // Check if we hit end of identifier or need to fall back to slow path
2131        if !next_byte.is_ascii() || next_byte == b'\\' {
2132            // Hit Unicode character or escape sequence, fall back to slow path from current
2133            // position
2134            self.read_word_as_str_with_slow_path(slice_start)
2135        } else {
2136            // Hit end of identifier (non-continue ASCII char)
2137            let end = self.cur_pos();
2138            let s = unsafe {
2139                // Safety: slice_start and end are valid position because we got them from
2140                // `self.input`
2141                self.input_slice(slice_start, end)
2142            };
2143
2144            Ok((Cow::Borrowed(s), false))
2145        }
2146    }
2147}
2148
2149pub fn pos_span(p: BytePos) -> Span {
2150    Span::new_with_checked(p, p)
2151}
2152
2153pub fn fixed_len_span(p: BytePos, len: u32) -> Span {
2154    Span::new_with_checked(p, p + BytePos(len))
2155}