swc_ecma_parser/lexer/
state.rs

1use std::mem::take;
2
3use swc_atoms::wtf8::CodePoint;
4use swc_common::BytePos;
5use swc_ecma_ast::EsVersion;
6
7use super::{Context, Input, Lexer};
8use crate::{
9    error::{Error, SyntaxError},
10    input::Tokens,
11    lexer::{
12        char_ext::CharExt,
13        comments_buffer::{BufferedCommentKind, CommentsBufferCheckpoint},
14        token::{Token, TokenAndSpan, TokenValue},
15        LexResult,
16    },
17    syntax::SyntaxFlags,
18};
19
bitflags::bitflags! {
    /// Per-token flags recorded while scanning the current token.
    #[derive(Debug, Default, Clone, Copy)]
    pub struct TokenFlags: u8 {
        /// Set when the token contained a `\u` escape sequence
        /// (see `scan_identifier_parts`).
        const UNICODE = 1 << 0;
    }
}
26
/// State of lexer.
///
/// Ported from babylon.
#[derive(Clone)]
pub struct State {
    /// if line break exists between previous token and new token?
    pub had_line_break: bool,
    /// `true` until the first token has been produced; used to detect a
    /// shebang at the very start of the input (see `Lexer::next_token`).
    /// TODO: Remove this field.
    is_first: bool,
    /// When set, the next token is re-scanned as a regexp starting here.
    pub next_regexp: Option<BytePos>,
    /// Start position of the token currently being scanned.
    pub start: BytePos,
    /// End position of the previously produced token.
    pub prev_hi: BytePos,

    /// Payload (word/string/error/...) attached to the current token.
    pub(super) token_value: Option<TokenValue>,
    /// Kind of the most recently produced token.
    token_type: Option<Token>,
}
43
/// Snapshot of the lexer's mutable state, produced by `checkpoint_save`
/// and consumed by `checkpoint_load` to backtrack the lexer.
pub struct LexerCheckpoint {
    comments_buffer: CommentsBufferCheckpoint,
    state: State,
    ctx: Context,
    input_last_pos: BytePos,
}
50
impl crate::input::Tokens for Lexer<'_> {
    type Checkpoint = LexerCheckpoint;

    /// Captures everything needed to rewind the lexer: scanner state,
    /// context flags, current input position and buffered comments.
    fn checkpoint_save(&self) -> LexerCheckpoint {
        LexerCheckpoint {
            state: self.state.clone(),
            ctx: self.ctx,
            input_last_pos: self.input.last_pos(),
            comments_buffer: self
                .comments_buffer
                .as_ref()
                .map(|cb| cb.checkpoint_save())
                .unwrap_or_default(),
        }
    }

    /// Restores a snapshot previously produced by `checkpoint_save`.
    fn checkpoint_load(&mut self, checkpoint: LexerCheckpoint) {
        self.state = checkpoint.state;
        self.ctx = checkpoint.ctx;
        // SAFETY: `input_last_pos` was obtained from `self.input.last_pos()`
        // in `checkpoint_save`, so it is a valid position of this input.
        unsafe { self.input.reset_to(checkpoint.input_last_pos) };
        if let Some(comments_buffer) = self.comments_buffer.as_mut() {
            comments_buffer.checkpoint_load(checkpoint.comments_buffer);
        }
    }

    /// Replaces the lexer context. Entering module mode promotes any
    /// deferred script-vs-module errors into real errors.
    #[inline]
    fn set_ctx(&mut self, ctx: Context) {
        if ctx.contains(Context::Module) && !self.module_errors.is_empty() {
            self.errors.append(&mut self.module_errors);
        }
        self.ctx = ctx
    }

    #[inline]
    fn ctx(&self) -> Context {
        self.ctx
    }

    #[inline]
    fn ctx_mut(&mut self) -> &mut Context {
        &mut self.ctx
    }

    #[inline]
    fn syntax(&self) -> SyntaxFlags {
        self.syntax
    }

    #[inline]
    fn target(&self) -> EsVersion {
        self.target
    }

    #[inline]
    fn start_pos(&self) -> BytePos {
        self.start_pos
    }

    /// No-op: this lexer does not track "expression allowed" state.
    #[inline]
    fn set_expr_allowed(&mut self, _: bool) {}

    /// Requests that the next token be re-scanned as a regexp literal
    /// starting at `start` (`None` clears the request).
    #[inline]
    fn set_next_regexp(&mut self, start: Option<BytePos>) {
        self.state.next_regexp = start;
    }

    fn add_error(&mut self, error: Error) {
        self.errors.push(error);
    }

    /// Records an error that only applies in module mode: reported
    /// immediately in module context, otherwise kept aside in case the file
    /// later turns out to be a module (see `set_ctx`).
    fn add_module_mode_error(&mut self, error: Error) {
        if self.ctx.contains(Context::Module) {
            self.add_error(error);
            return;
        }
        self.module_errors.push(error);
    }

    #[inline]
    fn take_errors(&mut self) -> Vec<Error> {
        take(&mut self.errors)
    }

    #[inline]
    fn take_script_module_errors(&mut self) -> Vec<Error> {
        take(&mut self.module_errors)
    }

    #[inline]
    fn end_pos(&self) -> BytePos {
        self.input.end_pos()
    }

    #[inline]
    fn update_token_flags(&mut self, f: impl FnOnce(&mut TokenFlags)) {
        f(&mut self.token_flags)
    }

    #[inline]
    fn token_flags(&self) -> TokenFlags {
        self.token_flags
    }

    fn clone_token_value(&self) -> Option<TokenValue> {
        self.state.token_value.clone()
    }

    fn get_token_value(&self) -> Option<&TokenValue> {
        self.state.token_value.as_ref()
    }

    fn set_token_value(&mut self, token_value: Option<TokenValue>) {
        self.state.token_value = token_value;
    }

    fn take_token_value(&mut self) -> Option<TokenValue> {
        self.state.token_value.take()
    }

    /// Rewinds the input to `reset` and scans a JSX token again.
    // NOTE(review): assumes `reset` is a position previously obtained from
    // this input — confirm at call sites.
    fn rescan_jsx_token(&mut self, allow_multiline_jsx_text: bool, reset: BytePos) -> TokenAndSpan {
        unsafe {
            self.input.reset_to(reset);
        }
        Tokens::scan_jsx_token(self, allow_multiline_jsx_text)
    }

    /// Rewinds the input to `reset` and re-scans the token that terminates a
    /// JSX opening element (`>` or `/`).
    fn rescan_jsx_open_el_terminal_token(&mut self, reset: BytePos) -> TokenAndSpan {
        unsafe {
            self.input.reset_to(reset);
        }
        Tokens::scan_jsx_open_el_terminal_token(self)
    }

    /// Scans a JSX child token and packages it with its span; a lexing error
    /// becomes `Token::Error` with the error stored as the token value.
    fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> TokenAndSpan {
        let start = self.cur_pos();
        // Delegates to the inherent `Lexer::scan_jsx_token`, which takes
        // precedence over this trait method in method resolution.
        let res = match self.scan_jsx_token(allow_multiline_jsx_text) {
            Ok(res) => Ok(res),
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Err(Token::Error)
            }
        };
        let token = match res {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = self.span(start);
        if token != Token::Eof {
            // Flush pending comments as leading comments of this token.
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }

    /// Scans the token ending a JSX opening element's attribute list and
    /// packages it with its span; errors become `Token::Error`.
    fn scan_jsx_open_el_terminal_token(&mut self) -> TokenAndSpan {
        self.skip_space::<true>();
        let start = self.input.cur_pos();
        let res = match self.scan_jsx_attrs_terminal_token() {
            Ok(res) => Ok(res),
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Err(Token::Error)
            }
        };
        let token = match res {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = self.span(start);
        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }

    /// Extends the current word token into a JSX identifier, which may
    /// contain `-` (e.g. `data-foo`), and stores the full name as the value
    /// of a `JSXName` token.
    fn scan_jsx_identifier(&mut self, start: BytePos) -> TokenAndSpan {
        let token = self.state.token_type.unwrap();
        debug_assert!(token.is_word());
        let mut v = String::with_capacity(16);
        while let Some(ch) = self.input().cur() {
            if ch == '-' {
                v.push(ch);
                self.bump();
            } else {
                let old_pos = self.cur_pos();
                v.push_str(&self.scan_identifier_parts());
                // No forward progress means no more identifier characters.
                if self.cur_pos() == old_pos {
                    break;
                }
            }
        }
        // Prepend the already-scanned word: keywords/known idents stringify
        // directly, otherwise the scanned text lives in `token_value`.
        let v = if !v.is_empty() {
            let v = if token.is_known_ident() || token.is_keyword() {
                format!("{}{}", token.to_string(None), v)
            } else if let Some(TokenValue::Word(value)) = self.state.token_value.take() {
                format!("{value}{v}")
            } else {
                format!("{}{}", token.to_string(None), v)
            };
            self.atom(v)
        } else if token.is_known_ident() || token.is_keyword() {
            self.atom(token.to_string(None))
        } else if let Some(TokenValue::Word(value)) = self.state.token_value.take() {
            value
        } else {
            unreachable!(
                "`token_value` should be a word, but got: {:?}",
                self.state.token_value
            )
        };
        self.state.set_token_value(TokenValue::Word(v));
        TokenAndSpan {
            token: Token::JSXName,
            had_line_break: self.had_line_break_before_last(),
            span: self.span(start),
        }
    }

    /// Scans a JSX attribute value. Quoted values become `Token::Str`;
    /// anything else falls through to ordinary tokenization via `self.next()`.
    fn scan_jsx_attribute_value(&mut self) -> TokenAndSpan {
        let Some(cur) = self.cur() else {
            let start = self.cur_pos();
            return TokenAndSpan {
                token: Token::Eof,
                had_line_break: self.had_line_break_before_last(),
                span: self.span(start),
            };
        };
        let start = self.cur_pos();

        match cur {
            '\'' | '"' => {
                let token = self.read_jsx_str(cur);
                let token = match token {
                    Ok(token) => token,
                    Err(e) => {
                        self.state.set_token_value(TokenValue::Error(e));
                        return TokenAndSpan {
                            token: Token::Error,
                            had_line_break: self.had_line_break_before_last(),
                            span: self.span(start),
                        };
                    }
                };
                // `read_jsx_str` stores the string payload in the token value.
                debug_assert!(self
                    .get_token_value()
                    .is_some_and(|t| matches!(t, TokenValue::Str { .. })));
                debug_assert!(token == Token::Str);
                TokenAndSpan {
                    token,
                    had_line_break: self.had_line_break_before_last(),
                    span: self.span(start),
                }
            }
            _ => self.next().unwrap_or_else(|| TokenAndSpan {
                token: Token::Eof,
                had_line_break: self.had_line_break_before_last(),
                span: self.span(start),
            }),
        }
    }

    /// Rewinds to `start` and re-scans a template token. `start_with_back_tick`
    /// distinguishes the template head (`` ` ``) from a middle/tail piece that
    /// begins after a `}`.
    fn rescan_template_token(
        &mut self,
        start: BytePos,
        start_with_back_tick: bool,
    ) -> TokenAndSpan {
        unsafe { self.input.reset_to(start) };
        let res = self.scan_template_token(start, start_with_back_tick);
        let token = match res.map_err(|e| {
            self.state.set_token_value(TokenValue::Error(e));
            Token::Error
        }) {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = if start_with_back_tick {
            self.span(start)
        } else {
            // `+ BytePos(1)` is used to skip `{`
            self.span(start + BytePos(1))
        };

        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }
}
368
369impl Lexer<'_> {
370    fn next_token(&mut self, start: &mut BytePos) -> Result<Token, Error> {
371        if let Some(next_regexp) = self.state.next_regexp {
372            *start = next_regexp;
373            return self.read_regexp(next_regexp);
374        }
375
376        if self.state.is_first {
377            if let Some(shebang) = self.read_shebang()? {
378                self.state.set_token_value(TokenValue::Word(shebang));
379                return Ok(Token::Shebang);
380            }
381        }
382
383        self.state.had_line_break = self.state.is_first;
384        self.state.is_first = false;
385
386        self.skip_space::<true>();
387        *start = self.input.cur_pos();
388
389        if self.input.last_pos() == self.input.end_pos() {
390            // End of input.
391            self.consume_pending_comments();
392            return Ok(Token::Eof);
393        }
394
395        // println!(
396        //     "\tContext: ({:?}) {:?}",
397        //     self.input.cur().unwrap(),
398        //     self.state.context.0
399        // );
400
401        self.state.start = *start;
402
403        self.read_token()
404    }
405
406    fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> Result<Token, Error> {
407        debug_assert!(self.syntax.jsx());
408
409        if self.input_mut().as_str().is_empty() {
410            return Ok(Token::Eof);
411        };
412
413        if self.input.eat_byte(b'<') {
414            return Ok(if self.input.eat_byte(b'/') {
415                Token::LessSlash
416            } else {
417                Token::Lt
418            });
419        } else if self.input.eat_byte(b'{') {
420            return Ok(Token::LBrace);
421        }
422
423        let start = self.input.cur_pos();
424        let mut first_non_whitespace = 0;
425        let mut chunk_start = start;
426        let mut value = String::new();
427
428        while let Some(ch) = self.input_mut().cur() {
429            if ch == '{' {
430                break;
431            } else if ch == '<' {
432                // TODO: check git conflict mark
433                break;
434            }
435
436            if ch == '>' {
437                self.emit_error(
438                    self.input().cur_pos(),
439                    SyntaxError::UnexpectedTokenWithSuggestions {
440                        candidate_list: vec!["`{'>'}`", "`&gt;`"],
441                    },
442                );
443            } else if ch == '}' {
444                self.emit_error(
445                    self.input().cur_pos(),
446                    SyntaxError::UnexpectedTokenWithSuggestions {
447                        candidate_list: vec!["`{'}'}`", "`&rbrace;`"],
448                    },
449                );
450            }
451
452            if first_non_whitespace == 0 && ch.is_line_terminator() {
453                first_non_whitespace = -1;
454            } else if !allow_multiline_jsx_text
455                && ch.is_line_terminator()
456                && first_non_whitespace > 0
457            {
458                break;
459            } else if ch.is_whitespace() {
460                first_non_whitespace = self.cur_pos().0 as i32;
461            }
462
463            if ch == '&' {
464                let s = unsafe {
465                    // Safety: We already checked for the range
466                    self.input_slice_to_cur(chunk_start)
467                };
468                value.push_str(s);
469
470                if let Ok(jsx_entity) = self.read_jsx_entity() {
471                    value.push(jsx_entity.0);
472
473                    chunk_start = self.input.cur_pos();
474                }
475            } else {
476                self.bump();
477            }
478        }
479
480        let raw = unsafe {
481            // Safety: Both of `start` and `end` are generated from `cur_pos()`
482            self.input_slice_to_cur(start)
483        };
484        let value = if value.is_empty() {
485            self.atom(raw)
486        } else {
487            let s = unsafe {
488                // Safety: We already checked for the range
489                self.input_slice_to_cur(chunk_start)
490            };
491            value.push_str(s);
492            self.atom(value)
493        };
494
495        let raw: swc_atoms::Atom = self.atom(raw);
496
497        self.state.set_token_value(TokenValue::Str {
498            raw,
499            value: value.into(),
500        });
501
502        self.state.start = start;
503
504        Ok(Token::JSXText)
505    }
506
507    fn scan_jsx_attrs_terminal_token(&mut self) -> LexResult<Token> {
508        if self.input_mut().as_str().is_empty() {
509            Ok(Token::Eof)
510        } else if self.input.eat_byte(b'>') {
511            Ok(Token::Gt)
512        } else if self.input.eat_byte(b'/') {
513            Ok(Token::Slash)
514        } else {
515            self.read_token()
516        }
517    }
518
519    fn scan_identifier_parts(&mut self) -> String {
520        let mut v = String::with_capacity(16);
521        while let Some(ch) = self.input().cur() {
522            if ch.is_ident_part() {
523                v.push(ch);
524                self.input_mut().bump_bytes(ch.len_utf8());
525            } else if ch == '\\' {
526                self.bump(); // bump '\'
527                if !self.is(b'u') {
528                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
529                    continue;
530                }
531                self.bump(); // bump 'u'
532                let Ok(value) = self.read_unicode_escape() else {
533                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
534                    break;
535                };
536                if let Some(c) = CodePoint::from(value).to_char() {
537                    v.push(c);
538                } else {
539                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
540                }
541                self.token_flags |= TokenFlags::UNICODE;
542            } else {
543                break;
544            }
545        }
546        v
547    }
548}
549
550impl Iterator for Lexer<'_> {
551    type Item = TokenAndSpan;
552
553    fn next(&mut self) -> Option<Self::Item> {
554        let mut start = self.cur_pos();
555
556        let token = match self.next_token(&mut start) {
557            Ok(res) => res,
558            Err(error) => {
559                self.state.set_token_value(TokenValue::Error(error));
560                Token::Error
561            }
562        };
563
564        let span = self.span(start);
565        if token != Token::Eof {
566            if let Some(comments) = self.comments_buffer.as_mut() {
567                comments.pending_to_comment(BufferedCommentKind::Leading, start);
568            }
569
570            self.state.set_token_type(token);
571            self.state.prev_hi = self.last_pos();
572            // Attach span to token.
573            Some(TokenAndSpan {
574                token,
575                had_line_break: self.had_line_break_before_last(),
576                span,
577            })
578        } else {
579            None
580        }
581    }
582}
583
584impl State {
585    pub fn new(start_pos: BytePos) -> Self {
586        State {
587            had_line_break: false,
588            is_first: true,
589            next_regexp: None,
590            start: BytePos(0),
591            prev_hi: start_pos,
592            token_value: None,
593            token_type: None,
594        }
595    }
596
597    pub(crate) fn set_token_value(&mut self, token_value: TokenValue) {
598        self.token_value = Some(token_value);
599    }
600}
601
602impl State {
603    #[inline(always)]
604    pub fn had_line_break(&self) -> bool {
605        self.had_line_break
606    }
607
608    #[inline(always)]
609    pub fn mark_had_line_break(&mut self) {
610        self.had_line_break = true;
611    }
612
613    #[inline(always)]
614    pub fn set_token_type(&mut self, token_type: Token) {
615        self.token_type = Some(token_type);
616    }
617
618    #[inline(always)]
619    pub fn token_type(&self) -> Option<Token> {
620        self.token_type
621    }
622
623    #[inline(always)]
624    pub fn prev_hi(&self) -> BytePos {
625        self.prev_hi
626    }
627
628    #[inline(always)]
629    pub fn start(&self) -> BytePos {
630        self.start
631    }
632
633    pub fn can_have_trailing_line_comment(&self) -> bool {
634        let Some(t) = self.token_type() else {
635            return true;
636        };
637        !t.is_bin_op()
638    }
639
640    pub fn can_have_trailing_comment(&self) -> bool {
641        self.token_type().is_some_and(|t| {
642            !t.is_keyword()
643                && (t == Token::Semi
644                    || t == Token::LBrace
645                    || t.is_other_and_can_have_trailing_comment())
646        })
647    }
648}