swc_ecma_parser/lexer/state.rs

use std::mem::take;

use swc_common::BytePos;
use swc_ecma_ast::EsVersion;
use swc_ecma_lexer::{
    common::{
        lexer::{
            char::CharExt,
            comments_buffer::{BufferedCommentKind, CommentsBufferTrait},
            state::State as StateTrait,
            LexResult,
        },
        syntax::SyntaxFlags,
    },
    error::SyntaxError,
    TokenContexts,
};

use super::{Context, Input, Lexer, LexerTrait};
use crate::{
    error::Error,
    input::Tokens,
    lexer::{
        comments_buffer::CommentsBufferCheckpoint,
        token::{Token, TokenAndSpan, TokenValue},
    },
};
/// State of the lexer.
///
/// Ported from babylon.
#[derive(Clone)]
pub struct State {
    /// Whether a line break exists between the previous token and the new
    /// token.
    pub had_line_break: bool,
    /// Whether a line break existed before the last token.
    pub had_line_break_before_last: bool,
    /// TODO: Remove this field.
    is_first: bool,
    pub next_regexp: Option<BytePos>,
    pub start: BytePos,
    pub prev_hi: BytePos,

    pub(super) token_value: Option<TokenValue>,
    token_type: Option<Token>,
}

pub struct LexerCheckpoint {
    comments_buffer: CommentsBufferCheckpoint,
    state: State,
    ctx: Context,
    input_last_pos: BytePos,
}

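// A minimal sketch of how a parser could use the checkpoint API below for
// backtracking (hypothetical caller; `try_parse_arrow` is illustrative and
// not part of this crate):
//
//     let cp = lexer.checkpoint_save();
//     if try_parse_arrow(&mut lexer).is_err() {
//         // Restores lexer state, context, input position, and the
//         // comments buffered since the save.
//         lexer.checkpoint_load(cp);
//     }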
impl<'a> swc_ecma_lexer::common::input::Tokens<TokenAndSpan> for Lexer<'a> {
    type Checkpoint = LexerCheckpoint;

    fn checkpoint_save(&self) -> Self::Checkpoint {
        Self::Checkpoint {
            state: self.state.clone(),
            ctx: self.ctx,
            input_last_pos: self.input.last_pos(),
            comments_buffer: self
                .comments_buffer
                .as_ref()
                .map(|cb| cb.checkpoint_save())
                .unwrap_or_default(),
        }
    }

    fn checkpoint_load(&mut self, checkpoint: Self::Checkpoint) {
        self.state = checkpoint.state;
        self.ctx = checkpoint.ctx;
        unsafe { self.input.reset_to(checkpoint.input_last_pos) };
        if let Some(comments_buffer) = self.comments_buffer.as_mut() {
            comments_buffer.checkpoint_load(checkpoint.comments_buffer);
        }
    }

    #[inline]
    fn set_ctx(&mut self, ctx: Context) {
        if ctx.contains(Context::Module) && !self.module_errors.is_empty() {
            self.errors.append(&mut self.module_errors);
        }
        self.ctx = ctx
    }

    #[inline]
    fn ctx(&self) -> Context {
        self.ctx
    }

    #[inline]
    fn ctx_mut(&mut self) -> &mut Context {
        &mut self.ctx
    }

    #[inline]
    fn syntax(&self) -> SyntaxFlags {
        self.syntax
    }

    #[inline]
    fn target(&self) -> EsVersion {
        self.target
    }

    #[inline]
    fn start_pos(&self) -> BytePos {
        self.start_pos
    }

    #[inline]
    fn set_expr_allowed(&mut self, _: bool) {}

    #[inline]
    fn set_next_regexp(&mut self, start: Option<BytePos>) {
        self.state.next_regexp = start;
    }

    #[inline]
    fn token_context(&self) -> &TokenContexts {
        unreachable!();
    }

    #[inline]
    fn token_context_mut(&mut self) -> &mut TokenContexts {
        unreachable!();
    }

    #[inline]
    fn set_token_context(&mut self, _: TokenContexts) {
        unreachable!();
    }

    fn add_error(&mut self, error: Error) {
        self.errors.push(error);
    }
    fn add_module_mode_error(&mut self, error: Error) {
        if self.ctx.contains(Context::Module) {
            self.add_error(error);
            return;
        }
        self.module_errors.push(error);
    }

    #[inline]
    fn take_errors(&mut self) -> Vec<Error> {
        take(&mut self.errors)
    }

    #[inline]
    fn take_script_module_errors(&mut self) -> Vec<Error> {
        take(&mut self.module_errors)
    }

    #[inline]
    fn end_pos(&self) -> BytePos {
        self.input.end_pos()
    }

    #[inline]
    fn update_token_flags(&mut self, f: impl FnOnce(&mut swc_ecma_lexer::lexer::TokenFlags)) {
        f(&mut self.token_flags)
    }

    #[inline]
    fn token_flags(&self) -> swc_ecma_lexer::lexer::TokenFlags {
        self.token_flags
    }
}

impl crate::input::Tokens for Lexer<'_> {
    fn clone_token_value(&self) -> Option<TokenValue> {
        self.state.token_value.clone()
    }

    fn get_token_value(&self) -> Option<&TokenValue> {
        self.state.token_value.as_ref()
    }

    fn set_token_value(&mut self, token_value: Option<TokenValue>) {
        self.state.token_value = token_value;
    }

    fn take_token_value(&mut self) -> Option<TokenValue> {
        self.state.token_value.take()
    }

    fn rescan_jsx_token(&mut self, allow_multiline_jsx_text: bool, reset: BytePos) -> TokenAndSpan {
        unsafe {
            self.input.reset_to(reset);
        }
        Tokens::scan_jsx_token(self, allow_multiline_jsx_text)
    }

    fn rescan_jsx_open_el_terminal_token(&mut self, reset: BytePos) -> TokenAndSpan {
        unsafe {
            self.input.reset_to(reset);
        }
        Tokens::scan_jsx_open_el_terminal_token(self)
    }

    fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> TokenAndSpan {
        let start = self.cur_pos();
        let res = match self.scan_jsx_token(allow_multiline_jsx_text) {
            Ok(res) => Ok(res),
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Err(Token::Error)
            }
        };
        let token = match res {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = self.span(start);
        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }

    fn scan_jsx_open_el_terminal_token(&mut self) -> TokenAndSpan {
        self.skip_space::<true>();
        let start = self.input.cur_pos();
        let res = match self.scan_jsx_attrs_terminal_token() {
            Ok(res) => Ok(res),
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Err(Token::Error)
            }
        };
        let token = match res {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = self.span(start);
        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }

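    // JSX names may contain `-` (e.g. `data-foo` in `<div data-foo="1" />`),
    // which is not a valid identifier character in plain JS. The scanner
    // below therefore extends the already-lexed word token with `-` and
    // further identifier parts until neither follows.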
    fn scan_jsx_identifier(&mut self, start: BytePos) -> TokenAndSpan {
        let token = self.state.token_type.unwrap();
        debug_assert!(token.is_word());
        let mut v = String::with_capacity(16);
        while let Some(ch) = self.input().cur() {
            if ch == '-' {
                v.push(ch);
                self.bump();
            } else {
                let old_pos = self.cur_pos();
                v.push_str(&self.scan_identifier_parts());
                if self.cur_pos() == old_pos {
                    break;
                }
            }
        }
        let v = if !v.is_empty() {
            let v = if token.is_known_ident() {
                format!("{}{}", token.to_string(None), v)
            } else if let Some(TokenValue::Word(value)) = self.state.token_value.take() {
                format!("{value}{v}")
            } else {
                format!("{}{}", token.to_string(None), v)
            };
            self.atom(v)
        } else if token.is_known_ident() || token.is_keyword() {
            self.atom(token.to_string(None))
        } else if let Some(TokenValue::Word(value)) = self.state.token_value.take() {
            value
        } else {
            unreachable!(
                "`token_value` should be a word, but got: {:?}",
                self.state.token_value
            )
        };
        self.state.set_token_value(TokenValue::Word(v));
        TokenAndSpan {
            token: Token::JSXName,
            had_line_break: self.had_line_break_before_last(),
            span: self.span(start),
        }
    }

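    // Attribute values come in two shapes: a quoted string (`a="x"` or
    // `a='x'`), lexed as a JSX string below, and anything else (typically
    // `a={expr}`), which falls through to the regular tokenizer via
    // `self.next()`.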
    fn scan_jsx_attribute_value(&mut self) -> TokenAndSpan {
        let Some(cur) = self.cur() else {
            let start = self.cur_pos();
            return TokenAndSpan {
                token: Token::Eof,
                had_line_break: self.had_line_break_before_last(),
                span: self.span(start),
            };
        };
        let start = self.cur_pos();

        match cur {
            '\'' | '"' => {
                let token = self.read_jsx_str(cur);
                let token = match token {
                    Ok(token) => token,
                    Err(e) => {
                        self.state.set_token_value(TokenValue::Error(e));
                        return TokenAndSpan {
                            token: Token::Error,
                            had_line_break: self.had_line_break_before_last(),
                            span: self.span(start),
                        };
                    }
                };
                debug_assert!(self
                    .get_token_value()
                    .is_some_and(|t| matches!(t, TokenValue::Str { .. })));
                debug_assert!(token == Token::Str);
                TokenAndSpan {
                    token,
                    had_line_break: self.had_line_break_before_last(),
                    span: self.span(start),
                }
            }
            _ => self.next().unwrap_or_else(|| TokenAndSpan {
                token: Token::Eof,
                had_line_break: self.had_line_break_before_last(),
                span: self.span(start),
            }),
        }
    }

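    // Re-lexing is needed when the parser reaches the end of a template
    // interpolation: the `}` that ended the embedded expression must be
    // re-read as the start of a template continuation (positions below are
    // illustrative):
    //
    //     `a${x}b`
    //     ^          rescanned with `start_with_back_tick == true`
    //          ^     rescanned with `start_with_back_tick == false`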
    fn rescan_template_token(
        &mut self,
        start: BytePos,
        start_with_back_tick: bool,
    ) -> TokenAndSpan {
        unsafe { self.input.reset_to(start) };
        let res = self.scan_template_token(start, start_with_back_tick);
        let token = match res.map_err(|e| {
            self.state.set_token_value(TokenValue::Error(e));
            Token::Error
        }) {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = if start_with_back_tick {
            self.span(start)
        } else {
            // `+ BytePos(1)` skips the `}` that closed the interpolation.
            self.span(start + BytePos(1))
        };

        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }
}

impl Lexer<'_> {
    fn next_token(&mut self, start: &mut BytePos) -> Result<Token, Error> {
        if let Some(next_regexp) = self.state.next_regexp {
            *start = next_regexp;
            return self.read_regexp(next_regexp);
        }

        if self.state.is_first {
            if let Some(shebang) = self.read_shebang()? {
                self.state.set_token_value(TokenValue::Word(shebang));
                return Ok(Token::Shebang);
            }
        }

        self.state.had_line_break = self.state.is_first;
        self.state.is_first = false;

        self.skip_space::<true>();
        *start = self.input.cur_pos();

        if self.input.last_pos() == self.input.end_pos() {
            // End of input.
            self.consume_pending_comments();
            return Ok(Token::Eof);
        }

        // println!(
        //     "\tContext: ({:?}) {:?}",
        //     self.input.cur().unwrap(),
        //     self.state.context.0
        // );

        self.state.start = *start;

        self.read_token()
    }

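    // Scans JSX text between tags, e.g. the `hello &amp; world` in
    // `<div>hello &amp; world</div>`: plain chunks are copied as-is, `&...;`
    // entities are decoded via `read_jsx_entity`, and scanning stops at `<`,
    // `{`, or (when multiline text is disallowed) a line break after visible
    // content. Both the raw slice and the decoded value are recorded.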
    fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> Result<Token, Error> {
        debug_assert!(self.syntax.jsx());

        if self.input_mut().as_str().is_empty() {
            return Ok(Token::Eof);
        };

        if self.input.eat_byte(b'<') {
            return Ok(if self.input.eat_byte(b'/') {
                Token::LessSlash
            } else {
                Token::Lt
            });
        } else if self.input.eat_byte(b'{') {
            return Ok(Token::LBrace);
        }

        let start = self.input.cur_pos();
        let mut first_non_whitespace = 0;
        let mut chunk_start = start;
        let mut value = String::new();

        while let Some(ch) = self.input_mut().cur() {
            if ch == '{' {
                break;
            } else if ch == '<' {
                // TODO: check git conflict mark
                break;
            }

            if ch == '>' {
                self.emit_error(
                    self.input().cur_pos(),
                    SyntaxError::UnexpectedTokenWithSuggestions {
                        candidate_list: vec!["`{'>'}`", "`&gt;`"],
                    },
                );
            } else if ch == '}' {
                self.emit_error(
                    self.input().cur_pos(),
                    SyntaxError::UnexpectedTokenWithSuggestions {
                        candidate_list: vec!["`{'}'}`", "`&rbrace;`"],
                    },
                );
            }

            if first_non_whitespace == 0 && ch.is_line_terminator() {
                first_non_whitespace = -1;
            } else if !allow_multiline_jsx_text
                && ch.is_line_terminator()
                && first_non_whitespace > 0
            {
                break;
            } else if !ch.is_whitespace() {
                // Remember that visible (non-whitespace) content was seen.
                first_non_whitespace = self.cur_pos().0 as i32;
            }

            if ch == '&' {
                let cur_pos = self.input().cur_pos();

                let s = unsafe {
                    // Safety: We already checked for the range
                    self.input_slice(chunk_start, cur_pos)
                };
                value.push_str(s);

                if let Ok(jsx_entity) = self.read_jsx_entity() {
                    value.push(jsx_entity.0);

                    chunk_start = self.input.cur_pos();
                }
            } else {
                self.bump();
            }
        }

        let end = self.input().cur_pos();
        let raw = unsafe {
            // Safety: Both of `start` and `end` are generated from `cur_pos()`
            self.input_slice(start, end)
        };
        let value = if value.is_empty() {
            self.atom(raw)
        } else {
            let s = unsafe {
                // Safety: We already checked for the range
                self.input_slice(chunk_start, end)
            };
            value.push_str(s);
            self.atom(value)
        };

        let raw: swc_atoms::Atom = self.atom(raw);

        self.state.set_token_value(TokenValue::Str { raw, value });

        self.state.start = start;

        Ok(Token::JSXText)
    }

    fn scan_jsx_attrs_terminal_token(&mut self) -> LexResult<Token> {
        if self.input_mut().as_str().is_empty() {
            Ok(Token::Eof)
        } else if self.input.eat_byte(b'>') {
            Ok(Token::Gt)
        } else if self.input.eat_byte(b'/') {
            Ok(Token::Slash)
        } else {
            self.read_token()
        }
    }

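    // Consumes identifier-part characters, decoding `\uXXXX` escapes as it
    // goes (e.g. `\u0041` contributes `A`) and setting the UNICODE token
    // flag when an escape is seen.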
    fn scan_identifier_parts(&mut self) -> String {
        let mut v = String::with_capacity(16);
        while let Some(ch) = self.input().cur() {
            if ch.is_ident_part() {
                v.push(ch);
                self.input_mut().bump_bytes(ch.len_utf8());
            } else if ch == '\\' {
                self.bump(); // bump '\'
                if !self.is(b'u') {
                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
                    continue;
                }
                self.bump(); // bump 'u'
                let Ok(chars) = self.read_unicode_escape() else {
                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
                    break;
                };
                for c in chars {
                    v.extend(c);
                }
                self.token_flags |= swc_ecma_lexer::lexer::TokenFlags::UNICODE;
            } else {
                break;
            }
        }
        v
    }
}

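// A minimal sketch of driving the lexer as an iterator (hypothetical setup;
// the `Lexer::new` signature here is illustrative, not this crate's exact
// constructor):
//
//     let mut lexer = Lexer::new(syntax, target, input, None);
//     while let Some(TokenAndSpan { token, span, .. }) = lexer.next() {
//         // Errors surface as `Token::Error` with the payload stored in the
//         // token value; iteration ends (`None`) at EOF.
//         println!("{:?} @ {:?}", token, span);
//     }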
impl Iterator for Lexer<'_> {
    type Item = TokenAndSpan;

    fn next(&mut self) -> Option<Self::Item> {
        let mut start = self.cur_pos();

        let token = match self.next_token(&mut start) {
            Ok(res) => res,
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Token::Error
            }
        };

        let span = self.span(start);
        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
            // Attach span to token.
            Some(TokenAndSpan {
                token,
                had_line_break: self.had_line_break_before_last(),
                span,
            })
        } else {
            None
        }
    }
}

impl State {
    pub fn new(start_pos: BytePos) -> Self {
        State {
            had_line_break: false,
            had_line_break_before_last: false,
            is_first: true,
            next_regexp: None,
            start: BytePos(0),
            prev_hi: start_pos,
            token_value: None,
            token_type: None,
        }
    }

    pub(crate) fn set_token_value(&mut self, token_value: TokenValue) {
        self.token_value = Some(token_value);
    }
}

impl swc_ecma_lexer::common::lexer::state::State for State {
    type TokenKind = Token;
    type TokenType = Token;

    #[inline(always)]
    fn is_expr_allowed(&self) -> bool {
        unreachable!("is_expr_allowed should not be called in Parser/State")
    }

    #[inline(always)]
    fn set_is_expr_allowed(&mut self, _: bool) {
        // noop
    }

    #[inline(always)]
    fn set_next_regexp(&mut self, start: Option<BytePos>) {
        self.next_regexp = start;
    }

    #[inline(always)]
    fn had_line_break(&self) -> bool {
        self.had_line_break
    }

    #[inline(always)]
    fn mark_had_line_break(&mut self) {
        self.had_line_break = true;
    }

    #[inline(always)]
    fn had_line_break_before_last(&self) -> bool {
        self.had_line_break_before_last
    }

    #[inline(always)]
    fn token_contexts(&self) -> &swc_ecma_lexer::TokenContexts {
        unreachable!();
    }

    #[inline(always)]
    fn mut_token_contexts(&mut self) -> &mut swc_ecma_lexer::TokenContexts {
        unreachable!();
    }

    #[inline(always)]
    fn set_token_type(&mut self, token_type: Self::TokenType) {
        self.token_type = Some(token_type);
    }

    #[inline(always)]
    fn token_type(&self) -> Option<Self::TokenType> {
        self.token_type
    }

    #[inline(always)]
    fn syntax(&self) -> SyntaxFlags {
        unreachable!("syntax is not stored in State, but in Lexer")
    }

    #[inline(always)]
    fn prev_hi(&self) -> BytePos {
        self.prev_hi
    }

    #[inline(always)]
    fn start(&self) -> BytePos {
        self.start
    }
}