swc_ecma_parser/lexer/
mod.rs

//! ECMAScript lexer.
2
3use std::{char, iter::FusedIterator, rc::Rc};
4
5use swc_atoms::AtomStoreCell;
6use swc_common::{
7    comments::Comments,
8    input::{Input, StringInput},
9    BytePos,
10};
11use swc_ecma_ast::EsVersion;
12use swc_ecma_lexer::{
13    common::{
14        lexer::{char::CharExt, fixed_len_span, pos_span, LexResult, Lexer as LexerTrait},
15        syntax::SyntaxFlags,
16    },
17    lexer::TokenFlags,
18};
19
20use self::table::{ByteHandler, BYTE_HANDLERS};
21use crate::{
22    error::{Error, SyntaxError},
23    input::Tokens,
24    lexer::comments_buffer::CommentsBuffer,
25    Context, Syntax,
26};
27
28#[cfg(feature = "unstable")]
29pub(crate) mod capturing;
30mod comments_buffer;
31mod state;
32mod table;
33pub(crate) mod token;
34
35pub(crate) use token::{NextTokenAndSpan, Token, TokenAndSpan, TokenValue};
36
/// Hand-written ECMAScript lexer operating over a [`StringInput`].
///
/// Dispatches on the first byte of the remaining input (see `read_token` and
/// the `table` module) and accumulates errors instead of failing fast.
#[derive(Clone)]
pub struct Lexer<'a> {
    // Optional sink that receives lexed comments.
    comments: Option<&'a dyn Comments>,
    /// [Some] if comment parsing is enabled. Otherwise [None]
    comments_buffer: Option<CommentsBuffer>,

    pub ctx: Context,
    input: StringInput<'a>,
    // Position of the input when this lexer was constructed.
    start_pos: BytePos,

    state: self::state::State,
    // Per-token flags; reset at the start of every `read_token` call.
    token_flags: TokenFlags,
    pub(crate) syntax: SyntaxFlags,
    pub(crate) target: EsVersion,

    // Recoverable errors collected while lexing.
    errors: Vec<Error>,
    // Errors that only apply when the file is parsed as a module.
    module_errors: Vec<Error>,

    // Shared string interner used to build atoms for identifiers/strings.
    atoms: Rc<AtomStoreCell>,
}
57
// Marker impl: promises the `Iterator` impl (defined elsewhere in this module)
// keeps returning `None` once exhausted — NOTE(review): relies on that impl,
// which is not visible here; confirm it never resumes after `None`.
impl FusedIterator for Lexer<'_> {}
59
/// Plumbing required by the shared lexer trait: simple accessors that expose
/// this lexer's input, state, comment machinery and interner to the generic
/// lexing routines in `swc_ecma_lexer`.
impl<'a> swc_ecma_lexer::common::lexer::Lexer<'a, TokenAndSpan> for Lexer<'a> {
    type CommentsBuffer = CommentsBuffer;
    type State = self::state::State;
    type Token = self::Token;

    #[inline(always)]
    fn input(&self) -> &StringInput<'a> {
        &self.input
    }

    #[inline(always)]
    fn input_mut(&mut self) -> &mut StringInput<'a> {
        &mut self.input
    }

    // Records a recoverable error; lexing continues.
    #[inline(always)]
    fn push_error(&mut self, error: Error) {
        self.errors.push(error);
    }

    #[inline(always)]
    fn state(&self) -> &Self::State {
        &self.state
    }

    #[inline(always)]
    fn state_mut(&mut self) -> &mut Self::State {
        &mut self.state
    }

    #[inline(always)]
    fn comments(&self) -> Option<&'a dyn swc_common::comments::Comments> {
        self.comments
    }

    #[inline(always)]
    fn comments_buffer(&self) -> Option<&Self::CommentsBuffer> {
        self.comments_buffer.as_ref()
    }

    #[inline(always)]
    fn comments_buffer_mut(&mut self) -> Option<&mut Self::CommentsBuffer> {
        self.comments_buffer.as_mut()
    }

    /// # Safety
    ///
    /// `start` and `end` must be valid byte positions within `self.input`
    /// (this forwards to the unsafe `StringInput::slice`).
    #[inline(always)]
    unsafe fn input_slice(&mut self, start: BytePos, end: BytePos) -> &'a str {
        self.input.slice(start, end)
    }

    // Consumes input while `f` holds and returns the consumed slice.
    #[inline(always)]
    fn input_uncons_while(&mut self, f: impl FnMut(char) -> bool) -> &'a str {
        self.input_mut().uncons_while(f)
    }

    // Interns `s` in the shared atom store.
    #[inline(always)]
    fn atom<'b>(&self, s: impl Into<std::borrow::Cow<'b, str>>) -> swc_atoms::Atom {
        self.atoms.atom(s)
    }
}
120
121impl<'a> Lexer<'a> {
122    pub fn new(
123        syntax: Syntax,
124        target: EsVersion,
125        input: StringInput<'a>,
126        comments: Option<&'a dyn Comments>,
127    ) -> Self {
128        let start_pos = input.last_pos();
129
130        Lexer {
131            comments,
132            comments_buffer: comments.is_some().then(CommentsBuffer::new),
133            ctx: Default::default(),
134            input,
135            start_pos,
136            state: self::state::State::new(start_pos),
137            syntax: syntax.into_flags(),
138            target,
139            errors: Default::default(),
140            module_errors: Default::default(),
141            atoms: Default::default(),
142            token_flags: TokenFlags::empty(),
143        }
144    }
145
146    /// babel: `getTokenFromCode`
147    fn read_token(&mut self) -> LexResult<Token> {
148        self.token_flags = TokenFlags::empty();
149        let byte = match self.input.as_str().as_bytes().first() {
150            Some(&v) => v,
151            None => return Ok(Token::Eof),
152        };
153
154        let handler = unsafe { *(&BYTE_HANDLERS as *const ByteHandler).offset(byte as isize) };
155        handler(self)
156    }
157
158    fn read_token_plus_minus<const C: u8>(&mut self) -> LexResult<Token> {
159        let start = self.cur_pos();
160
161        unsafe {
162            // Safety: cur() is Some(c), if this method is called.
163            self.input.bump();
164        }
165
166        // '++', '--'
167        Ok(if self.input.cur() == Some(C as char) {
168            unsafe {
169                // Safety: cur() is Some(c)
170                self.input.bump();
171            }
172
173            // Handle -->
174            if self.state.had_line_break && C == b'-' && self.eat(b'>') {
175                self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule);
176                self.skip_line_comment(0);
177                self.skip_space::<true>();
178                return self.read_token();
179            }
180
181            if C == b'+' {
182                Token::PlusPlus
183            } else {
184                Token::MinusMinus
185            }
186        } else if self.input.eat_byte(b'=') {
187            if C == b'+' {
188                Token::PlusEq
189            } else {
190                Token::MinusEq
191            }
192        } else if C == b'+' {
193            Token::Plus
194        } else {
195            Token::Minus
196        })
197    }
198
199    fn read_token_bang_or_eq<const C: u8>(&mut self) -> LexResult<Token> {
200        let start = self.cur_pos();
201        let had_line_break_before_last = self.had_line_break_before_last();
202
203        unsafe {
204            // Safety: cur() is Some(c) if this method is called.
205            self.input.bump();
206        }
207
208        Ok(if self.input.eat_byte(b'=') {
209            // "=="
210
211            if self.input.eat_byte(b'=') {
212                if C == b'!' {
213                    Token::NotEqEq
214                } else {
215                    // =======
216                    //    ^
217                    if had_line_break_before_last && self.is_str("====") {
218                        self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185);
219                        self.skip_line_comment(4);
220                        self.skip_space::<true>();
221                        return self.read_token();
222                    }
223
224                    Token::EqEqEq
225                }
226            } else if C == b'!' {
227                Token::NotEq
228            } else {
229                Token::EqEq
230            }
231        } else if C == b'=' && self.input.eat_byte(b'>') {
232            // "=>"
233
234            Token::Arrow
235        } else if C == b'!' {
236            Token::Bang
237        } else {
238            Token::Eq
239        })
240    }
241}
242
impl Lexer<'_> {
    /// Lexes a token starting with `<` or `>` (selected by `C`):
    /// relational operators, shifts (`<< >> >>>`), their `=`-compound forms,
    /// the legacy `<!--` comment, and merge-conflict-marker recovery.
    #[inline(never)]
    fn read_token_lt_gt<const C: u8>(&mut self) -> LexResult<Token> {
        let had_line_break_before_last = self.had_line_break_before_last();
        let start = self.cur_pos();
        self.bump();

        // In a TypeScript type position, `<`/`>` are always single-char
        // tokens (e.g. generic argument brackets), never shifts.
        if self.syntax.typescript()
            && self.ctx.contains(Context::InType)
            && !self.ctx.contains(Context::ShouldNotLexLtOrGtAsType)
        {
            if C == b'<' {
                return Ok(Token::Lt);
            } else if C == b'>' {
                return Ok(Token::Gt);
            }
        }

        // XML style comment. `<!--` — treated as a line comment in scripts,
        // reported as an error in module mode.
        if C == b'<' && self.is(b'!') && self.peek() == Some('-') && self.peek_ahead() == Some('-')
        {
            self.skip_line_comment(3);
            self.skip_space::<true>();
            self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule);

            return self.read_token();
        }

        let mut op = if C == b'<' { Token::Lt } else { Token::Gt };

        // '<<', '>>'
        if self.cur() == Some(C as char) {
            self.bump();
            op = if C == b'<' {
                Token::LShift
            } else {
                Token::RShift
            };

            //'>>>'
            if C == b'>' && self.cur() == Some(C as char) {
                self.bump();
                op = Token::ZeroFillRShift;
            }
        }

        // A trailing '=' turns the operator into its compound-assignment /
        // comparison form.
        let token = if self.eat(b'=') {
            match op {
                Token::Lt => Token::LtEq,
                Token::Gt => Token::GtEq,
                Token::LShift => Token::LShiftEq,
                Token::RShift => Token::RShiftEq,
                Token::ZeroFillRShift => Token::ZeroFillRShiftEq,
                // `op` can only be one of the five variants set above.
                _ => unreachable!(),
            }
        } else {
            op
        };

        // All conflict markers consist of the same character repeated seven times.
        // If it is a <<<<<<< or >>>>>>> marker then it is also followed by a space.
        // <<<<<<<
        //   ^
        // >>>>>>>
        //    ^
        // (We already consumed 2 resp. 3 characters, hence the remaining
        // "<<<<< " / ">>>> " lookahead.)
        if had_line_break_before_last
            && match op {
                Token::LShift if self.is_str("<<<<< ") => true,
                Token::ZeroFillRShift if self.is_str(">>>> ") => true,
                _ => false,
            }
        {
            self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185);
            self.skip_line_comment(5);
            self.skip_space::<true>();
            return self.read_token();
        }

        Ok(token)
    }

    /// Lexes a template literal starting at a backtick.
    fn read_token_back_quote(&mut self) -> LexResult<Token> {
        let start = self.cur_pos();
        self.scan_template_token(start, true)
    }

    /// Scans one template-literal segment, starting either at the opening
    /// backtick (`started_with_backtick == true`) or at the `}` that closes a
    /// `${...}` substitution.
    ///
    /// Produces `NoSubstitutionTemplateLiteral` / `TemplateHead` /
    /// `TemplateMiddle` / `TemplateTail` and stores the raw and cooked text as
    /// the token value. `cooked` becomes `Err` when an escape sequence is
    /// invalid (the raw text is still kept). Errors with `UnterminatedTpl` if
    /// the input ends before the segment is closed.
    fn scan_template_token(
        &mut self,
        start: BytePos,
        started_with_backtick: bool,
    ) -> LexResult<Token> {
        debug_assert!(self.cur() == Some(if started_with_backtick { '`' } else { '}' }));
        // Cooked value; flips to Err on the first invalid escape.
        let mut cooked = Ok(String::with_capacity(8));
        self.bump(); // `}` or `\``
        // Start of the raw slice not yet copied into `cooked`; reset after
        // every escape/line-terminator, which are re-encoded separately.
        let mut cooked_slice_start = self.cur_pos();
        let raw_slice_start = cooked_slice_start;
        // Interns the raw text from segment start up to the current position.
        let raw_atom = |this: &mut Self| {
            let last_pos = this.cur_pos();
            let s = unsafe { this.input.slice(raw_slice_start, last_pos) };
            this.atoms.atom(s)
        };
        // Appends the pending raw slice to `cooked` (no-op once cooked is Err).
        macro_rules! consume_cooked {
            () => {{
                if let Ok(cooked) = &mut cooked {
                    let last_pos = self.cur_pos();
                    cooked.push_str(unsafe {
                        // Safety: Both of start and last_pos are valid position because we got them
                        // from `self.input`
                        self.input.slice(cooked_slice_start, last_pos)
                    });
                }
            }};
        }

        while let Some(c) = self.cur() {
            if c == '`' {
                // Closing backtick ends the whole template.
                consume_cooked!();
                let cooked = cooked.map(|cooked| self.atoms.atom(cooked));
                let raw = raw_atom(self);
                self.bump();
                return Ok(if started_with_backtick {
                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                    Token::NoSubstitutionTemplateLiteral
                } else {
                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                    Token::TemplateTail
                });
            } else if c == '$' && self.input.peek() == Some('{') {
                // `${` begins a substitution and ends this segment.
                consume_cooked!();
                let cooked = cooked.map(|cooked| self.atoms.atom(cooked));
                let raw = raw_atom(self);
                self.input.bump_bytes(2);
                return Ok(if started_with_backtick {
                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                    Token::TemplateHead
                } else {
                    self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                    Token::TemplateMiddle
                });
            } else if c == '\\' {
                // Escape sequence: flush the raw run, then decode the escape
                // into the cooked value (an invalid escape poisons `cooked`).
                consume_cooked!();

                match self.read_escaped_char(true) {
                    Ok(Some(chars)) => {
                        if let Ok(ref mut cooked) = cooked {
                            for c in chars {
                                cooked.extend(c);
                            }
                        }
                    }
                    Ok(None) => {}
                    Err(error) => {
                        cooked = Err(error);
                    }
                }

                cooked_slice_start = self.cur_pos();
            } else if c.is_line_terminator() {
                // Line terminators are normalized in the cooked value:
                // `\r` and `\r\n` both cook to `\n`.
                consume_cooked!();

                let c = if c == '\r' && self.peek() == Some('\n') {
                    self.bump(); // '\r'
                    '\n'
                } else {
                    match c {
                        '\n' => '\n',
                        '\r' => '\n',
                        '\u{2028}' => '\u{2028}',
                        '\u{2029}' => '\u{2029}',
                        // `is_line_terminator` only matches the four above.
                        _ => unreachable!(),
                    }
                };

                self.bump();

                if let Ok(ref mut cooked) = cooked {
                    cooked.push(c);
                }
                cooked_slice_start = self.cur_pos();
            } else {
                // Ordinary character: stays part of the pending raw run.
                self.bump();
            }
        }

        // Input ended inside the template.
        self.error(start, SyntaxError::UnterminatedTpl)?
    }
}
429}