swc_css_parser/lexer/
mod.rs

1use std::{borrow::Cow, cell::RefCell, char::REPLACEMENT_CHARACTER, rc::Rc};
2
3use swc_atoms::{Atom, AtomStoreCell};
4use swc_common::{
5    comments::{Comment, CommentKind, Comments},
6    input::Input,
7    util::take::Take,
8    BytePos, Span,
9};
10use swc_css_ast::{
11    matches_eq_ignore_ascii_case, DimensionToken, NumberType, Token, TokenAndSpan, UrlKeyValue,
12};
13
14use crate::{
15    error::{Error, ErrorKind},
16    parser::{input::ParserInput, ParserConfig},
17};
18
19pub(crate) type LexResult<T> = Result<T, ErrorKind>;
20
21#[derive(Clone)]
22pub struct Lexer<'a, I>
23where
24    I: Input<'a>,
25{
26    comments: Option<&'a dyn Comments>,
27    pending_leading_comments: Vec<Comment>,
28    input: I,
29    cur: Option<char>,
30    cur_pos: BytePos,
31    start_pos: BytePos,
32    /// Used to override last_pos
33    override_pos: Option<BytePos>,
34    config: ParserConfig,
35    buf: Rc<RefCell<String>>,
36    raw_buf: Rc<RefCell<String>>,
37    sub_buf: Rc<RefCell<String>>,
38    errors: Rc<RefCell<Vec<Error>>>,
39    atoms: Rc<AtomStoreCell>,
40}
41
42impl<'a, I> Lexer<'a, I>
43where
44    I: Input<'a>,
45{
46    pub fn new(input: I, comments: Option<&'a dyn Comments>, config: ParserConfig) -> Self {
47        let start_pos = input.last_pos();
48
49        Lexer {
50            comments,
51            input,
52            cur: None,
53            cur_pos: start_pos,
54            start_pos,
55            override_pos: None,
56            config,
57            buf: Rc::new(RefCell::new(String::with_capacity(256))),
58            raw_buf: Rc::new(RefCell::new(String::with_capacity(256))),
59            sub_buf: Rc::new(RefCell::new(String::with_capacity(32))),
60            errors: Default::default(),
61            pending_leading_comments: Default::default(),
62            atoms: Default::default(),
63        }
64    }
65
66    fn with_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
67    where
68        F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
69    {
70        let b = self.buf.clone();
71        let mut buf = b.borrow_mut();
72
73        buf.clear();
74
75        op(self, &mut buf)
76    }
77
78    fn with_sub_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
79    where
80        F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
81    {
82        let b = self.sub_buf.clone();
83        let mut sub_buf = b.borrow_mut();
84
85        sub_buf.clear();
86
87        op(self, &mut sub_buf)
88    }
89
90    fn with_buf_and_raw_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
91    where
92        F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String, &mut String) -> LexResult<Ret>,
93    {
94        let b = self.buf.clone();
95        let r = self.raw_buf.clone();
96        let mut buf = b.borrow_mut();
97        let mut raw = r.borrow_mut();
98
99        buf.clear();
100        raw.clear();
101
102        op(self, &mut buf, &mut raw)
103    }
104}
105
106impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
107    type Item = TokenAndSpan;
108
109    fn next(&mut self) -> Option<Self::Item> {
110        let token = self.consume_token();
111
112        match token {
113            Ok(token) => {
114                let end = self
115                    .override_pos
116                    .take()
117                    .unwrap_or_else(|| self.input.last_pos());
118                let span = Span::new(self.start_pos, end);
119
120                let token_and_span = TokenAndSpan { span, token };
121
122                return Some(token_and_span);
123            }
124            Err(..) => {
125                return None;
126            }
127        }
128    }
129}
130
131#[derive(Debug, Clone, Copy)]
132pub struct LexerState {
133    pos: BytePos,
134}
135
136impl<'a, I> ParserInput for Lexer<'a, I>
137where
138    I: Input<'a>,
139{
140    type State = LexerState;
141
142    fn start_pos(&mut self) -> BytePos {
143        self.input.last_pos()
144    }
145
146    fn state(&mut self) -> Self::State {
147        LexerState {
148            pos: self.input.last_pos(),
149        }
150    }
151
152    fn reset(&mut self, state: &Self::State) {
153        unsafe {
154            // Safety: state.pos is created from a valid position.
155            self.input.reset_to(state.pos);
156        }
157    }
158
159    fn take_errors(&mut self) -> Vec<Error> {
160        self.errors.take()
161    }
162
163    fn skip_ws(&mut self) -> Option<BytePos> {
164        self.read_comments();
165
166        if let Some(c) = self.input.cur() {
167            if !is_whitespace(c) {
168                return None;
169            }
170        }
171
172        loop {
173            self.read_comments();
174
175            if self.input.uncons_while(is_whitespace).is_empty() {
176                break;
177            }
178        }
179
180        Some(self.input.last_pos())
181    }
182
183    fn atom(&self, s: Cow<str>) -> Atom {
184        self.atoms.atom(s)
185    }
186}
187
188impl<'a, I> Lexer<'a, I>
189where
190    I: Input<'a>,
191{
192    #[inline(always)]
193    fn cur(&mut self) -> Option<char> {
194        self.cur
195    }
196
197    #[inline(always)]
198    fn next(&mut self) -> Option<char> {
199        self.input.cur()
200    }
201
202    #[inline(always)]
203    fn next_next(&mut self) -> Option<char> {
204        self.input.peek()
205    }
206
207    #[inline(always)]
208    fn next_next_next(&mut self) -> Option<char> {
209        self.input.peek_ahead()
210    }
211
212    #[inline(always)]
213    fn consume(&mut self) -> Option<char> {
214        let cur = self.input.cur();
215
216        self.cur = cur;
217        self.cur_pos = self.input.last_pos();
218
219        if cur.is_some() {
220            unsafe {
221                // Safety: cur is Some
222                self.input.bump();
223            }
224        }
225
226        cur
227    }
228
229    #[inline(always)]
230    fn reconsume(&mut self) {
231        unsafe {
232            // Safety: self.cur_pos is a position generated by self.input, meaning it is
233            // valid.
234            self.input.reset_to(self.cur_pos);
235        }
236    }
237
238    #[cold]
239    fn emit_error(&mut self, kind: ErrorKind) {
240        self.errors.borrow_mut().push(Error::new(
241            Span::new(self.cur_pos, self.input.last_pos()),
242            kind,
243        ));
244    }
245
246    fn consume_token(&mut self) -> LexResult<Token> {
247        self.read_comments();
248        self.start_pos = self.input.last_pos();
249
250        if let Some(comments) = self.comments {
251            if !self.pending_leading_comments.is_empty() {
252                comments.add_leading_comments(self.start_pos, self.pending_leading_comments.take());
253            }
254        }
255
256        // Consume the next input code point.
257        match self.consume() {
258            // whitespace
259            // Consume as much whitespace as possible. Return a <whitespace-token>.
260            Some(c) if is_whitespace(c) => self.with_buf(|l, buf| {
261                buf.push(c);
262
263                loop {
264                    let c = l.next();
265
266                    match c {
267                        Some(c) if is_whitespace(c) => {
268                            l.consume();
269
270                            buf.push(c);
271                        }
272                        _ => {
273                            break;
274                        }
275                    }
276                }
277
278                return Ok(Token::WhiteSpace {
279                    value: l.atoms.atom(&**buf),
280                });
281            }),
282            // U+0022 QUOTATION MARK (")
283            // Consume a string token and return it.
284            Some('"') => self.read_str(None),
285            // U+0023 NUMBER SIGN (#)
286            Some('#') => {
287                let first = self.next();
288                let second = self.next_next();
289
290                // If the next input code point is a name code point or the next two input code
291                // points are a valid escape, then:
292                if (first.is_some() && is_name(first.unwrap()))
293                    || self.is_valid_escape(first, second)
294                {
295                    // Create a <hash-token>.
296
297                    // If the next 3 input code points would start an identifier, set the
298                    // <hash-token>’s type flag to "id".
299                    let third = self.next_next_next();
300                    let is_would_start_ident = self.would_start_ident(first, second, third);
301
302                    // Consume an ident sequence, and set the <hash-token>’s value to the returned
303                    // string.
304                    let ident_sequence = self.read_ident_sequence()?;
305
306                    // Return the <hash-token>.
307                    return Ok(Token::Hash {
308                        is_id: is_would_start_ident,
309                        value: ident_sequence.0,
310                        raw: ident_sequence.1,
311                    });
312                }
313
314                Ok(Token::Delim { value: '#' })
315            }
316            // U+0027 APOSTROPHE (')
317            // Consume a string token and return it.
318            Some('\'') => self.read_str(None),
319            // U+0028 LEFT PARENTHESIS (()
320            // Return a <(-token>.
321            Some('(') => Ok(tok!("(")),
322            // U+0029 RIGHT PARENTHESIS ())
323            // Return a <)-token>.
324            Some(')') => Ok(tok!(")")),
325            // U+002B PLUS SIGN (+)
326            Some('+') => {
327                // If the input stream starts with a number, reconsume the current input code
328                // point, consume a numeric token and return it.
329                if self.would_start_number(None, None, None) {
330                    self.reconsume();
331
332                    return self.read_numeric();
333                }
334
335                // Otherwise, return a <delim-token> with its value set to the current input
336                // code point.
337                Ok(tok!("+"))
338            }
339            // U+002C COMMA (,)
340            // Return a <comma-token>.
341            Some(',') => Ok(tok!(",")),
342            // U+002D HYPHEN-MINUS (-)
343            Some('-') => {
344                // If the input stream starts with a number, reconsume the current input code
345                // point, consume a numeric token, and return it.
346                if self.would_start_number(None, None, None) {
347                    self.reconsume();
348
349                    return self.read_numeric();
350                }
351                // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E
352                // GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
353                else if self.next() == Some('-') && self.next_next() == Some('>') {
354                    self.consume();
355                    self.consume();
356
357                    return Ok(Token::CDC);
358                }
359                // Otherwise, if the input stream starts with an identifier, reconsume the current
360                // input code point, consume an ident-like token, and return it.
361                else if self.would_start_ident(None, None, None) {
362                    self.reconsume();
363
364                    return self.read_ident_like();
365                }
366
367                // Otherwise, return a <delim-token> with its value set to the current input
368                // code point.
369                Ok(tok!("-"))
370            }
371            // U+002E FULL STOP (.)
372            Some('.') => {
373                // If the input stream starts with a number, reconsume the current input code
374                // point, consume a numeric token, and return it.
375                if self.would_start_number(None, None, None) {
376                    self.reconsume();
377
378                    return self.read_numeric();
379                }
380
381                // Otherwise, return a <delim-token> with its value set to the current input
382                // code point.
383                Ok(tok!("."))
384            }
385            // U+003A COLON (:)
386            // Return a <colon-token>.
387            Some(':') => Ok(tok!(":")),
388            // U+003B SEMICOLON (;)
389            // Return a <semicolon-token>.
390            Some(';') => Ok(tok!(";")),
391            // U+003C LESS-THAN SIGN (<)
392            Some('<') => {
393                // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D
394                // HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), consume them and return a
395                // <CDO-token>.
396                if self.next() == Some('!')
397                    && self.next_next() == Some('-')
398                    && self.next_next_next() == Some('-')
399                {
400                    self.consume(); // !
401                    self.consume(); // -
402                    self.consume(); // -
403
404                    return Ok(tok!("<!--"));
405                }
406
407                // Otherwise, return a <delim-token> with its value set to the current input
408                // code point.
409                Ok(tok!("<"))
410            }
411            // U+0040 COMMERCIAL AT (@)
412            Some('@') => {
413                let first = self.next();
414                let second = self.next_next();
415                let third = self.next_next_next();
416
417                // If the next 3 input code points would start an identifier, consume a name,
418                // create an <at-keyword-token> with its value set to the returned value, and
419                // return it.
420                if self.would_start_ident(first, second, third) {
421                    let ident_sequence = self.read_ident_sequence()?;
422
423                    return Ok(Token::AtKeyword {
424                        value: ident_sequence.0,
425                        raw: ident_sequence.1,
426                    });
427                }
428
429                // Otherwise, return a <delim-token> with its value set to the current input
430                // code point.
431                Ok(Token::Delim { value: '@' })
432            }
433            // U+005B LEFT SQUARE BRACKET ([)
434            // Return a <[-token>.
435            Some('[') => Ok(tok!("[")),
436            // U+005C REVERSE SOLIDUS (\)
437            Some('\\') => {
438                // If the input stream starts with a valid escape, reconsume the current input
439                // code point, consume an ident-like token, and return it.
440                if self.is_valid_escape(None, None) {
441                    self.reconsume();
442
443                    return self.read_ident_like();
444                }
445
446                // Otherwise, this is a parse error. Return a <delim-token> with its value set
447                // to the current input code point.
448                self.emit_error(ErrorKind::InvalidEscape);
449
450                Ok(Token::Delim { value: '\\' })
451            }
452            // U+005D RIGHT SQUARE BRACKET (])
453            // Return a <]-token>.
454            Some(']') => Ok(tok!("]")),
455            // U+007B LEFT CURLY BRACKET ({)
456            // Return a <{-token>.
457            Some('{') => Ok(tok!("{")),
458            // U+007D RIGHT CURLY BRACKET (})
459            // Return a <}-token>.
460            Some('}') => Ok(tok!("}")),
461            // digit
462            // Reconsume the current input code point, consume a numeric token, and return it.
463            Some('0'..='9') => {
464                self.reconsume();
465
466                self.read_numeric()
467            }
468            // name-start code point
469            // Reconsume the current input code point, consume an ident-like token, and return it.
470            Some(c) if is_name_start(c) => {
471                self.reconsume();
472
473                self.read_ident_like()
474            }
475            // EOF
476            // Return an <EOF-token>.
477            None => Err(ErrorKind::Eof),
478            // anything else
479            // Return a <delim-token> with its value set to the current input code point.
480            Some(c) => Ok(Token::Delim { value: c }),
481        }
482    }
483
484    // Consume comments.
485    // This section describes how to consume comments from a stream of code points.
486    // It returns nothing.
487    fn read_comments(&mut self) {
488        // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A
489        // ASTERISK (*), consume them and all following code points up to and including
490        // the first U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an
491        // EOF code point. Return to the start of this step.
492        // NOTE: We allow to parse line comments under the option.
493        if self.next() == Some('/') && self.next_next() == Some('*') {
494            let cmt_start = self.input.last_pos();
495
496            while self.next() == Some('/') && self.next_next() == Some('*') {
497                self.consume(); // '*'
498                self.consume(); // '/'
499
500                loop {
501                    match self.consume() {
502                        Some('*') if self.next() == Some('/') => {
503                            self.consume(); // '/'
504
505                            if self.comments.is_some() {
506                                let last_pos = self.input.last_pos();
507                                let text = unsafe {
508                                    // Safety: last_pos is a valid position
509                                    self.input.slice(cmt_start, last_pos)
510                                };
511
512                                self.pending_leading_comments.push(Comment {
513                                    kind: CommentKind::Block,
514                                    span: (self.start_pos, last_pos).into(),
515                                    text: self.atoms.atom(text),
516                                });
517                            }
518
519                            break;
520                        }
521                        None => {
522                            let span = Span::new(self.start_pos, self.input.last_pos());
523
524                            self.errors
525                                .borrow_mut()
526                                .push(Error::new(span, ErrorKind::UnterminatedBlockComment));
527
528                            return;
529                        }
530                        _ => {}
531                    }
532                }
533            }
534        } else if self.config.allow_wrong_line_comments
535            && self.next() == Some('/')
536            && self.next_next() == Some('/')
537        {
538            while self.next() == Some('/') && self.next_next() == Some('/') {
539                self.consume(); // '/'
540                self.consume(); // '/'
541
542                let start_of_content = self.input.last_pos();
543
544                loop {
545                    match self.consume() {
546                        Some(c) if is_newline(c) => {
547                            if self.comments.is_some() {
548                                let last_pos = self.input.last_pos();
549                                let text = unsafe {
550                                    // Safety: last_pos is a valid position
551                                    self.input.slice(start_of_content, last_pos)
552                                };
553
554                                self.pending_leading_comments.push(Comment {
555                                    kind: CommentKind::Line,
556                                    span: (self.start_pos, last_pos).into(),
557                                    text: self.atoms.atom(text),
558                                });
559                            }
560                            break;
561                        }
562                        None => return,
563                        _ => {}
564                    }
565                }
566            }
567        }
568    }
569
570    // This section describes how to consume a numeric token from a stream of code
571    // points. It returns either a <number-token>, <percentage-token>, or
572    // <dimension-token>.
573    fn read_numeric(&mut self) -> LexResult<Token> {
574        // Consume a number and let number be the result.
575        let number = self.read_number()?;
576
577        let next_first = self.next();
578        let next_second = self.next_next();
579        let next_third = self.next_next_next();
580
581        // If the next 3 input code points would start an identifier, then:
582        if self.would_start_ident(next_first, next_second, next_third) {
583            // Swap logic to avoid create empty strings, because it doesn't make sense
584            //
585            // Consume a name. Set the <dimension-token>’s unit to the returned value.
586            let ident_sequence = self.read_ident_sequence()?;
587            // Create a <dimension-token> with the same value and type flag as number, and a
588            // unit set initially to the empty string.
589            let token = Box::new(DimensionToken {
590                value: number.0,
591                raw_value: number.1,
592                unit: ident_sequence.0,
593                raw_unit: ident_sequence.1,
594                type_flag: number.2,
595            });
596            let token = Token::Dimension { dimension: token };
597
598            // Return the <dimension-token>.
599            return Ok(token);
600        }
601        // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. Create
602        // a <percentage-token> with the same value as number, and return it.
603        else if next_first == Some('%') {
604            self.consume();
605
606            return Ok(Token::Percentage {
607                value: number.0,
608                raw: number.1,
609            });
610        }
611
612        // Otherwise, create a <number-token> with the same value and type flag as
613        // number, and return it.
614        Ok(Token::Number {
615            value: number.0,
616            raw: number.1,
617            type_flag: number.2,
618        })
619    }
620
621    // This section describes how to consume an ident-like token from a stream of
622    // code points. It returns an <ident-token>, <function-token>, <url-token>, or
623    // <bad-url-token>.
624    fn read_ident_like(&mut self) -> LexResult<Token> {
625        // Consume a name, and let string be the result.
626        let ident_sequence = self.read_ident_sequence()?;
627
628        // If string’s value is an ASCII case-insensitive match for "url", and the next
629        // input code point is U+0028 LEFT PARENTHESIS ((), consume it.
630        if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some('(') {
631            self.consume();
632
633            let start_whitespace = self.input.last_pos();
634
635            // While the next two input code points are whitespace, consume the next input
636            // code point.
637            let whitespaces = self.with_buf(|l, buf| {
638                while let (Some(next), Some(next_next)) = (l.next(), l.next_next()) {
639                    if is_whitespace(next) && is_whitespace(next_next) {
640                        l.consume();
641
642                        buf.push(next);
643                    } else {
644                        break;
645                    }
646                }
647
648                Ok(buf.to_string())
649            })?;
650
651            match self.next() {
652                // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027
653                // APOSTROPHE ('), or whitespace followed by U+0022 QUOTATION MARK (") or U+0027
654                // APOSTROPHE ('), then create a <function-token> with its value set to string and
655                // return it.
656                Some(c)
657                    if is_whitespace(c)
658                        && (self.next_next() == Some('"') || self.next_next() == Some('\'')) =>
659                {
660                    // Override last position because we consumed whitespaces, but they
661                    // should not be part of token
662                    self.override_pos = Some(start_whitespace);
663
664                    return Ok(Token::Function {
665                        value: ident_sequence.0,
666                        raw: ident_sequence.1,
667                    });
668                }
669                Some('"' | '\'') => {
670                    return Ok(Token::Function {
671                        value: ident_sequence.0,
672                        raw: ident_sequence.1,
673                    });
674                }
675                // Otherwise, consume a url token, and return it.
676                _ => {
677                    return self.read_url(ident_sequence, whitespaces);
678                }
679            }
680        }
681        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
682        // Create a <function-token> with its value set to string and return it.
683        else if self.next() == Some('(') {
684            self.consume();
685
686            return Ok(Token::Function {
687                value: ident_sequence.0,
688                raw: ident_sequence.1,
689            });
690        }
691
692        // Otherwise, create an <ident-token> with its value set to string and return
693        // it.
694        Ok(Token::Ident {
695            value: ident_sequence.0,
696            raw: ident_sequence.1,
697        })
698    }
699
700    // This section describes how to consume a string token from a stream of code
701    // points. It returns either a <string-token> or <bad-string-token>.
702    fn read_str(&mut self, maybe_ending_code_point: Option<char>) -> LexResult<Token> {
703        self.with_buf_and_raw_buf(|l, buf, raw| {
704            // This algorithm may be called with an ending code point, which denotes the
705            // code point that ends the string. If an ending code point is not specified,
706            // the current input code point is used.
707            let ending_code_point = maybe_ending_code_point.or_else(|| l.cur());
708
709            // Initially create a <string-token> with its value set to the empty string.
710            // Done above
711
712            raw.push(ending_code_point.unwrap());
713
714            // Repeatedly consume the next input code point from the stream:
715            loop {
716                match l.consume() {
717                    // ending code point
718                    // Return the <string-token>.
719                    Some(c) if c == ending_code_point.unwrap() => {
720                        raw.push(c);
721
722                        break;
723                    }
724
725                    // EOF
726                    // This is a parse error. Return the <string-token>.
727                    None => {
728                        l.emit_error(ErrorKind::UnterminatedString);
729
730                        return Ok(Token::String {
731                            value: l.atoms.atom(&**buf),
732                            raw: l.atoms.atom(&**raw),
733                        });
734                    }
735
736                    // Newline
737                    // This is a parse error. Reconsume the current input code point, create a
738                    // <bad-string-token>, and return it.
739                    Some(c) if is_newline(c) => {
740                        l.emit_error(ErrorKind::NewlineInString);
741                        l.reconsume();
742
743                        return Ok(Token::BadString {
744                            raw: l.atoms.atom(&**raw),
745                        });
746                    }
747
748                    // U+005C REVERSE SOLIDUS (\)
749                    Some(c) if c == '\\' => {
750                        let next = l.next();
751
752                        // If the next input code point is EOF, do nothing.
753                        if l.next().is_none() {
754                            continue;
755                        }
756                        // Otherwise, if the next input code point is a newline, consume it.
757                        else if l.next().is_some() && is_newline(l.next().unwrap()) {
758                            l.consume();
759
760                            raw.push(c);
761                            raw.push(next.unwrap());
762                        }
763                        // Otherwise, (the stream starts with a valid escape) consume an escaped
764                        // code point and append the returned code point to
765                        // the <string-token>’s value.
766                        else if l.is_valid_escape(None, None) {
767                            let escape = l.read_escape()?;
768
769                            buf.push(escape.0);
770                            raw.push(c);
771                            raw.push_str(&escape.1);
772                        }
773                    }
774
775                    // Anything else
776                    // Append the current input code point to the <string-token>’s value.
777                    Some(c) => {
778                        buf.push(c);
779                        raw.push(c);
780                    }
781                }
782            }
783
784            Ok(Token::String {
785                value: l.atoms.atom(&**buf),
786                raw: l.atoms.atom(&**raw),
787            })
788        })
789    }
790
791    // This section describes how to consume a url token from a stream of code
792    // points. It returns either a <url-token> or a <bad-url-token>.
793    fn read_url(&mut self, name: (Atom, Atom), before: String) -> LexResult<Token> {
794        // Initially create a <url-token> with its value set to the empty string.
795        self.with_buf_and_raw_buf(|l, out, raw| {
796            raw.push_str(&before);
797
798            // Consume as much whitespace as possible.
799            while let Some(c) = l.next() {
800                if is_whitespace(c) {
801                    l.consume();
802
803                    raw.push(c);
804                } else {
805                    break;
806                }
807            }
808
809            // Repeatedly consume the next input code point from the stream:
810            loop {
811                match l.consume() {
812                    // U+0029 RIGHT PARENTHESIS ())
813                    // Return the <url-token>.
814                    Some(')') => {
815                        return Ok(Token::Url {
816                            value: l.atoms.atom(&**out),
817                            raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
818                        });
819                    }
820
821                    // EOF
822                    // This is a parse error. Return the <url-token>.
823                    None => {
824                        l.emit_error(ErrorKind::UnterminatedUrl);
825
826                        return Ok(Token::Url {
827                            value: l.atoms.atom(&**out),
828                            raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
829                        });
830                    }
831
832                    // whitespace
833                    Some(c) if is_whitespace(c) => {
834                        // Consume as much whitespace as possible.
835                        let whitespaces: String = l.with_sub_buf(|l, buf| {
836                            buf.push(c);
837
838                            while let Some(c) = l.next() {
839                                if is_whitespace(c) {
840                                    l.consume();
841
842                                    buf.push(c);
843                                } else {
844                                    break;
845                                }
846                            }
847
848                            Ok(buf.to_string())
849                        })?;
850
851                        // if the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
852                        // consume it and return the <url-token> (if EOF was
853                        // encountered, this is a parse error);
854                        match l.next() {
855                            Some(')') => {
856                                l.consume();
857
858                                raw.push_str(&whitespaces);
859
860                                return Ok(Token::Url {
861                                    value: l.atoms.atom(&**out),
862                                    raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
863                                });
864                            }
865                            None => {
866                                l.emit_error(ErrorKind::UnterminatedUrl);
867
868                                raw.push_str(&whitespaces);
869
870                                return Ok(Token::Url {
871                                    value: l.atoms.atom(&**out),
872                                    raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
873                                });
874                            }
875                            _ => {}
876                        }
877
878                        // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
879                        // and return it.
880                        raw.push_str(&whitespaces);
881
882                        let remnants = l.read_bad_url_remnants()?;
883
884                        raw.push_str(&remnants);
885
886                        return Ok(Token::BadUrl {
887                            raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
888                        });
889                    }
890
891                    // U+0022 QUOTATION MARK (")
892                    // U+0027 APOSTROPHE (')
893                    // U+0028 LEFT PARENTHESIS (()
894                    // non-printable code point
895                    // This is a parse error. Consume the remnants of a bad url, create a
896                    // <bad-url-token>, and return it.
897                    Some(c) if c == '"' || c == '\'' || c == '(' || is_non_printable(c) => {
898                        l.emit_error(ErrorKind::UnexpectedCharInUrl);
899
900                        let remnants = l.read_bad_url_remnants()?;
901
902                        raw.push(c);
903                        raw.push_str(&remnants);
904
905                        return Ok(Token::BadUrl {
906                            raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
907                        });
908                    }
909
910                    // U+005C REVERSE SOLIDUS (\)
911                    Some(c) if c == '\\' => {
912                        // If the stream starts with a valid escape, consume an escaped code point
913                        // and append the returned code point to the
914                        // <url-token>’s value.
915                        if l.is_valid_escape(None, None) {
916                            let escaped = l.read_escape()?;
917
918                            out.push(escaped.0);
919                            raw.push(c);
920                            raw.push_str(&escaped.1);
921                        }
922                        // Otherwise, this is a parse error. Consume the remnants of a bad url,
923                        // create a <bad-url-token>, and return it.
924                        else {
925                            l.emit_error(ErrorKind::InvalidEscape);
926
927                            let remnants = l.read_bad_url_remnants()?;
928
929                            raw.push(c);
930                            raw.push_str(&remnants);
931
932                            return Ok(Token::BadUrl {
933                                raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
934                            });
935                        }
936                    }
937
938                    // anything else
939                    // Append the current input code point to the <url-token>’s value.
940                    Some(c) => {
941                        out.push(c);
942                        raw.push(c);
943                    }
944                }
945            }
946        })
947    }
948
949    // Consume an escaped code point
950    // This section describes how to consume an escaped code point. It assumes that
951    // the U+005C REVERSE SOLIDUS (\) has already been consumed and that the next
952    // input code point has already been verified to be part of a valid escape. It
953    // will return a code point.
954    fn read_escape(&mut self) -> LexResult<(char, String)> {
955        self.with_sub_buf(|l, buf| {
956            // Consume the next input code point.
957            match l.consume() {
958                // hex digit
959                Some(c) if is_hex_digit(c) => {
960                    let mut hex = c.to_digit(16).unwrap();
961
962                    buf.push(c);
963
964                    // Consume as many hex digits as possible, but no more than 5.
965                    // Note that this means 1-6 hex digits have been consumed in total.
966                    for _ in 0..5 {
967                        let next = l.next();
968                        let digit = match next.and_then(|c| c.to_digit(16)) {
969                            Some(v) => v,
970                            None => break,
971                        };
972
973                        l.consume();
974
975                        buf.push(next.unwrap());
976                        hex = hex * 16 + digit;
977                    }
978
979                    // If the next input code point is whitespace, consume it as well.
980                    let next = l.next();
981
982                    if let Some(next) = next {
983                        if is_whitespace(next) {
984                            l.consume();
985
986                            buf.push(next);
987                        }
988                    }
989
990                    // Interpret the hex digits as a hexadecimal number. If this number is zero, or
991                    // is for a surrogate, or is greater than the maximum allowed code point, return
992                    // U+FFFD REPLACEMENT CHARACTER (�).
993                    let hex = match hex {
994                        // If this number is zero
995                        0 => REPLACEMENT_CHARACTER,
996                        // or is for a surrogate
997                        55296..=57343 => REPLACEMENT_CHARACTER,
998                        // or is greater than the maximum allowed code point
999                        1114112.. => REPLACEMENT_CHARACTER,
1000                        _ => char::from_u32(hex).unwrap_or(REPLACEMENT_CHARACTER),
1001                    };
1002
1003                    // Otherwise, return the code point with that value.
1004                    Ok((hex, (&**buf).into()))
1005                }
1006                // EOF
1007                // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
1008                None => {
1009                    l.emit_error(ErrorKind::InvalidEscape);
1010
1011                    let value = REPLACEMENT_CHARACTER;
1012
1013                    buf.push(value);
1014
1015                    Ok((value, (&**buf).into()))
1016                }
1017                // anything else
1018                // Return the current input code point.
1019                Some(c) => {
1020                    buf.push(c);
1021
1022                    Ok((c, (&**buf).into()))
1023                }
1024            }
1025        })
1026    }
1027
1028    // Check if two code points are a valid escape
1029    // This section describes how to check if two code points are a valid escape.
1030    // The algorithm described here can be called explicitly with two code points,
1031    // or can be called with the input stream itself. In the latter case, the two
1032    // code points in question are the current input code point and the next input
1033    // code point, in that order.
1034    fn is_valid_escape(&mut self, maybe_first: Option<char>, maybe_second: Option<char>) -> bool {
1035        // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
1036        if maybe_first.or_else(|| self.cur()) != Some('\\') {
1037            return false;
1038        }
1039
1040        match maybe_second.or_else(|| self.next()) {
1041            // Otherwise, if the second code point is a newline, return false.
1042            Some(second) => !is_newline(second),
1043            // Otherwise, return true.
1044            None => false,
1045        }
1046    }
1047
1048    // Check if three code points would start an identifier
1049    // This section describes how to check if three code points would start an
1050    // identifier. The algorithm described here can be called explicitly with three
1051    // code points, or can be called with the input stream itself. In the latter
1052    // case, the three code points in question are the current input code point and
1053    // the next two input code points, in that order.
1054    fn would_start_ident(
1055        &mut self,
1056        maybe_first: Option<char>,
1057        maybe_second: Option<char>,
1058        maybe_third: Option<char>,
1059    ) -> bool {
1060        // Look at the first code point:
1061        let first = maybe_first.or_else(|| self.cur());
1062
1063        match first {
1064            // U+002D HYPHEN-MINUS
1065            Some('-') => {
1066                let second = maybe_second.or_else(|| self.next());
1067
1068                match second {
1069                    // If the second code point is a name-start code point
1070                    // return true.
1071                    Some(c) if is_name_start(c) => true,
1072                    // or a U+002D HYPHEN-MINUS,
1073                    // return true.
1074                    Some('-') => true,
1075                    // or the second and third code points are a valid escape
1076                    // return true.
1077                    Some(_) => {
1078                        let third = maybe_third.or_else(|| self.next_next());
1079
1080                        self.is_valid_escape(second, third)
1081                    }
1082                    // Otherwise, return false.
1083                    _ => false,
1084                }
1085            }
1086            // name-start code point
1087            // Return true.
1088            Some(c) if is_name_start(c) => true,
1089            // U+005C REVERSE SOLIDUS (\)
1090            // If the first and second code points are a valid escape, return true. Otherwise,
1091            // return false.
1092            Some('\\') => {
1093                let second = maybe_second.or_else(|| self.next());
1094
1095                self.is_valid_escape(first, second)
1096            }
1097            _ => false,
1098        }
1099    }
1100
1101    // Check if three code points would start a number
1102    // This section describes how to check if three code points would start a
1103    // number. The algorithm described here can be called explicitly with three code
1104    // points, or can be called with the input stream itself. In the latter case,
1105    // the three code points in question are the current input code point and the
1106    // next two input code points, in that order.
1107    #[allow(clippy::needless_return)]
1108    fn would_start_number(
1109        &mut self,
1110        maybe_first: Option<char>,
1111        maybe_second: Option<char>,
1112        maybe_third: Option<char>,
1113    ) -> bool {
1114        // Look at the first code point:
1115        let first = maybe_first.or_else(|| self.cur());
1116
1117        match first {
1118            // U+002B PLUS SIGN (+)
1119            // U+002D HYPHEN-MINUS (-)
1120            Some('+') | Some('-') => {
1121                match maybe_second.or_else(|| self.next()) {
1122                    // If the second code point is a digit, return true.
1123                    Some(second) if second.is_ascii_digit() => return true,
1124                    // Otherwise, if the second code point is a U+002E FULL STOP (.) and the
1125                    // third code point is a digit, return true.
1126                    Some('.') => {
1127                        if let Some(third) = maybe_third.or_else(|| self.next_next()) {
1128                            if third.is_ascii_digit() {
1129                                return true;
1130                            }
1131                        }
1132
1133                        return false;
1134                    }
1135                    // Otherwise, return false.
1136                    _ => return false,
1137                };
1138            }
1139            // U+002E FULL STOP (.)
1140            Some('.') => {
1141                // If the second code point is a digit, return true.
1142                if let Some(second) = self.next() {
1143                    if second.is_ascii_digit() {
1144                        return true;
1145                    }
1146                }
1147
1148                // Otherwise, return false.
1149                false
1150            }
1151            // digit
1152            // Return true.
1153            Some(first) if first.is_ascii_digit() => true,
1154            // anything else
1155            // Return false.
1156            _ => false,
1157        }
1158    }
1159
1160    // Consume an ident sequence
1161    // This section describes how to consume an ident sequence from a stream of code
1162    // points. It returns a string containing the largest name that can be formed
1163    // from adjacent code points in the stream, starting from the first.
1164    fn read_ident_sequence(&mut self) -> LexResult<(Atom, Atom)> {
1165        self.with_buf_and_raw_buf(|l, buf, raw| {
1166            // Let result initially be an empty string.
1167            // Done above
1168
1169            // Repeatedly consume the next input code point from the stream:
1170            loop {
1171                match l.consume() {
1172                    // name code point
1173                    // Append the code point to result.
1174                    Some(c) if is_name(c) => {
1175                        buf.push(c);
1176                        raw.push(c);
1177                    }
1178                    // the stream starts with a valid escape
1179                    // Consume an escaped code point. Append the returned code point to result.
1180                    Some(c) if l.is_valid_escape(None, None) => {
1181                        let escaped = l.read_escape()?;
1182
1183                        buf.push(escaped.0);
1184                        raw.push(c);
1185                        raw.push_str(&escaped.1);
1186                    }
1187                    // anything else
1188                    // Reconsume the current input code point. Return result.
1189                    _ => {
1190                        l.reconsume();
1191
1192                        break;
1193                    }
1194                }
1195            }
1196
1197            Ok((l.atoms.atom(&**buf), l.atoms.atom(&**raw)))
1198        })
1199    }
1200
1201    // This section describes how to consume a number from a stream of code points.
1202    // It returns a numeric value, and a type which is either "integer" or "number".
1203    fn read_number(&mut self) -> LexResult<(f64, Atom, NumberType)> {
1204        let parsed: (Atom, NumberType) = self.with_buf(|l, out| {
1205            // Initially set type to "integer". Let repr be the empty string.
1206            let mut type_flag = NumberType::Integer;
1207
1208            // If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
1209            // (-), consume it and append it to repr.
1210            let next = l.next();
1211
1212            if next == Some('+') || next == Some('-') {
1213                l.consume();
1214
1215                out.push(next.unwrap());
1216            }
1217
1218            // While the next input code point is a digit, consume it and append it to repr.
1219            while let Some(c) = l.next() {
1220                if c.is_ascii_digit() {
1221                    l.consume();
1222
1223                    out.push(c);
1224                } else {
1225                    break;
1226                }
1227            }
1228
1229            // If the next 2 input code points are U+002E FULL STOP (.) followed by a digit,
1230            // then:
1231            let next = l.next();
1232
1233            if next == Some('.') {
1234                if let Some(n) = l.next_next() {
1235                    if n.is_ascii_digit() {
1236                        // Consume them.
1237                        l.consume();
1238                        l.consume();
1239
1240                        // Append them to repr.
1241                        out.push(next.unwrap());
1242                        out.push(n);
1243
1244                        // Set type to "number".
1245                        type_flag = NumberType::Number;
1246
1247                        // While the next input code point is a digit, consume it and append it to
1248                        // repr.
1249                        while let Some(c) = l.next() {
1250                            if c.is_ascii_digit() {
1251                                l.consume();
1252
1253                                out.push(c);
1254                            } else {
1255                                break;
1256                            }
1257                        }
1258                    }
1259                }
1260            }
1261
1262            // If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or
1263            // U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS
1264            // (-) or U+002B PLUS SIGN (+), followed by a digit, then:
1265            let next = l.next();
1266
1267            if next == Some('E') || next == Some('e') {
1268                let next_next = l.next_next();
1269                let next_next_next = l.next_next_next();
1270
1271                if (next_next == Some('-')
1272                    || next_next == Some('+')
1273                        && next_next_next.is_some()
1274                        && next_next_next.unwrap().is_ascii_digit())
1275                    || next_next.is_some() && next_next.unwrap().is_ascii_digit()
1276                {
1277                    // Consume them.
1278                    l.consume();
1279                    l.consume();
1280
1281                    // Append them to repr.
1282                    out.push(next.unwrap());
1283                    out.push(next_next.unwrap());
1284
1285                    // Set type to "number".
1286                    type_flag = NumberType::Number;
1287
1288                    // While the next input code point is a digit, consume it and append it
1289                    // to repr.
1290                    while let Some(c) = l.next() {
1291                        if c.is_ascii_digit() {
1292                            l.consume();
1293
1294                            out.push(c);
1295                        } else {
1296                            break;
1297                        }
1298                    }
1299                }
1300            }
1301
1302            // Return value and type.
1303            Ok((l.atoms.atom(&**out), type_flag))
1304        })?;
1305
1306        // Convert repr to a number, and set the value to the returned value.
1307        let value = lexical::parse(&*parsed.0).unwrap_or_else(|err| {
1308            unreachable!("failed to parse `{}` using lexical: {:?}", parsed.0, err)
1309        });
1310
1311        Ok((value, parsed.0, parsed.1))
1312    }
1313
1314    // Consume the remnants of a bad url
1315    // This section describes how to consume the remnants of a bad url from a stream
1316    // of code points, "cleaning up" after the tokenizer realizes that it’s in the
1317    // middle of a <bad-url-token> rather than a <url-token>. It returns nothing;
1318    // its sole use is to consume enough of the input stream to reach a recovery
1319    // point where normal tokenizing can resume. But for recovery purpose we return
1320    // bad URL remnants.
1321    fn read_bad_url_remnants(&mut self) -> LexResult<String> {
1322        self.with_sub_buf(|l, raw| {
1323            // Repeatedly consume the next input code point from the stream:
1324            loop {
1325                match l.consume() {
1326                    // U+0029 RIGHT PARENTHESIS ())
1327                    // EOF
1328                    // Return.
1329                    Some(c @ ')') => {
1330                        raw.push(c);
1331
1332                        break;
1333                    }
1334                    None => {
1335                        break;
1336                    }
1337                    // the input stream starts with a valid escape
1338                    Some(c) if l.is_valid_escape(None, None) => {
1339                        // Consume an escaped code point. This allows an escaped right parenthesis
1340                        // ("\)") to be encountered without ending the <bad-url-token>.
1341                        let escaped = l.read_escape()?;
1342
1343                        raw.push(c);
1344                        raw.push_str(&escaped.1);
1345                    }
1346                    // anything else
1347                    // Do nothing.
1348                    Some(c) => {
1349                        raw.push(c);
1350                    }
1351                }
1352            }
1353
1354            Ok((&**raw).into())
1355        })
1356    }
1357}
1358
// digit: a code point between U+0030 DIGIT ZERO (0) and U+0039 DIGIT NINE (9)
// inclusive.
#[inline(always)]
fn is_digit(c: char) -> bool {
    char::is_ascii_digit(&c)
}
1363
1364#[inline(always)]
1365fn is_hex_digit(c: char) -> bool {
1366    match c {
1367        c if is_digit(c) => true,
1368        'A'..='F' => true,
1369        'a'..='f' => true,
1370        _ => false,
1371    }
1372}
1373
// uppercase letter: a code point between U+0041 LATIN CAPITAL LETTER A (A) and
// U+005A LATIN CAPITAL LETTER Z (Z) inclusive.
#[inline(always)]
fn is_uppercase_letter(c: char) -> bool {
    char::is_ascii_uppercase(&c)
}
1378
// lowercase letter: a code point between U+0061 LATIN SMALL LETTER A (a) and
// U+007A LATIN SMALL LETTER Z (z) inclusive.
#[inline(always)]
fn is_lowercase_letter(c: char) -> bool {
    char::is_ascii_lowercase(&c)
}
1383
1384#[inline(always)]
1385fn is_letter(c: char) -> bool {
1386    is_uppercase_letter(c) || is_lowercase_letter(c)
1387}
1388
// non-ASCII code point: a code point with a value equal to or greater than
// U+0080.
#[inline(always)]
fn is_non_ascii(c: char) -> bool {
    !c.is_ascii()
}
1393
1394#[inline(always)]
1395fn is_name_start(c: char) -> bool {
1396    matches!(c, c if is_letter(c) || is_non_ascii(c) || c == '_' || c == '\x00')
1397}
1398
1399#[inline(always)]
1400fn is_name(c: char) -> bool {
1401    is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == '-')
1402}
1403
// non-printable code point: a code point between U+0000 NULL and U+0008
// BACKSPACE inclusive, or U+000B LINE TABULATION, or a code point between U+000E
// SHIFT OUT and U+001F INFORMATION SEPARATOR ONE inclusive, or U+007F DELETE.
#[inline(always)]
fn is_non_printable(c: char) -> bool {
    let code = c as u32;

    code <= 0x08 || code == 0x0B || (0x0E..=0x1F).contains(&code) || code == 0x7F
}
1408
// newline: U+000A LINE FEED, U+000D CARRIAGE RETURN or U+000C FORM FEED.
#[inline(always)]
fn is_newline(c: char) -> bool {
    c == '\n' || c == '\r' || c == '\x0C'
}
1413
1414#[inline(always)]
1415fn is_whitespace(c: char) -> bool {
1416    matches!(c, c if c == ' ' || c == '\t' || is_newline(c))
1417}