swc_css_parser/lexer/
mod.rs

1use std::{borrow::Cow, cell::RefCell, char::REPLACEMENT_CHARACTER, rc::Rc};
2
3use swc_atoms::{Atom, AtomStoreCell};
4use swc_common::{
5    comments::{Comment, CommentKind, Comments},
6    input::Input,
7    util::take::Take,
8    BytePos, Span,
9};
10use swc_css_ast::{
11    matches_eq_ignore_ascii_case, DimensionToken, NumberType, Token, TokenAndSpan, UrlKeyValue,
12};
13
14use crate::{
15    error::{Error, ErrorKind},
16    parser::{input::ParserInput, ParserConfig},
17};
18
19pub(crate) type LexResult<T> = Result<T, ErrorKind>;
20
21#[derive(Clone)]
22pub struct Lexer<'a, I>
23where
24    I: Input<'a>,
25{
26    comments: Option<&'a dyn Comments>,
27    pending_leading_comments: Vec<Comment>,
28    input: I,
29    cur: Option<char>,
30    cur_pos: BytePos,
31    start_pos: BytePos,
32    /// Used to override last_pos
33    override_pos: Option<BytePos>,
34    config: ParserConfig,
35    buf: Rc<RefCell<String>>,
36    raw_buf: Rc<RefCell<String>>,
37    sub_buf: Rc<RefCell<String>>,
38    errors: Rc<RefCell<Vec<Error>>>,
39    atoms: Rc<AtomStoreCell>,
40}
41
42impl<'a, I> Lexer<'a, I>
43where
44    I: Input<'a>,
45{
46    pub fn new(input: I, comments: Option<&'a dyn Comments>, config: ParserConfig) -> Self {
47        let start_pos = input.last_pos();
48
49        Lexer {
50            comments,
51            input,
52            cur: None,
53            cur_pos: start_pos,
54            start_pos,
55            override_pos: None,
56            config,
57            buf: Rc::new(RefCell::new(String::with_capacity(256))),
58            raw_buf: Rc::new(RefCell::new(String::with_capacity(256))),
59            sub_buf: Rc::new(RefCell::new(String::with_capacity(32))),
60            errors: Default::default(),
61            pending_leading_comments: Default::default(),
62            atoms: Default::default(),
63        }
64    }
65
66    fn with_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
67    where
68        F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
69    {
70        let b = self.buf.clone();
71        let mut buf = b.borrow_mut();
72
73        buf.clear();
74
75        op(self, &mut buf)
76    }
77
78    fn with_sub_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
79    where
80        F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
81    {
82        let b = self.sub_buf.clone();
83        let mut sub_buf = b.borrow_mut();
84
85        sub_buf.clear();
86
87        op(self, &mut sub_buf)
88    }
89
90    fn with_buf_and_raw_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
91    where
92        F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String, &mut String) -> LexResult<Ret>,
93    {
94        let b = self.buf.clone();
95        let r = self.raw_buf.clone();
96        let mut buf = b.borrow_mut();
97        let mut raw = r.borrow_mut();
98
99        buf.clear();
100        raw.clear();
101
102        op(self, &mut buf, &mut raw)
103    }
104}
105
106impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
107    type Item = TokenAndSpan;
108
109    fn next(&mut self) -> Option<Self::Item> {
110        let token = self.consume_token();
111
112        match token {
113            Ok(token) => {
114                let end = self
115                    .override_pos
116                    .take()
117                    .unwrap_or_else(|| self.input.last_pos());
118                let span = Span::new(self.start_pos, end);
119
120                let token_and_span = TokenAndSpan { span, token };
121
122                return Some(token_and_span);
123            }
124            Err(..) => {
125                return None;
126            }
127        }
128    }
129}
130
131#[derive(Debug, Clone, Copy)]
132pub struct LexerState {
133    pos: BytePos,
134}
135
136impl<'a, I> ParserInput for Lexer<'a, I>
137where
138    I: Input<'a>,
139{
140    type State = LexerState;
141
142    fn start_pos(&mut self) -> BytePos {
143        self.input.last_pos()
144    }
145
146    fn state(&mut self) -> Self::State {
147        LexerState {
148            pos: self.input.last_pos(),
149        }
150    }
151
152    fn reset(&mut self, state: &Self::State) {
153        unsafe {
154            // Safety: state.pos is created from a valid position.
155            self.input.reset_to(state.pos);
156        }
157    }
158
159    fn take_errors(&mut self) -> Vec<Error> {
160        self.errors.take()
161    }
162
163    fn skip_ws(&mut self) -> Option<BytePos> {
164        self.read_comments();
165
166        if let Some(c) = self.input.cur() {
167            if !is_whitespace(c) {
168                return None;
169            }
170        }
171
172        loop {
173            self.read_comments();
174
175            if self.input.uncons_while(is_whitespace).is_empty() {
176                break;
177            }
178        }
179
180        Some(self.input.last_pos())
181    }
182
183    fn atom(&self, s: Cow<str>) -> Atom {
184        self.atoms.atom(s)
185    }
186}
187
188impl<'a, I> Lexer<'a, I>
189where
190    I: Input<'a>,
191{
    // Current input code point: the value returned by the most recent call to
    // `consume`. It is `None` before anything was consumed or when that call
    // found the input exhausted.
    #[inline(always)]
    fn cur(&mut self) -> Option<char> {
        self.cur
    }
196
    // Next input code point: the one the following `consume` call would
    // return. The input is not advanced. `None` means the end of the input.
    #[inline(always)]
    fn next(&mut self) -> Option<char> {
        self.input.cur()
    }
201
    // Second lookahead: the code point after the one reported by `next`, as
    // provided by the input's `peek`.
    #[inline(always)]
    fn next_next(&mut self) -> Option<char> {
        self.input.peek()
    }
206
    // Third lookahead: the code point after the one reported by `next_next`,
    // as provided by the input's `peek_ahead`.
    #[inline(always)]
    fn next_next_next(&mut self) -> Option<char> {
        self.input.peek_ahead()
    }
211
212    #[inline(always)]
213    fn consume(&mut self) -> Option<char> {
214        let cur = self.input.cur();
215
216        self.cur = cur;
217        self.cur_pos = self.input.last_pos();
218
219        if cur.is_some() {
220            unsafe {
221                // Safety: cur is Some
222                self.input.bump();
223            }
224        }
225
226        cur
227    }
228
    // Reconsumes the current input code point: moves the input back to
    // `cur_pos`, the position stored by the most recent `consume`, so that the
    // same code point is read again. The `cur` field itself is left untouched.
    #[inline(always)]
    fn reconsume(&mut self) {
        unsafe {
            // Safety: self.cur_pos was obtained from self.input, which makes it
            // a valid position to reset to.
            self.input.reset_to(self.cur_pos);
        }
    }
237
238    #[cold]
239    fn emit_error(&mut self, kind: ErrorKind) {
240        self.errors.borrow_mut().push(Error::new(
241            Span::new(self.cur_pos, self.input.last_pos()),
242            kind,
243        ));
244    }
245
246    fn consume_token(&mut self) -> LexResult<Token> {
247        self.read_comments();
248        self.start_pos = self.input.last_pos();
249
250        if let Some(comments) = self.comments {
251            if !self.pending_leading_comments.is_empty() {
252                comments.add_leading_comments(self.start_pos, self.pending_leading_comments.take());
253            }
254        }
255
256        // Consume the next input code point.
257        match self.consume() {
258            // whitespace
259            // Consume as much whitespace as possible. Return a <whitespace-token>.
260            Some(c) if is_whitespace(c) => self.with_buf(|l, buf| {
261                buf.push(c);
262
263                loop {
264                    let c = l.next();
265
266                    match c {
267                        Some(c) if is_whitespace(c) => {
268                            l.consume();
269
270                            buf.push(c);
271                        }
272                        _ => {
273                            break;
274                        }
275                    }
276                }
277
278                return Ok(Token::WhiteSpace {
279                    value: l.atoms.atom(&**buf),
280                });
281            }),
282            // U+0022 QUOTATION MARK (")
283            // Consume a string token and return it.
284            Some('"') => self.read_str(None),
285            // U+0023 NUMBER SIGN (#)
286            Some('#') => {
287                let first = self.next();
288                let second = self.next_next();
289
290                // If the next input code point is a name code point or the next two input code
291                // points are a valid escape, then:
292                if (first.is_some() && is_name(first.unwrap()))
293                    || self.is_valid_escape(first, second)
294                {
295                    // Create a <hash-token>.
296
297                    // If the next 3 input code points would start an identifier, set the
298                    // <hash-token>’s type flag to "id".
299                    let third = self.next_next_next();
300                    let is_would_start_ident = self.would_start_ident(first, second, third);
301
302                    // Consume an ident sequence, and set the <hash-token>’s value to the returned
303                    // string.
304                    let ident_sequence = self.read_ident_sequence()?;
305
306                    // Return the <hash-token>.
307                    return Ok(Token::Hash {
308                        is_id: is_would_start_ident,
309                        value: ident_sequence.0,
310                        raw: ident_sequence.1,
311                    });
312                }
313
314                Ok(Token::Delim { value: '#' })
315            }
316            // U+0027 APOSTROPHE (')
317            // Consume a string token and return it.
318            Some('\'') => self.read_str(None),
319            // U+0028 LEFT PARENTHESIS (()
320            // Return a <(-token>.
321            Some('(') => Ok(tok!("(")),
322            // U+0029 RIGHT PARENTHESIS ())
323            // Return a <)-token>.
324            Some(')') => Ok(tok!(")")),
325            // U+002B PLUS SIGN (+)
326            Some('+') => {
327                // If the input stream starts with a number, reconsume the current input code
328                // point, consume a numeric token and return it.
329                if self.would_start_number(None, None, None) {
330                    self.reconsume();
331
332                    return self.read_numeric();
333                }
334
335                // Otherwise, return a <delim-token> with its value set to the current input
336                // code point.
337                Ok(tok!("+"))
338            }
339            // U+002C COMMA (,)
340            // Return a <comma-token>.
341            Some(',') => Ok(tok!(",")),
342            // U+002D HYPHEN-MINUS (-)
343            Some('-') => {
344                // If the input stream starts with a number, reconsume the current input code
345                // point, consume a numeric token, and return it.
346                if self.would_start_number(None, None, None) {
347                    self.reconsume();
348
349                    return self.read_numeric();
350                }
351                // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E
352                // GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
353                else if self.next() == Some('-') && self.next_next() == Some('>') {
354                    self.consume();
355                    self.consume();
356
357                    return Ok(Token::CDC);
358                }
359                // Otherwise, if the input stream starts with an identifier, reconsume the current
360                // input code point, consume an ident-like token, and return it.
361                else if self.would_start_ident(None, None, None) {
362                    self.reconsume();
363
364                    return self.read_ident_like();
365                }
366
367                // Otherwise, return a <delim-token> with its value set to the current input
368                // code point.
369                Ok(tok!("-"))
370            }
371            // U+002E FULL STOP (.)
372            Some('.') => {
373                // If the input stream starts with a number, reconsume the current input code
374                // point, consume a numeric token, and return it.
375                if self.would_start_number(None, None, None) {
376                    self.reconsume();
377
378                    return self.read_numeric();
379                }
380
381                // Otherwise, return a <delim-token> with its value set to the current input
382                // code point.
383                Ok(tok!("."))
384            }
385            // U+003A COLON (:)
386            // Return a <colon-token>.
387            Some(':') => Ok(tok!(":")),
388            // U+003B SEMICOLON (;)
389            // Return a <semicolon-token>.
390            Some(';') => Ok(tok!(";")),
391            // U+003C LESS-THAN SIGN (<)
392            Some('<') => {
393                // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D
394                // HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), consume them and return a
395                // <CDO-token>.
396                if self.next() == Some('!')
397                    && self.next_next() == Some('-')
398                    && self.next_next_next() == Some('-')
399                {
400                    self.consume(); // !
401                    self.consume(); // -
402                    self.consume(); // -
403
404                    return Ok(tok!("<!--"));
405                }
406
407                // Otherwise, return a <delim-token> with its value set to the current input
408                // code point.
409                Ok(tok!("<"))
410            }
411            // U+0040 COMMERCIAL AT (@)
412            Some('@') => {
413                let first = self.next();
414                let second = self.next_next();
415                let third = self.next_next_next();
416
417                // If the next 3 input code points would start an identifier, consume a name,
418                // create an <at-keyword-token> with its value set to the returned value, and
419                // return it.
420                if self.would_start_ident(first, second, third) {
421                    let ident_sequence = self.read_ident_sequence()?;
422
423                    return Ok(Token::AtKeyword {
424                        value: ident_sequence.0,
425                        raw: ident_sequence.1,
426                    });
427                }
428
429                // Otherwise, return a <delim-token> with its value set to the current input
430                // code point.
431                Ok(Token::Delim { value: '@' })
432            }
433            // U+005B LEFT SQUARE BRACKET ([)
434            // Return a <[-token>.
435            Some('[') => Ok(tok!("[")),
436            // U+005C REVERSE SOLIDUS (\)
437            Some('\\') => {
438                // If the input stream starts with a valid escape, reconsume the current input
439                // code point, consume an ident-like token, and return it.
440                if self.is_valid_escape(None, None) {
441                    self.reconsume();
442
443                    return self.read_ident_like();
444                }
445
446                // Otherwise, this is a parse error. Return a <delim-token> with its value set
447                // to the current input code point.
448                self.emit_error(ErrorKind::InvalidEscape);
449
450                Ok(Token::Delim { value: '\\' })
451            }
452            // U+005D RIGHT SQUARE BRACKET (])
453            // Return a <]-token>.
454            Some(']') => Ok(tok!("]")),
455            // U+007B LEFT CURLY BRACKET ({)
456            // Return a <{-token>.
457            Some('{') => Ok(tok!("{")),
458            // U+007D RIGHT CURLY BRACKET (})
459            // Return a <}-token>.
460            Some('}') => Ok(tok!("}")),
461            // digit
462            // Reconsume the current input code point, consume a numeric token, and return it.
463            Some('0'..='9') => {
464                self.reconsume();
465
466                self.read_numeric()
467            }
468            // name-start code point
469            // Reconsume the current input code point, consume an ident-like token, and return it.
470            Some(c) if is_name_start(c) => {
471                self.reconsume();
472
473                self.read_ident_like()
474            }
475            // EOF
476            // Return an <EOF-token>.
477            None => Err(ErrorKind::Eof),
478            // anything else
479            // Return a <delim-token> with its value set to the current input code point.
480            Some(c) => Ok(Token::Delim { value: c }),
481        }
482    }
483
484    // Consume comments.
485    // This section describes how to consume comments from a stream of code points.
486    // It returns nothing.
487    fn read_comments(&mut self) {
488        // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A
489        // ASTERISK (*), consume them and all following code points up to and including
490        // the first U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an
491        // EOF code point. Return to the start of this step.
492        // NOTE: We allow to parse line comments under the option.
493        if self.next() == Some('/') && self.next_next() == Some('*') {
494            let cmt_start = self.input.last_pos();
495
496            while self.next() == Some('/') && self.next_next() == Some('*') {
497                self.consume(); // '*'
498                self.consume(); // '/'
499
500                loop {
501                    match self.consume() {
502                        Some('*') if self.next() == Some('/') => {
503                            self.consume(); // '/'
504
505                            if self.comments.is_some() {
506                                let last_pos = self.input.last_pos();
507                                let text = unsafe {
508                                    // Safety: last_pos is a valid position
509                                    self.input.slice(cmt_start, last_pos)
510                                };
511
512                                self.pending_leading_comments.push(Comment {
513                                    kind: CommentKind::Block,
514                                    span: (self.start_pos, last_pos).into(),
515                                    text: self.atoms.atom(text),
516                                });
517                            }
518
519                            break;
520                        }
521                        None => {
522                            let span = Span::new(self.start_pos, self.input.last_pos());
523
524                            self.errors
525                                .borrow_mut()
526                                .push(Error::new(span, ErrorKind::UnterminatedBlockComment));
527
528                            return;
529                        }
530                        _ => {}
531                    }
532                }
533            }
534        } else if self.config.allow_wrong_line_comments
535            && self.next() == Some('/')
536            && self.next_next() == Some('/')
537        {
538            while self.next() == Some('/') && self.next_next() == Some('/') {
539                self.consume(); // '/'
540                self.consume(); // '/'
541
542                let start_of_content = self.input.last_pos();
543
544                loop {
545                    match self.consume() {
546                        Some(c) if is_newline(c) => {
547                            if self.comments.is_some() {
548                                let last_pos = self.input.last_pos();
549                                let text = unsafe {
550                                    // Safety: last_pos is a valid position
551                                    self.input.slice(start_of_content, last_pos)
552                                };
553
554                                self.pending_leading_comments.push(Comment {
555                                    kind: CommentKind::Line,
556                                    span: (self.start_pos, last_pos).into(),
557                                    text: self.atoms.atom(text),
558                                });
559                            }
560                            break;
561                        }
562                        None => return,
563                        _ => {}
564                    }
565                }
566            }
567        }
568    }
569
570    // This section describes how to consume a numeric token from a stream of code
571    // points. It returns either a <number-token>, <percentage-token>, or
572    // <dimension-token>.
573    fn read_numeric(&mut self) -> LexResult<Token> {
574        // Consume a number and let number be the result.
575        let number = self.read_number()?;
576
577        let next_first = self.next();
578        let next_second = self.next_next();
579        let next_third = self.next_next_next();
580
581        // If the next 3 input code points would start an identifier, then:
582        if self.would_start_ident(next_first, next_second, next_third) {
583            // Swap logic to avoid create empty strings, because it doesn't make sense
584            //
585            // Consume a name. Set the <dimension-token>’s unit to the returned value.
586            let ident_sequence = self.read_ident_sequence()?;
587            // Create a <dimension-token> with the same value and type flag as number, and a
588            // unit set initially to the empty string.
589            let token = Token::Dimension(Box::new(DimensionToken {
590                value: number.0,
591                raw_value: number.1,
592                unit: ident_sequence.0,
593                raw_unit: ident_sequence.1,
594                type_flag: number.2,
595            }));
596
597            // Return the <dimension-token>.
598            return Ok(token);
599        }
600        // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. Create
601        // a <percentage-token> with the same value as number, and return it.
602        else if next_first == Some('%') {
603            self.consume();
604
605            return Ok(Token::Percentage {
606                value: number.0,
607                raw: number.1,
608            });
609        }
610
611        // Otherwise, create a <number-token> with the same value and type flag as
612        // number, and return it.
613        Ok(Token::Number {
614            value: number.0,
615            raw: number.1,
616            type_flag: number.2,
617        })
618    }
619
620    // This section describes how to consume an ident-like token from a stream of
621    // code points. It returns an <ident-token>, <function-token>, <url-token>, or
622    // <bad-url-token>.
623    fn read_ident_like(&mut self) -> LexResult<Token> {
624        // Consume a name, and let string be the result.
625        let ident_sequence = self.read_ident_sequence()?;
626
627        // If string’s value is an ASCII case-insensitive match for "url", and the next
628        // input code point is U+0028 LEFT PARENTHESIS ((), consume it.
629        if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some('(') {
630            self.consume();
631
632            let start_whitespace = self.input.last_pos();
633
634            // While the next two input code points are whitespace, consume the next input
635            // code point.
636            let whitespaces = self.with_buf(|l, buf| {
637                while let (Some(next), Some(next_next)) = (l.next(), l.next_next()) {
638                    if is_whitespace(next) && is_whitespace(next_next) {
639                        l.consume();
640
641                        buf.push(next);
642                    } else {
643                        break;
644                    }
645                }
646
647                Ok(buf.to_string())
648            })?;
649
650            match self.next() {
651                // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027
652                // APOSTROPHE ('), or whitespace followed by U+0022 QUOTATION MARK (") or U+0027
653                // APOSTROPHE ('), then create a <function-token> with its value set to string and
654                // return it.
655                Some(c)
656                    if is_whitespace(c)
657                        && (self.next_next() == Some('"') || self.next_next() == Some('\'')) =>
658                {
659                    // Override last position because we consumed whitespaces, but they
660                    // should not be part of token
661                    self.override_pos = Some(start_whitespace);
662
663                    return Ok(Token::Function {
664                        value: ident_sequence.0,
665                        raw: ident_sequence.1,
666                    });
667                }
668                Some('"' | '\'') => {
669                    return Ok(Token::Function {
670                        value: ident_sequence.0,
671                        raw: ident_sequence.1,
672                    });
673                }
674                // Otherwise, consume a url token, and return it.
675                _ => {
676                    return self.read_url(ident_sequence, whitespaces);
677                }
678            }
679        }
680        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
681        // Create a <function-token> with its value set to string and return it.
682        else if self.next() == Some('(') {
683            self.consume();
684
685            return Ok(Token::Function {
686                value: ident_sequence.0,
687                raw: ident_sequence.1,
688            });
689        }
690
691        // Otherwise, create an <ident-token> with its value set to string and return
692        // it.
693        Ok(Token::Ident {
694            value: ident_sequence.0,
695            raw: ident_sequence.1,
696        })
697    }
698
699    // This section describes how to consume a string token from a stream of code
700    // points. It returns either a <string-token> or <bad-string-token>.
701    fn read_str(&mut self, maybe_ending_code_point: Option<char>) -> LexResult<Token> {
702        self.with_buf_and_raw_buf(|l, buf, raw| {
703            // This algorithm may be called with an ending code point, which denotes the
704            // code point that ends the string. If an ending code point is not specified,
705            // the current input code point is used.
706            let ending_code_point = maybe_ending_code_point.or_else(|| l.cur());
707
708            // Initially create a <string-token> with its value set to the empty string.
709            // Done above
710
711            raw.push(ending_code_point.unwrap());
712
713            // Repeatedly consume the next input code point from the stream:
714            loop {
715                match l.consume() {
716                    // ending code point
717                    // Return the <string-token>.
718                    Some(c) if c == ending_code_point.unwrap() => {
719                        raw.push(c);
720
721                        break;
722                    }
723
724                    // EOF
725                    // This is a parse error. Return the <string-token>.
726                    None => {
727                        l.emit_error(ErrorKind::UnterminatedString);
728
729                        return Ok(Token::String {
730                            value: l.atoms.atom(&**buf),
731                            raw: l.atoms.atom(&**raw),
732                        });
733                    }
734
735                    // Newline
736                    // This is a parse error. Reconsume the current input code point, create a
737                    // <bad-string-token>, and return it.
738                    Some(c) if is_newline(c) => {
739                        l.emit_error(ErrorKind::NewlineInString);
740                        l.reconsume();
741
742                        return Ok(Token::BadString {
743                            raw: l.atoms.atom(&**raw),
744                        });
745                    }
746
747                    // U+005C REVERSE SOLIDUS (\)
748                    Some(c) if c == '\\' => {
749                        let next = l.next();
750
751                        // If the next input code point is EOF, do nothing.
752                        if l.next().is_none() {
753                            continue;
754                        }
755                        // Otherwise, if the next input code point is a newline, consume it.
756                        else if l.next().is_some() && is_newline(l.next().unwrap()) {
757                            l.consume();
758
759                            raw.push(c);
760                            raw.push(next.unwrap());
761                        }
762                        // Otherwise, (the stream starts with a valid escape) consume an escaped
763                        // code point and append the returned code point to
764                        // the <string-token>’s value.
765                        else if l.is_valid_escape(None, None) {
766                            let escape = l.read_escape()?;
767
768                            buf.push(escape.0);
769                            raw.push(c);
770                            raw.push_str(&escape.1);
771                        }
772                    }
773
774                    // Anything else
775                    // Append the current input code point to the <string-token>’s value.
776                    Some(c) => {
777                        buf.push(c);
778                        raw.push(c);
779                    }
780                }
781            }
782
783            Ok(Token::String {
784                value: l.atoms.atom(&**buf),
785                raw: l.atoms.atom(&**raw),
786            })
787        })
788    }
789
790    // This section describes how to consume a url token from a stream of code
791    // points. It returns either a <url-token> or a <bad-url-token>.
792    fn read_url(&mut self, name: (Atom, Atom), before: String) -> LexResult<Token> {
793        // Initially create a <url-token> with its value set to the empty string.
794        self.with_buf_and_raw_buf(|l, out, raw| {
795            raw.push_str(&before);
796
797            // Consume as much whitespace as possible.
798            while let Some(c) = l.next() {
799                if is_whitespace(c) {
800                    l.consume();
801
802                    raw.push(c);
803                } else {
804                    break;
805                }
806            }
807
808            // Repeatedly consume the next input code point from the stream:
809            loop {
810                match l.consume() {
811                    // U+0029 RIGHT PARENTHESIS ())
812                    // Return the <url-token>.
813                    Some(')') => {
814                        return Ok(Token::Url {
815                            value: l.atoms.atom(&**out),
816                            raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
817                        });
818                    }
819
820                    // EOF
821                    // This is a parse error. Return the <url-token>.
822                    None => {
823                        l.emit_error(ErrorKind::UnterminatedUrl);
824
825                        return Ok(Token::Url {
826                            value: l.atoms.atom(&**out),
827                            raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
828                        });
829                    }
830
831                    // whitespace
832                    Some(c) if is_whitespace(c) => {
833                        // Consume as much whitespace as possible.
834                        let whitespaces: String = l.with_sub_buf(|l, buf| {
835                            buf.push(c);
836
837                            while let Some(c) = l.next() {
838                                if is_whitespace(c) {
839                                    l.consume();
840
841                                    buf.push(c);
842                                } else {
843                                    break;
844                                }
845                            }
846
847                            Ok(buf.to_string())
848                        })?;
849
850                        // if the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
851                        // consume it and return the <url-token> (if EOF was
852                        // encountered, this is a parse error);
853                        match l.next() {
854                            Some(')') => {
855                                l.consume();
856
857                                raw.push_str(&whitespaces);
858
859                                return Ok(Token::Url {
860                                    value: l.atoms.atom(&**out),
861                                    raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
862                                });
863                            }
864                            None => {
865                                l.emit_error(ErrorKind::UnterminatedUrl);
866
867                                raw.push_str(&whitespaces);
868
869                                return Ok(Token::Url {
870                                    value: l.atoms.atom(&**out),
871                                    raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
872                                });
873                            }
874                            _ => {}
875                        }
876
877                        // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
878                        // and return it.
879                        raw.push_str(&whitespaces);
880
881                        let remnants = l.read_bad_url_remnants()?;
882
883                        raw.push_str(&remnants);
884
885                        return Ok(Token::BadUrl {
886                            raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
887                        });
888                    }
889
890                    // U+0022 QUOTATION MARK (")
891                    // U+0027 APOSTROPHE (')
892                    // U+0028 LEFT PARENTHESIS (()
893                    // non-printable code point
894                    // This is a parse error. Consume the remnants of a bad url, create a
895                    // <bad-url-token>, and return it.
896                    Some(c) if c == '"' || c == '\'' || c == '(' || is_non_printable(c) => {
897                        l.emit_error(ErrorKind::UnexpectedCharInUrl);
898
899                        let remnants = l.read_bad_url_remnants()?;
900
901                        raw.push(c);
902                        raw.push_str(&remnants);
903
904                        return Ok(Token::BadUrl {
905                            raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
906                        });
907                    }
908
909                    // U+005C REVERSE SOLIDUS (\)
910                    Some(c) if c == '\\' => {
911                        // If the stream starts with a valid escape, consume an escaped code point
912                        // and append the returned code point to the
913                        // <url-token>’s value.
914                        if l.is_valid_escape(None, None) {
915                            let escaped = l.read_escape()?;
916
917                            out.push(escaped.0);
918                            raw.push(c);
919                            raw.push_str(&escaped.1);
920                        }
921                        // Otherwise, this is a parse error. Consume the remnants of a bad url,
922                        // create a <bad-url-token>, and return it.
923                        else {
924                            l.emit_error(ErrorKind::InvalidEscape);
925
926                            let remnants = l.read_bad_url_remnants()?;
927
928                            raw.push(c);
929                            raw.push_str(&remnants);
930
931                            return Ok(Token::BadUrl {
932                                raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
933                            });
934                        }
935                    }
936
937                    // anything else
938                    // Append the current input code point to the <url-token>’s value.
939                    Some(c) => {
940                        out.push(c);
941                        raw.push(c);
942                    }
943                }
944            }
945        })
946    }
947
948    // Consume an escaped code point
949    // This section describes how to consume an escaped code point. It assumes that
950    // the U+005C REVERSE SOLIDUS (\) has already been consumed and that the next
951    // input code point has already been verified to be part of a valid escape. It
952    // will return a code point.
953    fn read_escape(&mut self) -> LexResult<(char, String)> {
954        self.with_sub_buf(|l, buf| {
955            // Consume the next input code point.
956            match l.consume() {
957                // hex digit
958                Some(c) if is_hex_digit(c) => {
959                    let mut hex = c.to_digit(16).unwrap();
960
961                    buf.push(c);
962
963                    // Consume as many hex digits as possible, but no more than 5.
964                    // Note that this means 1-6 hex digits have been consumed in total.
965                    for _ in 0..5 {
966                        let next = l.next();
967                        let digit = match next.and_then(|c| c.to_digit(16)) {
968                            Some(v) => v,
969                            None => break,
970                        };
971
972                        l.consume();
973
974                        buf.push(next.unwrap());
975                        hex = hex * 16 + digit;
976                    }
977
978                    // If the next input code point is whitespace, consume it as well.
979                    let next = l.next();
980
981                    if let Some(next) = next {
982                        if is_whitespace(next) {
983                            l.consume();
984
985                            buf.push(next);
986                        }
987                    }
988
989                    // Interpret the hex digits as a hexadecimal number. If this number is zero, or
990                    // is for a surrogate, or is greater than the maximum allowed code point, return
991                    // U+FFFD REPLACEMENT CHARACTER (�).
992                    let hex = match hex {
993                        // If this number is zero
994                        0 => REPLACEMENT_CHARACTER,
995                        // or is for a surrogate
996                        55296..=57343 => REPLACEMENT_CHARACTER,
997                        // or is greater than the maximum allowed code point
998                        1114112.. => REPLACEMENT_CHARACTER,
999                        _ => char::from_u32(hex).unwrap_or(REPLACEMENT_CHARACTER),
1000                    };
1001
1002                    // Otherwise, return the code point with that value.
1003                    Ok((hex, (&**buf).into()))
1004                }
1005                // EOF
1006                // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
1007                None => {
1008                    l.emit_error(ErrorKind::InvalidEscape);
1009
1010                    let value = REPLACEMENT_CHARACTER;
1011
1012                    buf.push(value);
1013
1014                    Ok((value, (&**buf).into()))
1015                }
1016                // anything else
1017                // Return the current input code point.
1018                Some(c) => {
1019                    buf.push(c);
1020
1021                    Ok((c, (&**buf).into()))
1022                }
1023            }
1024        })
1025    }
1026
1027    // Check if two code points are a valid escape
1028    // This section describes how to check if two code points are a valid escape.
1029    // The algorithm described here can be called explicitly with two code points,
1030    // or can be called with the input stream itself. In the latter case, the two
1031    // code points in question are the current input code point and the next input
1032    // code point, in that order.
1033    fn is_valid_escape(&mut self, maybe_first: Option<char>, maybe_second: Option<char>) -> bool {
1034        // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
1035        if maybe_first.or_else(|| self.cur()) != Some('\\') {
1036            return false;
1037        }
1038
1039        match maybe_second.or_else(|| self.next()) {
1040            // Otherwise, if the second code point is a newline, return false.
1041            Some(second) => !is_newline(second),
1042            // Otherwise, return true.
1043            None => false,
1044        }
1045    }
1046
1047    // Check if three code points would start an identifier
1048    // This section describes how to check if three code points would start an
1049    // identifier. The algorithm described here can be called explicitly with three
1050    // code points, or can be called with the input stream itself. In the latter
1051    // case, the three code points in question are the current input code point and
1052    // the next two input code points, in that order.
1053    fn would_start_ident(
1054        &mut self,
1055        maybe_first: Option<char>,
1056        maybe_second: Option<char>,
1057        maybe_third: Option<char>,
1058    ) -> bool {
1059        // Look at the first code point:
1060        let first = maybe_first.or_else(|| self.cur());
1061
1062        match first {
1063            // U+002D HYPHEN-MINUS
1064            Some('-') => {
1065                let second = maybe_second.or_else(|| self.next());
1066
1067                match second {
1068                    // If the second code point is a name-start code point
1069                    // return true.
1070                    Some(c) if is_name_start(c) => true,
1071                    // or a U+002D HYPHEN-MINUS,
1072                    // return true.
1073                    Some('-') => true,
1074                    // or the second and third code points are a valid escape
1075                    // return true.
1076                    Some(_) => {
1077                        let third = maybe_third.or_else(|| self.next_next());
1078
1079                        self.is_valid_escape(second, third)
1080                    }
1081                    // Otherwise, return false.
1082                    _ => false,
1083                }
1084            }
1085            // name-start code point
1086            // Return true.
1087            Some(c) if is_name_start(c) => true,
1088            // U+005C REVERSE SOLIDUS (\)
1089            // If the first and second code points are a valid escape, return true. Otherwise,
1090            // return false.
1091            Some('\\') => {
1092                let second = maybe_second.or_else(|| self.next());
1093
1094                self.is_valid_escape(first, second)
1095            }
1096            _ => false,
1097        }
1098    }
1099
1100    // Check if three code points would start a number
1101    // This section describes how to check if three code points would start a
1102    // number. The algorithm described here can be called explicitly with three code
1103    // points, or can be called with the input stream itself. In the latter case,
1104    // the three code points in question are the current input code point and the
1105    // next two input code points, in that order.
1106    #[allow(clippy::needless_return)]
1107    fn would_start_number(
1108        &mut self,
1109        maybe_first: Option<char>,
1110        maybe_second: Option<char>,
1111        maybe_third: Option<char>,
1112    ) -> bool {
1113        // Look at the first code point:
1114        let first = maybe_first.or_else(|| self.cur());
1115
1116        match first {
1117            // U+002B PLUS SIGN (+)
1118            // U+002D HYPHEN-MINUS (-)
1119            Some('+') | Some('-') => {
1120                match maybe_second.or_else(|| self.next()) {
1121                    // If the second code point is a digit, return true.
1122                    Some(second) if second.is_ascii_digit() => return true,
1123                    // Otherwise, if the second code point is a U+002E FULL STOP (.) and the
1124                    // third code point is a digit, return true.
1125                    Some('.') => {
1126                        if let Some(third) = maybe_third.or_else(|| self.next_next()) {
1127                            if third.is_ascii_digit() {
1128                                return true;
1129                            }
1130                        }
1131
1132                        return false;
1133                    }
1134                    // Otherwise, return false.
1135                    _ => return false,
1136                };
1137            }
1138            // U+002E FULL STOP (.)
1139            Some('.') => {
1140                // If the second code point is a digit, return true.
1141                if let Some(second) = self.next() {
1142                    if second.is_ascii_digit() {
1143                        return true;
1144                    }
1145                }
1146
1147                // Otherwise, return false.
1148                false
1149            }
1150            // digit
1151            // Return true.
1152            Some(first) if first.is_ascii_digit() => true,
1153            // anything else
1154            // Return false.
1155            _ => false,
1156        }
1157    }
1158
1159    // Consume an ident sequence
1160    // This section describes how to consume an ident sequence from a stream of code
1161    // points. It returns a string containing the largest name that can be formed
1162    // from adjacent code points in the stream, starting from the first.
1163    fn read_ident_sequence(&mut self) -> LexResult<(Atom, Atom)> {
1164        self.with_buf_and_raw_buf(|l, buf, raw| {
1165            // Let result initially be an empty string.
1166            // Done above
1167
1168            // Repeatedly consume the next input code point from the stream:
1169            loop {
1170                match l.consume() {
1171                    // name code point
1172                    // Append the code point to result.
1173                    Some(c) if is_name(c) => {
1174                        buf.push(c);
1175                        raw.push(c);
1176                    }
1177                    // the stream starts with a valid escape
1178                    // Consume an escaped code point. Append the returned code point to result.
1179                    Some(c) if l.is_valid_escape(None, None) => {
1180                        let escaped = l.read_escape()?;
1181
1182                        buf.push(escaped.0);
1183                        raw.push(c);
1184                        raw.push_str(&escaped.1);
1185                    }
1186                    // anything else
1187                    // Reconsume the current input code point. Return result.
1188                    _ => {
1189                        l.reconsume();
1190
1191                        break;
1192                    }
1193                }
1194            }
1195
1196            Ok((l.atoms.atom(&**buf), l.atoms.atom(&**raw)))
1197        })
1198    }
1199
1200    // This section describes how to consume a number from a stream of code points.
1201    // It returns a numeric value, and a type which is either "integer" or "number".
1202    fn read_number(&mut self) -> LexResult<(f64, Atom, NumberType)> {
1203        let parsed: (Atom, NumberType) = self.with_buf(|l, out| {
1204            // Initially set type to "integer". Let repr be the empty string.
1205            let mut type_flag = NumberType::Integer;
1206
1207            // If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
1208            // (-), consume it and append it to repr.
1209            let next = l.next();
1210
1211            if next == Some('+') || next == Some('-') {
1212                l.consume();
1213
1214                out.push(next.unwrap());
1215            }
1216
1217            // While the next input code point is a digit, consume it and append it to repr.
1218            while let Some(c) = l.next() {
1219                if c.is_ascii_digit() {
1220                    l.consume();
1221
1222                    out.push(c);
1223                } else {
1224                    break;
1225                }
1226            }
1227
1228            // If the next 2 input code points are U+002E FULL STOP (.) followed by a digit,
1229            // then:
1230            let next = l.next();
1231
1232            if next == Some('.') {
1233                if let Some(n) = l.next_next() {
1234                    if n.is_ascii_digit() {
1235                        // Consume them.
1236                        l.consume();
1237                        l.consume();
1238
1239                        // Append them to repr.
1240                        out.push(next.unwrap());
1241                        out.push(n);
1242
1243                        // Set type to "number".
1244                        type_flag = NumberType::Number;
1245
1246                        // While the next input code point is a digit, consume it and append it to
1247                        // repr.
1248                        while let Some(c) = l.next() {
1249                            if c.is_ascii_digit() {
1250                                l.consume();
1251
1252                                out.push(c);
1253                            } else {
1254                                break;
1255                            }
1256                        }
1257                    }
1258                }
1259            }
1260
1261            // If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or
1262            // U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS
1263            // (-) or U+002B PLUS SIGN (+), followed by a digit, then:
1264            let next = l.next();
1265
1266            if next == Some('E') || next == Some('e') {
1267                let next_next = l.next_next();
1268                let next_next_next = l.next_next_next();
1269
1270                if (next_next == Some('-')
1271                    || next_next == Some('+')
1272                        && next_next_next.is_some()
1273                        && next_next_next.unwrap().is_ascii_digit())
1274                    || next_next.is_some() && next_next.unwrap().is_ascii_digit()
1275                {
1276                    // Consume them.
1277                    l.consume();
1278                    l.consume();
1279
1280                    // Append them to repr.
1281                    out.push(next.unwrap());
1282                    out.push(next_next.unwrap());
1283
1284                    // Set type to "number".
1285                    type_flag = NumberType::Number;
1286
1287                    // While the next input code point is a digit, consume it and append it
1288                    // to repr.
1289                    while let Some(c) = l.next() {
1290                        if c.is_ascii_digit() {
1291                            l.consume();
1292
1293                            out.push(c);
1294                        } else {
1295                            break;
1296                        }
1297                    }
1298                }
1299            }
1300
1301            // Return value and type.
1302            Ok((l.atoms.atom(&**out), type_flag))
1303        })?;
1304
1305        // Convert repr to a number, and set the value to the returned value.
1306        let value = lexical::parse(&*parsed.0).unwrap_or_else(|err| {
1307            unreachable!("failed to parse `{}` using lexical: {:?}", parsed.0, err)
1308        });
1309
1310        Ok((value, parsed.0, parsed.1))
1311    }
1312
1313    // Consume the remnants of a bad url
1314    // This section describes how to consume the remnants of a bad url from a stream
1315    // of code points, "cleaning up" after the tokenizer realizes that it’s in the
1316    // middle of a <bad-url-token> rather than a <url-token>. It returns nothing;
1317    // its sole use is to consume enough of the input stream to reach a recovery
1318    // point where normal tokenizing can resume. But for recovery purpose we return
1319    // bad URL remnants.
1320    fn read_bad_url_remnants(&mut self) -> LexResult<String> {
1321        self.with_sub_buf(|l, raw| {
1322            // Repeatedly consume the next input code point from the stream:
1323            loop {
1324                match l.consume() {
1325                    // U+0029 RIGHT PARENTHESIS ())
1326                    // EOF
1327                    // Return.
1328                    Some(c @ ')') => {
1329                        raw.push(c);
1330
1331                        break;
1332                    }
1333                    None => {
1334                        break;
1335                    }
1336                    // the input stream starts with a valid escape
1337                    Some(c) if l.is_valid_escape(None, None) => {
1338                        // Consume an escaped code point. This allows an escaped right parenthesis
1339                        // ("\)") to be encountered without ending the <bad-url-token>.
1340                        let escaped = l.read_escape()?;
1341
1342                        raw.push(c);
1343                        raw.push_str(&escaped.1);
1344                    }
1345                    // anything else
1346                    // Do nothing.
1347                    Some(c) => {
1348                        raw.push(c);
1349                    }
1350                }
1351            }
1352
1353            Ok((&**raw).into())
1354        })
1355    }
1356}
1357
/// Returns `true` for a digit: a code point between U+0030 DIGIT ZERO (0) and
/// U+0039 DIGIT NINE (9) inclusive.
#[inline(always)]
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
1362
1363#[inline(always)]
1364fn is_hex_digit(c: char) -> bool {
1365    match c {
1366        c if is_digit(c) => true,
1367        'A'..='F' => true,
1368        'a'..='f' => true,
1369        _ => false,
1370    }
1371}
1372
/// Returns `true` for an uppercase letter: a code point between `A` and `Z`
/// inclusive.
#[inline(always)]
fn is_uppercase_letter(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
1377
/// Returns `true` for a lowercase letter: a code point between `a` and `z`
/// inclusive.
#[inline(always)]
fn is_lowercase_letter(c: char) -> bool {
    matches!(c, 'a'..='z')
}
1382
1383#[inline(always)]
1384fn is_letter(c: char) -> bool {
1385    is_uppercase_letter(c) || is_lowercase_letter(c)
1386}
1387
/// Returns `true` for a non-ASCII code point: any code point with a value of
/// U+0080 or greater.
#[inline(always)]
fn is_non_ascii(c: char) -> bool {
    !c.is_ascii()
}
1392
1393#[inline(always)]
1394fn is_name_start(c: char) -> bool {
1395    matches!(c, c if is_letter(c) || is_non_ascii(c) || c == '_' || c == '\x00')
1396}
1397
1398#[inline(always)]
1399fn is_name(c: char) -> bool {
1400    is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == '-')
1401}
1402
/// Returns `true` for a non-printable code point: U+0000 through U+0008,
/// U+000B, U+000E through U+001F, or U+007F.
#[inline(always)]
fn is_non_printable(c: char) -> bool {
    let code = u32::from(c);

    code <= 0x08 || code == 0x0B || (0x0E..=0x1F).contains(&code) || code == 0x7F
}
1407
/// Returns `true` for a newline: U+000A LINE FEED, U+000D CARRIAGE RETURN, or
/// U+000C FORM FEED.
#[inline(always)]
fn is_newline(c: char) -> bool {
    c == '\n' || c == '\r' || c == '\x0C'
}
1412
1413#[inline(always)]
1414fn is_whitespace(c: char) -> bool {
1415    matches!(c, c if c == ' ' || c == '\t' || is_newline(c))
1416}