swc_css_parser/lexer/mod.rs
1use std::{borrow::Cow, cell::RefCell, char::REPLACEMENT_CHARACTER, rc::Rc};
2
3use swc_atoms::{Atom, AtomStoreCell};
4use swc_common::{
5 comments::{Comment, CommentKind, Comments},
6 input::Input,
7 util::take::Take,
8 BytePos, Span,
9};
10use swc_css_ast::{
11 matches_eq_ignore_ascii_case, DimensionToken, NumberType, Token, TokenAndSpan, UrlKeyValue,
12};
13
14use crate::{
15 error::{Error, ErrorKind},
16 parser::{input::ParserInput, ParserConfig},
17};
18
19pub(crate) type LexResult<T> = Result<T, ErrorKind>;
20
21#[derive(Clone)]
22pub struct Lexer<'a, I>
23where
24 I: Input<'a>,
25{
26 comments: Option<&'a dyn Comments>,
27 pending_leading_comments: Vec<Comment>,
28 input: I,
29 cur: Option<char>,
30 cur_pos: BytePos,
31 start_pos: BytePos,
32 /// Used to override last_pos
33 override_pos: Option<BytePos>,
34 config: ParserConfig,
35 buf: Rc<RefCell<String>>,
36 raw_buf: Rc<RefCell<String>>,
37 sub_buf: Rc<RefCell<String>>,
38 errors: Rc<RefCell<Vec<Error>>>,
39 atoms: Rc<AtomStoreCell>,
40}
41
42impl<'a, I> Lexer<'a, I>
43where
44 I: Input<'a>,
45{
46 pub fn new(input: I, comments: Option<&'a dyn Comments>, config: ParserConfig) -> Self {
47 let start_pos = input.last_pos();
48
49 Lexer {
50 comments,
51 input,
52 cur: None,
53 cur_pos: start_pos,
54 start_pos,
55 override_pos: None,
56 config,
57 buf: Rc::new(RefCell::new(String::with_capacity(256))),
58 raw_buf: Rc::new(RefCell::new(String::with_capacity(256))),
59 sub_buf: Rc::new(RefCell::new(String::with_capacity(32))),
60 errors: Default::default(),
61 pending_leading_comments: Default::default(),
62 atoms: Default::default(),
63 }
64 }
65
66 fn with_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
67 where
68 F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
69 {
70 let b = self.buf.clone();
71 let mut buf = b.borrow_mut();
72
73 buf.clear();
74
75 op(self, &mut buf)
76 }
77
78 fn with_sub_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
79 where
80 F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
81 {
82 let b = self.sub_buf.clone();
83 let mut sub_buf = b.borrow_mut();
84
85 sub_buf.clear();
86
87 op(self, &mut sub_buf)
88 }
89
90 fn with_buf_and_raw_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
91 where
92 F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String, &mut String) -> LexResult<Ret>,
93 {
94 let b = self.buf.clone();
95 let r = self.raw_buf.clone();
96 let mut buf = b.borrow_mut();
97 let mut raw = r.borrow_mut();
98
99 buf.clear();
100 raw.clear();
101
102 op(self, &mut buf, &mut raw)
103 }
104}
105
106impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
107 type Item = TokenAndSpan;
108
109 fn next(&mut self) -> Option<Self::Item> {
110 let token = self.consume_token();
111
112 match token {
113 Ok(token) => {
114 let end = self
115 .override_pos
116 .take()
117 .unwrap_or_else(|| self.input.last_pos());
118 let span = Span::new(self.start_pos, end);
119
120 let token_and_span = TokenAndSpan { span, token };
121
122 return Some(token_and_span);
123 }
124 Err(..) => {
125 return None;
126 }
127 }
128 }
129}
130
/// Snapshot of the lexer used by `ParserInput::state` / `ParserInput::reset`.
///
/// Only the input position is recorded.
131#[derive(Debug, Clone, Copy)]
132pub struct LexerState {
// Value of `input.last_pos()` at the time the snapshot was taken.
133 pos: BytePos,
134}
135
136impl<'a, I> ParserInput for Lexer<'a, I>
137where
138 I: Input<'a>,
139{
140 type State = LexerState;
141
142 fn start_pos(&mut self) -> BytePos {
143 self.input.last_pos()
144 }
145
146 fn state(&mut self) -> Self::State {
147 LexerState {
148 pos: self.input.last_pos(),
149 }
150 }
151
152 fn reset(&mut self, state: &Self::State) {
153 unsafe {
154 // Safety: state.pos is created from a valid position.
155 self.input.reset_to(state.pos);
156 }
157 }
158
159 fn take_errors(&mut self) -> Vec<Error> {
160 self.errors.take()
161 }
162
163 fn skip_ws(&mut self) -> Option<BytePos> {
164 self.read_comments();
165
166 if let Some(c) = self.input.cur() {
167 if !is_whitespace(c) {
168 return None;
169 }
170 }
171
172 loop {
173 self.read_comments();
174
175 if self.input.uncons_while(is_whitespace).is_empty() {
176 break;
177 }
178 }
179
180 Some(self.input.last_pos())
181 }
182
183 fn atom(&self, s: Cow<str>) -> Atom {
184 self.atoms.atom(s)
185 }
186}
187
188impl<'a, I> Lexer<'a, I>
189where
190 I: Input<'a>,
191{
/// Returns the current input code point, i.e. the value stored by the most
/// recent `consume` call (`None` before the first call or after hitting EOF).
192 #[inline(always)]
193 fn cur(&mut self) -> Option<char> {
194 self.cur
195 }
196
/// Returns the next input code point (the one `consume` would return next)
/// without consuming it.
197 #[inline(always)]
198 fn next(&mut self) -> Option<char> {
199 self.input.cur()
200 }
201
/// Returns the code point following `next()` (via `input.peek()`) without
/// consuming anything.
202 #[inline(always)]
203 fn next_next(&mut self) -> Option<char> {
204 self.input.peek()
205 }
206
/// Returns the code point following `next_next()` (via `input.peek_ahead()`)
/// without consuming anything.
207 #[inline(always)]
208 fn next_next_next(&mut self) -> Option<char> {
209 self.input.peek_ahead()
210 }
211
212 #[inline(always)]
213 fn consume(&mut self) -> Option<char> {
214 let cur = self.input.cur();
215
216 self.cur = cur;
217 self.cur_pos = self.input.last_pos();
218
219 if cur.is_some() {
220 unsafe {
221 // Safety: cur is Some
222 self.input.bump();
223 }
224 }
225
226 cur
227 }
228
229 #[inline(always)]
230 fn reconsume(&mut self) {
231 unsafe {
232 // Safety: self.cur_pos is a position generated by self.input, meaning it is
233 // valid.
234 self.input.reset_to(self.cur_pos);
235 }
236 }
237
238 #[cold]
239 fn emit_error(&mut self, kind: ErrorKind) {
240 self.errors.borrow_mut().push(Error::new(
241 Span::new(self.cur_pos, self.input.last_pos()),
242 kind,
243 ));
244 }
245
246 fn consume_token(&mut self) -> LexResult<Token> {
247 self.read_comments();
248 self.start_pos = self.input.last_pos();
249
250 if let Some(comments) = self.comments {
251 if !self.pending_leading_comments.is_empty() {
252 comments.add_leading_comments(self.start_pos, self.pending_leading_comments.take());
253 }
254 }
255
256 // Consume the next input code point.
257 match self.consume() {
258 // whitespace
259 // Consume as much whitespace as possible. Return a <whitespace-token>.
260 Some(c) if is_whitespace(c) => self.with_buf(|l, buf| {
261 buf.push(c);
262
263 loop {
264 let c = l.next();
265
266 match c {
267 Some(c) if is_whitespace(c) => {
268 l.consume();
269
270 buf.push(c);
271 }
272 _ => {
273 break;
274 }
275 }
276 }
277
278 return Ok(Token::WhiteSpace {
279 value: l.atoms.atom(&**buf),
280 });
281 }),
282 // U+0022 QUOTATION MARK (")
283 // Consume a string token and return it.
284 Some('"') => self.read_str(None),
285 // U+0023 NUMBER SIGN (#)
286 Some('#') => {
287 let first = self.next();
288 let second = self.next_next();
289
290 // If the next input code point is a name code point or the next two input code
291 // points are a valid escape, then:
292 if (first.is_some() && is_name(first.unwrap()))
293 || self.is_valid_escape(first, second)
294 {
295 // Create a <hash-token>.
296
297 // If the next 3 input code points would start an identifier, set the
298 // <hash-token>’s type flag to "id".
299 let third = self.next_next_next();
300 let is_would_start_ident = self.would_start_ident(first, second, third);
301
302 // Consume an ident sequence, and set the <hash-token>’s value to the returned
303 // string.
304 let ident_sequence = self.read_ident_sequence()?;
305
306 // Return the <hash-token>.
307 return Ok(Token::Hash {
308 is_id: is_would_start_ident,
309 value: ident_sequence.0,
310 raw: ident_sequence.1,
311 });
312 }
313
314 Ok(Token::Delim { value: '#' })
315 }
316 // U+0027 APOSTROPHE (')
317 // Consume a string token and return it.
318 Some('\'') => self.read_str(None),
319 // U+0028 LEFT PARENTHESIS (()
320 // Return a <(-token>.
321 Some('(') => Ok(tok!("(")),
322 // U+0029 RIGHT PARENTHESIS ())
323 // Return a <)-token>.
324 Some(')') => Ok(tok!(")")),
325 // U+002B PLUS SIGN (+)
326 Some('+') => {
327 // If the input stream starts with a number, reconsume the current input code
328 // point, consume a numeric token and return it.
329 if self.would_start_number(None, None, None) {
330 self.reconsume();
331
332 return self.read_numeric();
333 }
334
335 // Otherwise, return a <delim-token> with its value set to the current input
336 // code point.
337 Ok(tok!("+"))
338 }
339 // U+002C COMMA (,)
340 // Return a <comma-token>.
341 Some(',') => Ok(tok!(",")),
342 // U+002D HYPHEN-MINUS (-)
343 Some('-') => {
344 // If the input stream starts with a number, reconsume the current input code
345 // point, consume a numeric token, and return it.
346 if self.would_start_number(None, None, None) {
347 self.reconsume();
348
349 return self.read_numeric();
350 }
351 // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E
352 // GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
353 else if self.next() == Some('-') && self.next_next() == Some('>') {
354 self.consume();
355 self.consume();
356
357 return Ok(Token::CDC);
358 }
359 // Otherwise, if the input stream starts with an identifier, reconsume the current
360 // input code point, consume an ident-like token, and return it.
361 else if self.would_start_ident(None, None, None) {
362 self.reconsume();
363
364 return self.read_ident_like();
365 }
366
367 // Otherwise, return a <delim-token> with its value set to the current input
368 // code point.
369 Ok(tok!("-"))
370 }
371 // U+002E FULL STOP (.)
372 Some('.') => {
373 // If the input stream starts with a number, reconsume the current input code
374 // point, consume a numeric token, and return it.
375 if self.would_start_number(None, None, None) {
376 self.reconsume();
377
378 return self.read_numeric();
379 }
380
381 // Otherwise, return a <delim-token> with its value set to the current input
382 // code point.
383 Ok(tok!("."))
384 }
385 // U+003A COLON (:)
386 // Return a <colon-token>.
387 Some(':') => Ok(tok!(":")),
388 // U+003B SEMICOLON (;)
389 // Return a <semicolon-token>.
390 Some(';') => Ok(tok!(";")),
391 // U+003C LESS-THAN SIGN (<)
392 Some('<') => {
393 // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D
394 // HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), consume them and return a
395 // <CDO-token>.
396 if self.next() == Some('!')
397 && self.next_next() == Some('-')
398 && self.next_next_next() == Some('-')
399 {
400 self.consume(); // !
401 self.consume(); // -
402 self.consume(); // -
403
404 return Ok(tok!("<!--"));
405 }
406
407 // Otherwise, return a <delim-token> with its value set to the current input
408 // code point.
409 Ok(tok!("<"))
410 }
411 // U+0040 COMMERCIAL AT (@)
412 Some('@') => {
413 let first = self.next();
414 let second = self.next_next();
415 let third = self.next_next_next();
416
417 // If the next 3 input code points would start an identifier, consume a name,
418 // create an <at-keyword-token> with its value set to the returned value, and
419 // return it.
420 if self.would_start_ident(first, second, third) {
421 let ident_sequence = self.read_ident_sequence()?;
422
423 return Ok(Token::AtKeyword {
424 value: ident_sequence.0,
425 raw: ident_sequence.1,
426 });
427 }
428
429 // Otherwise, return a <delim-token> with its value set to the current input
430 // code point.
431 Ok(Token::Delim { value: '@' })
432 }
433 // U+005B LEFT SQUARE BRACKET ([)
434 // Return a <[-token>.
435 Some('[') => Ok(tok!("[")),
436 // U+005C REVERSE SOLIDUS (\)
437 Some('\\') => {
438 // If the input stream starts with a valid escape, reconsume the current input
439 // code point, consume an ident-like token, and return it.
440 if self.is_valid_escape(None, None) {
441 self.reconsume();
442
443 return self.read_ident_like();
444 }
445
446 // Otherwise, this is a parse error. Return a <delim-token> with its value set
447 // to the current input code point.
448 self.emit_error(ErrorKind::InvalidEscape);
449
450 Ok(Token::Delim { value: '\\' })
451 }
452 // U+005D RIGHT SQUARE BRACKET (])
453 // Return a <]-token>.
454 Some(']') => Ok(tok!("]")),
455 // U+007B LEFT CURLY BRACKET ({)
456 // Return a <{-token>.
457 Some('{') => Ok(tok!("{")),
458 // U+007D RIGHT CURLY BRACKET (})
459 // Return a <}-token>.
460 Some('}') => Ok(tok!("}")),
461 // digit
462 // Reconsume the current input code point, consume a numeric token, and return it.
463 Some('0'..='9') => {
464 self.reconsume();
465
466 self.read_numeric()
467 }
468 // name-start code point
469 // Reconsume the current input code point, consume an ident-like token, and return it.
470 Some(c) if is_name_start(c) => {
471 self.reconsume();
472
473 self.read_ident_like()
474 }
475 // EOF
476 // Return an <EOF-token>.
477 None => Err(ErrorKind::Eof),
478 // anything else
479 // Return a <delim-token> with its value set to the current input code point.
480 Some(c) => Ok(Token::Delim { value: c }),
481 }
482 }
483
484 // Consume comments.
485 // This section describes how to consume comments from a stream of code points.
486 // It returns nothing.
487 fn read_comments(&mut self) {
488 // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A
489 // ASTERISK (*), consume them and all following code points up to and including
490 // the first U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an
491 // EOF code point. Return to the start of this step.
492 // NOTE: We allow to parse line comments under the option.
493 if self.next() == Some('/') && self.next_next() == Some('*') {
494 let cmt_start = self.input.last_pos();
495
496 while self.next() == Some('/') && self.next_next() == Some('*') {
497 self.consume(); // '*'
498 self.consume(); // '/'
499
500 loop {
501 match self.consume() {
502 Some('*') if self.next() == Some('/') => {
503 self.consume(); // '/'
504
505 if self.comments.is_some() {
506 let last_pos = self.input.last_pos();
507 let text = unsafe {
508 // Safety: last_pos is a valid position
509 self.input.slice(cmt_start, last_pos)
510 };
511
512 self.pending_leading_comments.push(Comment {
513 kind: CommentKind::Block,
514 span: (self.start_pos, last_pos).into(),
515 text: self.atoms.atom(text),
516 });
517 }
518
519 break;
520 }
521 None => {
522 let span = Span::new(self.start_pos, self.input.last_pos());
523
524 self.errors
525 .borrow_mut()
526 .push(Error::new(span, ErrorKind::UnterminatedBlockComment));
527
528 return;
529 }
530 _ => {}
531 }
532 }
533 }
534 } else if self.config.allow_wrong_line_comments
535 && self.next() == Some('/')
536 && self.next_next() == Some('/')
537 {
538 while self.next() == Some('/') && self.next_next() == Some('/') {
539 self.consume(); // '/'
540 self.consume(); // '/'
541
542 let start_of_content = self.input.last_pos();
543
544 loop {
545 match self.consume() {
546 Some(c) if is_newline(c) => {
547 if self.comments.is_some() {
548 let last_pos = self.input.last_pos();
549 let text = unsafe {
550 // Safety: last_pos is a valid position
551 self.input.slice(start_of_content, last_pos)
552 };
553
554 self.pending_leading_comments.push(Comment {
555 kind: CommentKind::Line,
556 span: (self.start_pos, last_pos).into(),
557 text: self.atoms.atom(text),
558 });
559 }
560 break;
561 }
562 None => return,
563 _ => {}
564 }
565 }
566 }
567 }
568 }
569
570 // This section describes how to consume a numeric token from a stream of code
571 // points. It returns either a <number-token>, <percentage-token>, or
572 // <dimension-token>.
573 fn read_numeric(&mut self) -> LexResult<Token> {
574 // Consume a number and let number be the result.
575 let number = self.read_number()?;
576
577 let next_first = self.next();
578 let next_second = self.next_next();
579 let next_third = self.next_next_next();
580
581 // If the next 3 input code points would start an identifier, then:
582 if self.would_start_ident(next_first, next_second, next_third) {
583 // Swap logic to avoid create empty strings, because it doesn't make sense
584 //
585 // Consume a name. Set the <dimension-token>’s unit to the returned value.
586 let ident_sequence = self.read_ident_sequence()?;
587 // Create a <dimension-token> with the same value and type flag as number, and a
588 // unit set initially to the empty string.
589 let token = Box::new(DimensionToken {
590 value: number.0,
591 raw_value: number.1,
592 unit: ident_sequence.0,
593 raw_unit: ident_sequence.1,
594 type_flag: number.2,
595 });
596 let token = Token::Dimension { dimension: token };
597
598 // Return the <dimension-token>.
599 return Ok(token);
600 }
601 // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. Create
602 // a <percentage-token> with the same value as number, and return it.
603 else if next_first == Some('%') {
604 self.consume();
605
606 return Ok(Token::Percentage {
607 value: number.0,
608 raw: number.1,
609 });
610 }
611
612 // Otherwise, create a <number-token> with the same value and type flag as
613 // number, and return it.
614 Ok(Token::Number {
615 value: number.0,
616 raw: number.1,
617 type_flag: number.2,
618 })
619 }
620
621 // This section describes how to consume an ident-like token from a stream of
622 // code points. It returns an <ident-token>, <function-token>, <url-token>, or
623 // <bad-url-token>.
624 fn read_ident_like(&mut self) -> LexResult<Token> {
625 // Consume a name, and let string be the result.
626 let ident_sequence = self.read_ident_sequence()?;
627
628 // If string’s value is an ASCII case-insensitive match for "url", and the next
629 // input code point is U+0028 LEFT PARENTHESIS ((), consume it.
630 if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some('(') {
631 self.consume();
632
633 let start_whitespace = self.input.last_pos();
634
635 // While the next two input code points are whitespace, consume the next input
636 // code point.
637 let whitespaces = self.with_buf(|l, buf| {
638 while let (Some(next), Some(next_next)) = (l.next(), l.next_next()) {
639 if is_whitespace(next) && is_whitespace(next_next) {
640 l.consume();
641
642 buf.push(next);
643 } else {
644 break;
645 }
646 }
647
648 Ok(buf.to_string())
649 })?;
650
651 match self.next() {
652 // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027
653 // APOSTROPHE ('), or whitespace followed by U+0022 QUOTATION MARK (") or U+0027
654 // APOSTROPHE ('), then create a <function-token> with its value set to string and
655 // return it.
656 Some(c)
657 if is_whitespace(c)
658 && (self.next_next() == Some('"') || self.next_next() == Some('\'')) =>
659 {
660 // Override last position because we consumed whitespaces, but they
661 // should not be part of token
662 self.override_pos = Some(start_whitespace);
663
664 return Ok(Token::Function {
665 value: ident_sequence.0,
666 raw: ident_sequence.1,
667 });
668 }
669 Some('"' | '\'') => {
670 return Ok(Token::Function {
671 value: ident_sequence.0,
672 raw: ident_sequence.1,
673 });
674 }
675 // Otherwise, consume a url token, and return it.
676 _ => {
677 return self.read_url(ident_sequence, whitespaces);
678 }
679 }
680 }
681 // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
682 // Create a <function-token> with its value set to string and return it.
683 else if self.next() == Some('(') {
684 self.consume();
685
686 return Ok(Token::Function {
687 value: ident_sequence.0,
688 raw: ident_sequence.1,
689 });
690 }
691
692 // Otherwise, create an <ident-token> with its value set to string and return
693 // it.
694 Ok(Token::Ident {
695 value: ident_sequence.0,
696 raw: ident_sequence.1,
697 })
698 }
699
700 // This section describes how to consume a string token from a stream of code
701 // points. It returns either a <string-token> or <bad-string-token>.
702 fn read_str(&mut self, maybe_ending_code_point: Option<char>) -> LexResult<Token> {
703 self.with_buf_and_raw_buf(|l, buf, raw| {
704 // This algorithm may be called with an ending code point, which denotes the
705 // code point that ends the string. If an ending code point is not specified,
706 // the current input code point is used.
707 let ending_code_point = maybe_ending_code_point.or_else(|| l.cur());
708
709 // Initially create a <string-token> with its value set to the empty string.
710 // Done above
711
712 raw.push(ending_code_point.unwrap());
713
714 // Repeatedly consume the next input code point from the stream:
715 loop {
716 match l.consume() {
717 // ending code point
718 // Return the <string-token>.
719 Some(c) if c == ending_code_point.unwrap() => {
720 raw.push(c);
721
722 break;
723 }
724
725 // EOF
726 // This is a parse error. Return the <string-token>.
727 None => {
728 l.emit_error(ErrorKind::UnterminatedString);
729
730 return Ok(Token::String {
731 value: l.atoms.atom(&**buf),
732 raw: l.atoms.atom(&**raw),
733 });
734 }
735
736 // Newline
737 // This is a parse error. Reconsume the current input code point, create a
738 // <bad-string-token>, and return it.
739 Some(c) if is_newline(c) => {
740 l.emit_error(ErrorKind::NewlineInString);
741 l.reconsume();
742
743 return Ok(Token::BadString {
744 raw: l.atoms.atom(&**raw),
745 });
746 }
747
748 // U+005C REVERSE SOLIDUS (\)
749 Some(c) if c == '\\' => {
750 let next = l.next();
751
752 // If the next input code point is EOF, do nothing.
753 if l.next().is_none() {
754 continue;
755 }
756 // Otherwise, if the next input code point is a newline, consume it.
757 else if l.next().is_some() && is_newline(l.next().unwrap()) {
758 l.consume();
759
760 raw.push(c);
761 raw.push(next.unwrap());
762 }
763 // Otherwise, (the stream starts with a valid escape) consume an escaped
764 // code point and append the returned code point to
765 // the <string-token>’s value.
766 else if l.is_valid_escape(None, None) {
767 let escape = l.read_escape()?;
768
769 buf.push(escape.0);
770 raw.push(c);
771 raw.push_str(&escape.1);
772 }
773 }
774
775 // Anything else
776 // Append the current input code point to the <string-token>’s value.
777 Some(c) => {
778 buf.push(c);
779 raw.push(c);
780 }
781 }
782 }
783
784 Ok(Token::String {
785 value: l.atoms.atom(&**buf),
786 raw: l.atoms.atom(&**raw),
787 })
788 })
789 }
790
791 // This section describes how to consume a url token from a stream of code
792 // points. It returns either a <url-token> or a <bad-url-token>.
793 fn read_url(&mut self, name: (Atom, Atom), before: String) -> LexResult<Token> {
794 // Initially create a <url-token> with its value set to the empty string.
795 self.with_buf_and_raw_buf(|l, out, raw| {
796 raw.push_str(&before);
797
798 // Consume as much whitespace as possible.
799 while let Some(c) = l.next() {
800 if is_whitespace(c) {
801 l.consume();
802
803 raw.push(c);
804 } else {
805 break;
806 }
807 }
808
809 // Repeatedly consume the next input code point from the stream:
810 loop {
811 match l.consume() {
812 // U+0029 RIGHT PARENTHESIS ())
813 // Return the <url-token>.
814 Some(')') => {
815 return Ok(Token::Url {
816 value: l.atoms.atom(&**out),
817 raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
818 });
819 }
820
821 // EOF
822 // This is a parse error. Return the <url-token>.
823 None => {
824 l.emit_error(ErrorKind::UnterminatedUrl);
825
826 return Ok(Token::Url {
827 value: l.atoms.atom(&**out),
828 raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
829 });
830 }
831
832 // whitespace
833 Some(c) if is_whitespace(c) => {
834 // Consume as much whitespace as possible.
835 let whitespaces: String = l.with_sub_buf(|l, buf| {
836 buf.push(c);
837
838 while let Some(c) = l.next() {
839 if is_whitespace(c) {
840 l.consume();
841
842 buf.push(c);
843 } else {
844 break;
845 }
846 }
847
848 Ok(buf.to_string())
849 })?;
850
851 // if the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
852 // consume it and return the <url-token> (if EOF was
853 // encountered, this is a parse error);
854 match l.next() {
855 Some(')') => {
856 l.consume();
857
858 raw.push_str(&whitespaces);
859
860 return Ok(Token::Url {
861 value: l.atoms.atom(&**out),
862 raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
863 });
864 }
865 None => {
866 l.emit_error(ErrorKind::UnterminatedUrl);
867
868 raw.push_str(&whitespaces);
869
870 return Ok(Token::Url {
871 value: l.atoms.atom(&**out),
872 raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
873 });
874 }
875 _ => {}
876 }
877
878 // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
879 // and return it.
880 raw.push_str(&whitespaces);
881
882 let remnants = l.read_bad_url_remnants()?;
883
884 raw.push_str(&remnants);
885
886 return Ok(Token::BadUrl {
887 raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
888 });
889 }
890
891 // U+0022 QUOTATION MARK (")
892 // U+0027 APOSTROPHE (')
893 // U+0028 LEFT PARENTHESIS (()
894 // non-printable code point
895 // This is a parse error. Consume the remnants of a bad url, create a
896 // <bad-url-token>, and return it.
897 Some(c) if c == '"' || c == '\'' || c == '(' || is_non_printable(c) => {
898 l.emit_error(ErrorKind::UnexpectedCharInUrl);
899
900 let remnants = l.read_bad_url_remnants()?;
901
902 raw.push(c);
903 raw.push_str(&remnants);
904
905 return Ok(Token::BadUrl {
906 raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
907 });
908 }
909
910 // U+005C REVERSE SOLIDUS (\)
911 Some(c) if c == '\\' => {
912 // If the stream starts with a valid escape, consume an escaped code point
913 // and append the returned code point to the
914 // <url-token>’s value.
915 if l.is_valid_escape(None, None) {
916 let escaped = l.read_escape()?;
917
918 out.push(escaped.0);
919 raw.push(c);
920 raw.push_str(&escaped.1);
921 }
922 // Otherwise, this is a parse error. Consume the remnants of a bad url,
923 // create a <bad-url-token>, and return it.
924 else {
925 l.emit_error(ErrorKind::InvalidEscape);
926
927 let remnants = l.read_bad_url_remnants()?;
928
929 raw.push(c);
930 raw.push_str(&remnants);
931
932 return Ok(Token::BadUrl {
933 raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
934 });
935 }
936 }
937
938 // anything else
939 // Append the current input code point to the <url-token>’s value.
940 Some(c) => {
941 out.push(c);
942 raw.push(c);
943 }
944 }
945 }
946 })
947 }
948
949 // Consume an escaped code point
950 // This section describes how to consume an escaped code point. It assumes that
951 // the U+005C REVERSE SOLIDUS (\) has already been consumed and that the next
952 // input code point has already been verified to be part of a valid escape. It
953 // will return a code point.
954 fn read_escape(&mut self) -> LexResult<(char, String)> {
955 self.with_sub_buf(|l, buf| {
956 // Consume the next input code point.
957 match l.consume() {
958 // hex digit
959 Some(c) if is_hex_digit(c) => {
960 let mut hex = c.to_digit(16).unwrap();
961
962 buf.push(c);
963
964 // Consume as many hex digits as possible, but no more than 5.
965 // Note that this means 1-6 hex digits have been consumed in total.
966 for _ in 0..5 {
967 let next = l.next();
968 let digit = match next.and_then(|c| c.to_digit(16)) {
969 Some(v) => v,
970 None => break,
971 };
972
973 l.consume();
974
975 buf.push(next.unwrap());
976 hex = hex * 16 + digit;
977 }
978
979 // If the next input code point is whitespace, consume it as well.
980 let next = l.next();
981
982 if let Some(next) = next {
983 if is_whitespace(next) {
984 l.consume();
985
986 buf.push(next);
987 }
988 }
989
990 // Interpret the hex digits as a hexadecimal number. If this number is zero, or
991 // is for a surrogate, or is greater than the maximum allowed code point, return
992 // U+FFFD REPLACEMENT CHARACTER (�).
993 let hex = match hex {
994 // If this number is zero
995 0 => REPLACEMENT_CHARACTER,
996 // or is for a surrogate
997 55296..=57343 => REPLACEMENT_CHARACTER,
998 // or is greater than the maximum allowed code point
999 1114112.. => REPLACEMENT_CHARACTER,
1000 _ => char::from_u32(hex).unwrap_or(REPLACEMENT_CHARACTER),
1001 };
1002
1003 // Otherwise, return the code point with that value.
1004 Ok((hex, (&**buf).into()))
1005 }
1006 // EOF
1007 // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
1008 None => {
1009 l.emit_error(ErrorKind::InvalidEscape);
1010
1011 let value = REPLACEMENT_CHARACTER;
1012
1013 buf.push(value);
1014
1015 Ok((value, (&**buf).into()))
1016 }
1017 // anything else
1018 // Return the current input code point.
1019 Some(c) => {
1020 buf.push(c);
1021
1022 Ok((c, (&**buf).into()))
1023 }
1024 }
1025 })
1026 }
1027
1028 // Check if two code points are a valid escape
1029 // This section describes how to check if two code points are a valid escape.
1030 // The algorithm described here can be called explicitly with two code points,
1031 // or can be called with the input stream itself. In the latter case, the two
1032 // code points in question are the current input code point and the next input
1033 // code point, in that order.
1034 fn is_valid_escape(&mut self, maybe_first: Option<char>, maybe_second: Option<char>) -> bool {
1035 // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
1036 if maybe_first.or_else(|| self.cur()) != Some('\\') {
1037 return false;
1038 }
1039
1040 match maybe_second.or_else(|| self.next()) {
1041 // Otherwise, if the second code point is a newline, return false.
1042 Some(second) => !is_newline(second),
1043 // Otherwise, return true.
1044 None => false,
1045 }
1046 }
1047
1048 // Check if three code points would start an identifier
1049 // This section describes how to check if three code points would start an
1050 // identifier. The algorithm described here can be called explicitly with three
1051 // code points, or can be called with the input stream itself. In the latter
1052 // case, the three code points in question are the current input code point and
1053 // the next two input code points, in that order.
1054 fn would_start_ident(
1055 &mut self,
1056 maybe_first: Option<char>,
1057 maybe_second: Option<char>,
1058 maybe_third: Option<char>,
1059 ) -> bool {
1060 // Look at the first code point:
1061 let first = maybe_first.or_else(|| self.cur());
1062
1063 match first {
1064 // U+002D HYPHEN-MINUS
1065 Some('-') => {
1066 let second = maybe_second.or_else(|| self.next());
1067
1068 match second {
1069 // If the second code point is a name-start code point
1070 // return true.
1071 Some(c) if is_name_start(c) => true,
1072 // or a U+002D HYPHEN-MINUS,
1073 // return true.
1074 Some('-') => true,
1075 // or the second and third code points are a valid escape
1076 // return true.
1077 Some(_) => {
1078 let third = maybe_third.or_else(|| self.next_next());
1079
1080 self.is_valid_escape(second, third)
1081 }
1082 // Otherwise, return false.
1083 _ => false,
1084 }
1085 }
1086 // name-start code point
1087 // Return true.
1088 Some(c) if is_name_start(c) => true,
1089 // U+005C REVERSE SOLIDUS (\)
1090 // If the first and second code points are a valid escape, return true. Otherwise,
1091 // return false.
1092 Some('\\') => {
1093 let second = maybe_second.or_else(|| self.next());
1094
1095 self.is_valid_escape(first, second)
1096 }
1097 _ => false,
1098 }
1099 }
1100
1101 // Check if three code points would start a number
1102 // This section describes how to check if three code points would start a
1103 // number. The algorithm described here can be called explicitly with three code
1104 // points, or can be called with the input stream itself. In the latter case,
1105 // the three code points in question are the current input code point and the
1106 // next two input code points, in that order.
1107 #[allow(clippy::needless_return)]
1108 fn would_start_number(
1109 &mut self,
1110 maybe_first: Option<char>,
1111 maybe_second: Option<char>,
1112 maybe_third: Option<char>,
1113 ) -> bool {
1114 // Look at the first code point:
1115 let first = maybe_first.or_else(|| self.cur());
1116
1117 match first {
1118 // U+002B PLUS SIGN (+)
1119 // U+002D HYPHEN-MINUS (-)
1120 Some('+') | Some('-') => {
1121 match maybe_second.or_else(|| self.next()) {
1122 // If the second code point is a digit, return true.
1123 Some(second) if second.is_ascii_digit() => return true,
1124 // Otherwise, if the second code point is a U+002E FULL STOP (.) and the
1125 // third code point is a digit, return true.
1126 Some('.') => {
1127 if let Some(third) = maybe_third.or_else(|| self.next_next()) {
1128 if third.is_ascii_digit() {
1129 return true;
1130 }
1131 }
1132
1133 return false;
1134 }
1135 // Otherwise, return false.
1136 _ => return false,
1137 };
1138 }
1139 // U+002E FULL STOP (.)
1140 Some('.') => {
1141 // If the second code point is a digit, return true.
1142 if let Some(second) = self.next() {
1143 if second.is_ascii_digit() {
1144 return true;
1145 }
1146 }
1147
1148 // Otherwise, return false.
1149 false
1150 }
1151 // digit
1152 // Return true.
1153 Some(first) if first.is_ascii_digit() => true,
1154 // anything else
1155 // Return false.
1156 _ => false,
1157 }
1158 }
1159
1160 // Consume an ident sequence
1161 // This section describes how to consume an ident sequence from a stream of code
1162 // points. It returns a string containing the largest name that can be formed
1163 // from adjacent code points in the stream, starting from the first.
1164 fn read_ident_sequence(&mut self) -> LexResult<(Atom, Atom)> {
1165 self.with_buf_and_raw_buf(|l, buf, raw| {
1166 // Let result initially be an empty string.
1167 // Done above
1168
1169 // Repeatedly consume the next input code point from the stream:
1170 loop {
1171 match l.consume() {
1172 // name code point
1173 // Append the code point to result.
1174 Some(c) if is_name(c) => {
1175 buf.push(c);
1176 raw.push(c);
1177 }
1178 // the stream starts with a valid escape
1179 // Consume an escaped code point. Append the returned code point to result.
1180 Some(c) if l.is_valid_escape(None, None) => {
1181 let escaped = l.read_escape()?;
1182
1183 buf.push(escaped.0);
1184 raw.push(c);
1185 raw.push_str(&escaped.1);
1186 }
1187 // anything else
1188 // Reconsume the current input code point. Return result.
1189 _ => {
1190 l.reconsume();
1191
1192 break;
1193 }
1194 }
1195 }
1196
1197 Ok((l.atoms.atom(&**buf), l.atoms.atom(&**raw)))
1198 })
1199 }
1200
1201 // This section describes how to consume a number from a stream of code points.
1202 // It returns a numeric value, and a type which is either "integer" or "number".
1203 fn read_number(&mut self) -> LexResult<(f64, Atom, NumberType)> {
1204 let parsed: (Atom, NumberType) = self.with_buf(|l, out| {
1205 // Initially set type to "integer". Let repr be the empty string.
1206 let mut type_flag = NumberType::Integer;
1207
1208 // If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
1209 // (-), consume it and append it to repr.
1210 let next = l.next();
1211
1212 if next == Some('+') || next == Some('-') {
1213 l.consume();
1214
1215 out.push(next.unwrap());
1216 }
1217
1218 // While the next input code point is a digit, consume it and append it to repr.
1219 while let Some(c) = l.next() {
1220 if c.is_ascii_digit() {
1221 l.consume();
1222
1223 out.push(c);
1224 } else {
1225 break;
1226 }
1227 }
1228
1229 // If the next 2 input code points are U+002E FULL STOP (.) followed by a digit,
1230 // then:
1231 let next = l.next();
1232
1233 if next == Some('.') {
1234 if let Some(n) = l.next_next() {
1235 if n.is_ascii_digit() {
1236 // Consume them.
1237 l.consume();
1238 l.consume();
1239
1240 // Append them to repr.
1241 out.push(next.unwrap());
1242 out.push(n);
1243
1244 // Set type to "number".
1245 type_flag = NumberType::Number;
1246
1247 // While the next input code point is a digit, consume it and append it to
1248 // repr.
1249 while let Some(c) = l.next() {
1250 if c.is_ascii_digit() {
1251 l.consume();
1252
1253 out.push(c);
1254 } else {
1255 break;
1256 }
1257 }
1258 }
1259 }
1260 }
1261
1262 // If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or
1263 // U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS
1264 // (-) or U+002B PLUS SIGN (+), followed by a digit, then:
1265 let next = l.next();
1266
1267 if next == Some('E') || next == Some('e') {
1268 let next_next = l.next_next();
1269 let next_next_next = l.next_next_next();
1270
1271 if (next_next == Some('-')
1272 || next_next == Some('+')
1273 && next_next_next.is_some()
1274 && next_next_next.unwrap().is_ascii_digit())
1275 || next_next.is_some() && next_next.unwrap().is_ascii_digit()
1276 {
1277 // Consume them.
1278 l.consume();
1279 l.consume();
1280
1281 // Append them to repr.
1282 out.push(next.unwrap());
1283 out.push(next_next.unwrap());
1284
1285 // Set type to "number".
1286 type_flag = NumberType::Number;
1287
1288 // While the next input code point is a digit, consume it and append it
1289 // to repr.
1290 while let Some(c) = l.next() {
1291 if c.is_ascii_digit() {
1292 l.consume();
1293
1294 out.push(c);
1295 } else {
1296 break;
1297 }
1298 }
1299 }
1300 }
1301
1302 // Return value and type.
1303 Ok((l.atoms.atom(&**out), type_flag))
1304 })?;
1305
1306 // Convert repr to a number, and set the value to the returned value.
1307 let value = lexical::parse(&*parsed.0).unwrap_or_else(|err| {
1308 unreachable!("failed to parse `{}` using lexical: {:?}", parsed.0, err)
1309 });
1310
1311 Ok((value, parsed.0, parsed.1))
1312 }
1313
1314 // Consume the remnants of a bad url
1315 // This section describes how to consume the remnants of a bad url from a stream
1316 // of code points, "cleaning up" after the tokenizer realizes that it’s in the
1317 // middle of a <bad-url-token> rather than a <url-token>. It returns nothing;
1318 // its sole use is to consume enough of the input stream to reach a recovery
1319 // point where normal tokenizing can resume. But for recovery purpose we return
1320 // bad URL remnants.
1321 fn read_bad_url_remnants(&mut self) -> LexResult<String> {
1322 self.with_sub_buf(|l, raw| {
1323 // Repeatedly consume the next input code point from the stream:
1324 loop {
1325 match l.consume() {
1326 // U+0029 RIGHT PARENTHESIS ())
1327 // EOF
1328 // Return.
1329 Some(c @ ')') => {
1330 raw.push(c);
1331
1332 break;
1333 }
1334 None => {
1335 break;
1336 }
1337 // the input stream starts with a valid escape
1338 Some(c) if l.is_valid_escape(None, None) => {
1339 // Consume an escaped code point. This allows an escaped right parenthesis
1340 // ("\)") to be encountered without ending the <bad-url-token>.
1341 let escaped = l.read_escape()?;
1342
1343 raw.push(c);
1344 raw.push_str(&escaped.1);
1345 }
1346 // anything else
1347 // Do nothing.
1348 Some(c) => {
1349 raw.push(c);
1350 }
1351 }
1352 }
1353
1354 Ok((&**raw).into())
1355 })
1356 }
1357}
1358
/// Returns `true` if `c` is an ASCII decimal digit (`0` through `9`).
#[inline(always)]
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
1363
1364#[inline(always)]
1365fn is_hex_digit(c: char) -> bool {
1366 match c {
1367 c if is_digit(c) => true,
1368 'A'..='F' => true,
1369 'a'..='f' => true,
1370 _ => false,
1371 }
1372}
1373
/// Returns `true` if `c` is an ASCII uppercase letter (`A` through `Z`).
#[inline(always)]
fn is_uppercase_letter(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
1378
/// Returns `true` if `c` is an ASCII lowercase letter (`a` through `z`).
#[inline(always)]
fn is_lowercase_letter(c: char) -> bool {
    matches!(c, 'a'..='z')
}
1383
1384#[inline(always)]
1385fn is_letter(c: char) -> bool {
1386 is_uppercase_letter(c) || is_lowercase_letter(c)
1387}
1388
/// Returns `true` if `c` lies outside the ASCII range, i.e. its code point is
/// U+0080 or greater.
#[inline(always)]
fn is_non_ascii(c: char) -> bool {
    !c.is_ascii()
}
1393
1394#[inline(always)]
1395fn is_name_start(c: char) -> bool {
1396 matches!(c, c if is_letter(c) || is_non_ascii(c) || c == '_' || c == '\x00')
1397}
1398
1399#[inline(always)]
1400fn is_name(c: char) -> bool {
1401 is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == '-')
1402}
1403
/// Returns `true` if `c` is a non-printable code point: U+0000 through U+0008,
/// U+000B, U+000E through U+001F, or U+007F.
#[inline(always)]
fn is_non_printable(c: char) -> bool {
    let code = u32::from(c);

    code <= 0x08 || code == 0x0B || (0x0E..=0x1F).contains(&code) || code == 0x7F
}
1408
/// Returns `true` if `c` is a newline code point: LINE FEED, CARRIAGE RETURN,
/// or FORM FEED.
#[inline(always)]
fn is_newline(c: char) -> bool {
    c == '\n' || c == '\r' || c == '\x0C'
}
1413
1414#[inline(always)]
1415fn is_whitespace(c: char) -> bool {
1416 matches!(c, c if c == ' ' || c == '\t' || is_newline(c))
1417}