swc_css_parser/lexer/mod.rs
1use std::{borrow::Cow, cell::RefCell, char::REPLACEMENT_CHARACTER, rc::Rc};
2
3use swc_atoms::{Atom, AtomStoreCell};
4use swc_common::{
5 comments::{Comment, CommentKind, Comments},
6 input::Input,
7 util::take::Take,
8 BytePos, Span,
9};
10use swc_css_ast::{
11 matches_eq_ignore_ascii_case, DimensionToken, NumberType, Token, TokenAndSpan, UrlKeyValue,
12};
13
14use crate::{
15 error::{Error, ErrorKind},
16 parser::{input::ParserInput, ParserConfig},
17};
18
/// Result type returned by the lexer's internal routines; the error side is a
/// bare [`ErrorKind`] without span information.
pub(crate) type LexResult<T> = Result<T, ErrorKind>;
20
/// CSS lexer that turns an [`Input`] into a stream of tokens.
///
/// The scratch buffers, the error list and the atom store are held behind
/// `Rc`, so values produced by the derived `Clone` share them with the
/// original.
#[derive(Clone)]
pub struct Lexer<'a, I>
where
    I: Input<'a>,
{
    /// Optional comment sink; `consume_token` hands pending leading comments
    /// to it.
    comments: Option<&'a dyn Comments>,
    /// Comments collected by `read_comments` that have not been handed to
    /// `comments` yet.
    pending_leading_comments: Vec<Comment>,
    /// The underlying source of code points.
    input: I,
    /// The code point most recently returned by `consume`.
    cur: Option<char>,
    /// Input position recorded by `consume` just before it advanced;
    /// `reconsume` rewinds to it.
    cur_pos: BytePos,
    /// Input position at which the token currently being lexed starts
    /// (set at the top of `consume_token`).
    start_pos: BytePos,
    /// When set, used as the end of the next emitted token's span instead of
    /// the input's last position.
    override_pos: Option<BytePos>,
    /// Parser configuration; the lexer reads `allow_wrong_line_comments`
    /// from it.
    config: ParserConfig,
    /// Scratch buffer lent out by `with_buf` / `with_buf_and_raw_buf`.
    buf: Rc<RefCell<String>>,
    /// Scratch buffer for raw source text, lent out by `with_buf_and_raw_buf`.
    raw_buf: Rc<RefCell<String>>,
    /// Secondary scratch buffer lent out by `with_sub_buf`.
    sub_buf: Rc<RefCell<String>>,
    /// Recoverable errors recorded while lexing; drained by `take_errors`.
    errors: Rc<RefCell<Vec<Error>>>,
    /// Atom store used to create the `Atom`s placed in tokens and comments.
    atoms: Rc<AtomStoreCell>,
}
41
42impl<'a, I> Lexer<'a, I>
43where
44 I: Input<'a>,
45{
46 pub fn new(input: I, comments: Option<&'a dyn Comments>, config: ParserConfig) -> Self {
47 let start_pos = input.last_pos();
48
49 Lexer {
50 comments,
51 input,
52 cur: None,
53 cur_pos: start_pos,
54 start_pos,
55 override_pos: None,
56 config,
57 buf: Rc::new(RefCell::new(String::with_capacity(256))),
58 raw_buf: Rc::new(RefCell::new(String::with_capacity(256))),
59 sub_buf: Rc::new(RefCell::new(String::with_capacity(32))),
60 errors: Default::default(),
61 pending_leading_comments: Default::default(),
62 atoms: Default::default(),
63 }
64 }
65
66 fn with_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
67 where
68 F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
69 {
70 let b = self.buf.clone();
71 let mut buf = b.borrow_mut();
72
73 buf.clear();
74
75 op(self, &mut buf)
76 }
77
78 fn with_sub_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
79 where
80 F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String) -> LexResult<Ret>,
81 {
82 let b = self.sub_buf.clone();
83 let mut sub_buf = b.borrow_mut();
84
85 sub_buf.clear();
86
87 op(self, &mut sub_buf)
88 }
89
90 fn with_buf_and_raw_buf<F, Ret>(&mut self, op: F) -> LexResult<Ret>
91 where
92 F: for<'any> FnOnce(&mut Lexer<'a, I>, &mut String, &mut String) -> LexResult<Ret>,
93 {
94 let b = self.buf.clone();
95 let r = self.raw_buf.clone();
96 let mut buf = b.borrow_mut();
97 let mut raw = r.borrow_mut();
98
99 buf.clear();
100 raw.clear();
101
102 op(self, &mut buf, &mut raw)
103 }
104}
105
106impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
107 type Item = TokenAndSpan;
108
109 fn next(&mut self) -> Option<Self::Item> {
110 let token = self.consume_token();
111
112 match token {
113 Ok(token) => {
114 let end = self
115 .override_pos
116 .take()
117 .unwrap_or_else(|| self.input.last_pos());
118 let span = Span::new(self.start_pos, end);
119
120 let token_and_span = TokenAndSpan { span, token };
121
122 return Some(token_and_span);
123 }
124 Err(..) => {
125 return None;
126 }
127 }
128 }
129}
130
/// Snapshot of the lexer that `ParserInput::reset` can rewind to.
#[derive(Debug, Clone, Copy)]
pub struct LexerState {
    /// Input position captured when the snapshot was taken.
    pos: BytePos,
}
135
136impl<'a, I> ParserInput for Lexer<'a, I>
137where
138 I: Input<'a>,
139{
140 type State = LexerState;
141
142 fn start_pos(&mut self) -> BytePos {
143 self.input.last_pos()
144 }
145
146 fn state(&mut self) -> Self::State {
147 LexerState {
148 pos: self.input.last_pos(),
149 }
150 }
151
152 fn reset(&mut self, state: &Self::State) {
153 unsafe {
154 // Safety: state.pos is created from a valid position.
155 self.input.reset_to(state.pos);
156 }
157 }
158
159 fn take_errors(&mut self) -> Vec<Error> {
160 self.errors.take()
161 }
162
163 fn skip_ws(&mut self) -> Option<BytePos> {
164 self.read_comments();
165
166 if let Some(c) = self.input.cur() {
167 if !is_whitespace(c) {
168 return None;
169 }
170 }
171
172 loop {
173 self.read_comments();
174
175 if self.input.uncons_while(is_whitespace).is_empty() {
176 break;
177 }
178 }
179
180 Some(self.input.last_pos())
181 }
182
183 fn atom(&self, s: Cow<str>) -> Atom {
184 self.atoms.atom(s)
185 }
186}
187
188impl<'a, I> Lexer<'a, I>
189where
190 I: Input<'a>,
191{
/// Returns the current input code point, i.e. the one most recently returned
/// by `consume` (`None` before the first `consume` and after `consume` hit
/// the end of input).
#[inline(always)]
fn cur(&mut self) -> Option<char> {
    self.cur
}
196
/// Returns the next input code point (the one `consume` would return) without
/// advancing the lexer's recorded state.
#[inline(always)]
fn next(&mut self) -> Option<char> {
    self.input.cur()
}
201
/// Returns the input's `peek()` result; callers in this file use it as the
/// code point that follows `next()`.
#[inline(always)]
fn next_next(&mut self) -> Option<char> {
    self.input.peek()
}
206
/// Returns the input's `peek_ahead()` result; callers in this file use it as
/// the third code point of lookahead, after `next()` and `next_next()`.
#[inline(always)]
fn next_next_next(&mut self) -> Option<char> {
    self.input.peek_ahead()
}
211
212 #[inline(always)]
213 fn consume(&mut self) -> Option<char> {
214 let cur = self.input.cur();
215
216 self.cur = cur;
217 self.cur_pos = self.input.last_pos();
218
219 if cur.is_some() {
220 unsafe {
221 // Safety: cur is Some
222 self.input.bump();
223 }
224 }
225
226 cur
227 }
228
/// Rewinds the input to the position recorded by the last `consume`, so the
/// code point it returned will be read again.
#[inline(always)]
fn reconsume(&mut self) {
    unsafe {
        // Safety: self.cur_pos is a position generated by self.input, meaning it is
        // valid.
        self.input.reset_to(self.cur_pos);
    }
}
237
238 #[cold]
239 fn emit_error(&mut self, kind: ErrorKind) {
240 self.errors.borrow_mut().push(Error::new(
241 Span::new(self.cur_pos, self.input.last_pos()),
242 kind,
243 ));
244 }
245
/// Consumes a single token from the input.
///
/// Comments in front of the token are read first; `start_pos` is then set to
/// the token's start and any pending comments are handed to the comment sink
/// as leading comments at that position. The match below dispatches on the
/// first code point of the token.
///
/// Returns `Err(ErrorKind::Eof)` when the input is exhausted; errors from the
/// `read_*` helpers are propagated.
fn consume_token(&mut self) -> LexResult<Token> {
    self.read_comments();
    self.start_pos = self.input.last_pos();

    if let Some(comments) = self.comments {
        if !self.pending_leading_comments.is_empty() {
            comments.add_leading_comments(self.start_pos, self.pending_leading_comments.take());
        }
    }

    // Consume the next input code point.
    match self.consume() {
        // whitespace
        // Consume as much whitespace as possible. Return a <whitespace-token>.
        Some(c) if is_whitespace(c) => self.with_buf(|l, buf| {
            buf.push(c);

            loop {
                let c = l.next();

                match c {
                    Some(c) if is_whitespace(c) => {
                        l.consume();

                        buf.push(c);
                    }
                    _ => {
                        break;
                    }
                }
            }

            return Ok(Token::WhiteSpace {
                value: l.atoms.atom(&**buf),
            });
        }),
        // U+0022 QUOTATION MARK (")
        // Consume a string token and return it.
        Some('"') => self.read_str(None),
        // U+0023 NUMBER SIGN (#)
        Some('#') => {
            let first = self.next();
            let second = self.next_next();

            // If the next input code point is a name code point or the next two input code
            // points are a valid escape, then:
            if (first.is_some() && is_name(first.unwrap()))
                || self.is_valid_escape(first, second)
            {
                // Create a <hash-token>.

                // If the next 3 input code points would start an identifier, set the
                // <hash-token>’s type flag to "id".
                let third = self.next_next_next();
                let is_would_start_ident = self.would_start_ident(first, second, third);

                // Consume an ident sequence, and set the <hash-token>’s value to the returned
                // string.
                let ident_sequence = self.read_ident_sequence()?;

                // Return the <hash-token>.
                return Ok(Token::Hash {
                    is_id: is_would_start_ident,
                    value: ident_sequence.0,
                    raw: ident_sequence.1,
                });
            }

            Ok(Token::Delim { value: '#' })
        }
        // U+0027 APOSTROPHE (')
        // Consume a string token and return it.
        Some('\'') => self.read_str(None),
        // U+0028 LEFT PARENTHESIS (()
        // Return a <(-token>.
        Some('(') => Ok(tok!("(")),
        // U+0029 RIGHT PARENTHESIS ())
        // Return a <)-token>.
        Some(')') => Ok(tok!(")")),
        // U+002B PLUS SIGN (+)
        Some('+') => {
            // If the input stream starts with a number, reconsume the current input code
            // point, consume a numeric token and return it.
            if self.would_start_number(None, None, None) {
                self.reconsume();

                return self.read_numeric();
            }

            // Otherwise, return a <delim-token> with its value set to the current input
            // code point.
            Ok(tok!("+"))
        }
        // U+002C COMMA (,)
        // Return a <comma-token>.
        Some(',') => Ok(tok!(",")),
        // U+002D HYPHEN-MINUS (-)
        Some('-') => {
            // If the input stream starts with a number, reconsume the current input code
            // point, consume a numeric token, and return it.
            if self.would_start_number(None, None, None) {
                self.reconsume();

                return self.read_numeric();
            }
            // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E
            // GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
            else if self.next() == Some('-') && self.next_next() == Some('>') {
                self.consume();
                self.consume();

                return Ok(Token::CDC);
            }
            // Otherwise, if the input stream starts with an identifier, reconsume the current
            // input code point, consume an ident-like token, and return it.
            else if self.would_start_ident(None, None, None) {
                self.reconsume();

                return self.read_ident_like();
            }

            // Otherwise, return a <delim-token> with its value set to the current input
            // code point.
            Ok(tok!("-"))
        }
        // U+002E FULL STOP (.)
        Some('.') => {
            // If the input stream starts with a number, reconsume the current input code
            // point, consume a numeric token, and return it.
            if self.would_start_number(None, None, None) {
                self.reconsume();

                return self.read_numeric();
            }

            // Otherwise, return a <delim-token> with its value set to the current input
            // code point.
            Ok(tok!("."))
        }
        // U+003A COLON (:)
        // Return a <colon-token>.
        Some(':') => Ok(tok!(":")),
        // U+003B SEMICOLON (;)
        // Return a <semicolon-token>.
        Some(';') => Ok(tok!(";")),
        // U+003C LESS-THAN SIGN (<)
        Some('<') => {
            // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D
            // HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), consume them and return a
            // <CDO-token>.
            if self.next() == Some('!')
                && self.next_next() == Some('-')
                && self.next_next_next() == Some('-')
            {
                self.consume(); // !
                self.consume(); // -
                self.consume(); // -

                return Ok(tok!("<!--"));
            }

            // Otherwise, return a <delim-token> with its value set to the current input
            // code point.
            Ok(tok!("<"))
        }
        // U+0040 COMMERCIAL AT (@)
        Some('@') => {
            let first = self.next();
            let second = self.next_next();
            let third = self.next_next_next();

            // If the next 3 input code points would start an identifier, consume a name,
            // create an <at-keyword-token> with its value set to the returned value, and
            // return it.
            if self.would_start_ident(first, second, third) {
                let ident_sequence = self.read_ident_sequence()?;

                return Ok(Token::AtKeyword {
                    value: ident_sequence.0,
                    raw: ident_sequence.1,
                });
            }

            // Otherwise, return a <delim-token> with its value set to the current input
            // code point.
            Ok(Token::Delim { value: '@' })
        }
        // U+005B LEFT SQUARE BRACKET ([)
        // Return a <[-token>.
        Some('[') => Ok(tok!("[")),
        // U+005C REVERSE SOLIDUS (\)
        Some('\\') => {
            // If the input stream starts with a valid escape, reconsume the current input
            // code point, consume an ident-like token, and return it.
            if self.is_valid_escape(None, None) {
                self.reconsume();

                return self.read_ident_like();
            }

            // Otherwise, this is a parse error. Return a <delim-token> with its value set
            // to the current input code point.
            self.emit_error(ErrorKind::InvalidEscape);

            Ok(Token::Delim { value: '\\' })
        }
        // U+005D RIGHT SQUARE BRACKET (])
        // Return a <]-token>.
        Some(']') => Ok(tok!("]")),
        // U+007B LEFT CURLY BRACKET ({)
        // Return a <{-token>.
        Some('{') => Ok(tok!("{")),
        // U+007D RIGHT CURLY BRACKET (})
        // Return a <}-token>.
        Some('}') => Ok(tok!("}")),
        // digit
        // Reconsume the current input code point, consume a numeric token, and return it.
        Some('0'..='9') => {
            self.reconsume();

            self.read_numeric()
        }
        // name-start code point
        // Reconsume the current input code point, consume an ident-like token, and return it.
        Some(c) if is_name_start(c) => {
            self.reconsume();

            self.read_ident_like()
        }
        // EOF
        // Signalled to the caller as `ErrorKind::Eof` rather than as a token.
        None => Err(ErrorKind::Eof),
        // anything else
        // Return a <delim-token> with its value set to the current input code point.
        Some(c) => Ok(Token::Delim { value: c }),
    }
}
483
484 // Consume comments.
485 // This section describes how to consume comments from a stream of code points.
486 // It returns nothing.
487 fn read_comments(&mut self) {
488 // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A
489 // ASTERISK (*), consume them and all following code points up to and including
490 // the first U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an
491 // EOF code point. Return to the start of this step.
492 // NOTE: We allow to parse line comments under the option.
493 if self.next() == Some('/') && self.next_next() == Some('*') {
494 let cmt_start = self.input.last_pos();
495
496 while self.next() == Some('/') && self.next_next() == Some('*') {
497 self.consume(); // '*'
498 self.consume(); // '/'
499
500 loop {
501 match self.consume() {
502 Some('*') if self.next() == Some('/') => {
503 self.consume(); // '/'
504
505 if self.comments.is_some() {
506 let last_pos = self.input.last_pos();
507 let text = unsafe {
508 // Safety: last_pos is a valid position
509 self.input.slice(cmt_start, last_pos)
510 };
511
512 self.pending_leading_comments.push(Comment {
513 kind: CommentKind::Block,
514 span: (self.start_pos, last_pos).into(),
515 text: self.atoms.atom(text),
516 });
517 }
518
519 break;
520 }
521 None => {
522 let span = Span::new(self.start_pos, self.input.last_pos());
523
524 self.errors
525 .borrow_mut()
526 .push(Error::new(span, ErrorKind::UnterminatedBlockComment));
527
528 return;
529 }
530 _ => {}
531 }
532 }
533 }
534 } else if self.config.allow_wrong_line_comments
535 && self.next() == Some('/')
536 && self.next_next() == Some('/')
537 {
538 while self.next() == Some('/') && self.next_next() == Some('/') {
539 self.consume(); // '/'
540 self.consume(); // '/'
541
542 let start_of_content = self.input.last_pos();
543
544 loop {
545 match self.consume() {
546 Some(c) if is_newline(c) => {
547 if self.comments.is_some() {
548 let last_pos = self.input.last_pos();
549 let text = unsafe {
550 // Safety: last_pos is a valid position
551 self.input.slice(start_of_content, last_pos)
552 };
553
554 self.pending_leading_comments.push(Comment {
555 kind: CommentKind::Line,
556 span: (self.start_pos, last_pos).into(),
557 text: self.atoms.atom(text),
558 });
559 }
560 break;
561 }
562 None => return,
563 _ => {}
564 }
565 }
566 }
567 }
568 }
569
570 // This section describes how to consume a numeric token from a stream of code
571 // points. It returns either a <number-token>, <percentage-token>, or
572 // <dimension-token>.
573 fn read_numeric(&mut self) -> LexResult<Token> {
574 // Consume a number and let number be the result.
575 let number = self.read_number()?;
576
577 let next_first = self.next();
578 let next_second = self.next_next();
579 let next_third = self.next_next_next();
580
581 // If the next 3 input code points would start an identifier, then:
582 if self.would_start_ident(next_first, next_second, next_third) {
583 // Swap logic to avoid create empty strings, because it doesn't make sense
584 //
585 // Consume a name. Set the <dimension-token>’s unit to the returned value.
586 let ident_sequence = self.read_ident_sequence()?;
587 // Create a <dimension-token> with the same value and type flag as number, and a
588 // unit set initially to the empty string.
589 let token = Token::Dimension(Box::new(DimensionToken {
590 value: number.0,
591 raw_value: number.1,
592 unit: ident_sequence.0,
593 raw_unit: ident_sequence.1,
594 type_flag: number.2,
595 }));
596
597 // Return the <dimension-token>.
598 return Ok(token);
599 }
600 // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. Create
601 // a <percentage-token> with the same value as number, and return it.
602 else if next_first == Some('%') {
603 self.consume();
604
605 return Ok(Token::Percentage {
606 value: number.0,
607 raw: number.1,
608 });
609 }
610
611 // Otherwise, create a <number-token> with the same value and type flag as
612 // number, and return it.
613 Ok(Token::Number {
614 value: number.0,
615 raw: number.1,
616 type_flag: number.2,
617 })
618 }
619
620 // This section describes how to consume an ident-like token from a stream of
621 // code points. It returns an <ident-token>, <function-token>, <url-token>, or
622 // <bad-url-token>.
623 fn read_ident_like(&mut self) -> LexResult<Token> {
624 // Consume a name, and let string be the result.
625 let ident_sequence = self.read_ident_sequence()?;
626
627 // If string’s value is an ASCII case-insensitive match for "url", and the next
628 // input code point is U+0028 LEFT PARENTHESIS ((), consume it.
629 if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some('(') {
630 self.consume();
631
632 let start_whitespace = self.input.last_pos();
633
634 // While the next two input code points are whitespace, consume the next input
635 // code point.
636 let whitespaces = self.with_buf(|l, buf| {
637 while let (Some(next), Some(next_next)) = (l.next(), l.next_next()) {
638 if is_whitespace(next) && is_whitespace(next_next) {
639 l.consume();
640
641 buf.push(next);
642 } else {
643 break;
644 }
645 }
646
647 Ok(buf.to_string())
648 })?;
649
650 match self.next() {
651 // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027
652 // APOSTROPHE ('), or whitespace followed by U+0022 QUOTATION MARK (") or U+0027
653 // APOSTROPHE ('), then create a <function-token> with its value set to string and
654 // return it.
655 Some(c)
656 if is_whitespace(c)
657 && (self.next_next() == Some('"') || self.next_next() == Some('\'')) =>
658 {
659 // Override last position because we consumed whitespaces, but they
660 // should not be part of token
661 self.override_pos = Some(start_whitespace);
662
663 return Ok(Token::Function {
664 value: ident_sequence.0,
665 raw: ident_sequence.1,
666 });
667 }
668 Some('"' | '\'') => {
669 return Ok(Token::Function {
670 value: ident_sequence.0,
671 raw: ident_sequence.1,
672 });
673 }
674 // Otherwise, consume a url token, and return it.
675 _ => {
676 return self.read_url(ident_sequence, whitespaces);
677 }
678 }
679 }
680 // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
681 // Create a <function-token> with its value set to string and return it.
682 else if self.next() == Some('(') {
683 self.consume();
684
685 return Ok(Token::Function {
686 value: ident_sequence.0,
687 raw: ident_sequence.1,
688 });
689 }
690
691 // Otherwise, create an <ident-token> with its value set to string and return
692 // it.
693 Ok(Token::Ident {
694 value: ident_sequence.0,
695 raw: ident_sequence.1,
696 })
697 }
698
699 // This section describes how to consume a string token from a stream of code
700 // points. It returns either a <string-token> or <bad-string-token>.
701 fn read_str(&mut self, maybe_ending_code_point: Option<char>) -> LexResult<Token> {
702 self.with_buf_and_raw_buf(|l, buf, raw| {
703 // This algorithm may be called with an ending code point, which denotes the
704 // code point that ends the string. If an ending code point is not specified,
705 // the current input code point is used.
706 let ending_code_point = maybe_ending_code_point.or_else(|| l.cur());
707
708 // Initially create a <string-token> with its value set to the empty string.
709 // Done above
710
711 raw.push(ending_code_point.unwrap());
712
713 // Repeatedly consume the next input code point from the stream:
714 loop {
715 match l.consume() {
716 // ending code point
717 // Return the <string-token>.
718 Some(c) if c == ending_code_point.unwrap() => {
719 raw.push(c);
720
721 break;
722 }
723
724 // EOF
725 // This is a parse error. Return the <string-token>.
726 None => {
727 l.emit_error(ErrorKind::UnterminatedString);
728
729 return Ok(Token::String {
730 value: l.atoms.atom(&**buf),
731 raw: l.atoms.atom(&**raw),
732 });
733 }
734
735 // Newline
736 // This is a parse error. Reconsume the current input code point, create a
737 // <bad-string-token>, and return it.
738 Some(c) if is_newline(c) => {
739 l.emit_error(ErrorKind::NewlineInString);
740 l.reconsume();
741
742 return Ok(Token::BadString {
743 raw: l.atoms.atom(&**raw),
744 });
745 }
746
747 // U+005C REVERSE SOLIDUS (\)
748 Some(c) if c == '\\' => {
749 let next = l.next();
750
751 // If the next input code point is EOF, do nothing.
752 if l.next().is_none() {
753 continue;
754 }
755 // Otherwise, if the next input code point is a newline, consume it.
756 else if l.next().is_some() && is_newline(l.next().unwrap()) {
757 l.consume();
758
759 raw.push(c);
760 raw.push(next.unwrap());
761 }
762 // Otherwise, (the stream starts with a valid escape) consume an escaped
763 // code point and append the returned code point to
764 // the <string-token>’s value.
765 else if l.is_valid_escape(None, None) {
766 let escape = l.read_escape()?;
767
768 buf.push(escape.0);
769 raw.push(c);
770 raw.push_str(&escape.1);
771 }
772 }
773
774 // Anything else
775 // Append the current input code point to the <string-token>’s value.
776 Some(c) => {
777 buf.push(c);
778 raw.push(c);
779 }
780 }
781 }
782
783 Ok(Token::String {
784 value: l.atoms.atom(&**buf),
785 raw: l.atoms.atom(&**raw),
786 })
787 })
788 }
789
/// Consumes a url token; yields a `<url-token>` or a `<bad-url-token>`.
///
/// * `name` is the (value, raw) pair of the already consumed function name;
///   only the raw part is used here.
/// * `before` is whitespace the caller already consumed after the opening
///   parenthesis; it is prepended to the raw text.
///
/// `out` accumulates the url's value and `raw` its source text. For a bad
/// url the token's raw text is rebuilt as `name(` followed by `raw`.
fn read_url(&mut self, name: (Atom, Atom), before: String) -> LexResult<Token> {
    // Initially create a <url-token> with its value set to the empty string.
    self.with_buf_and_raw_buf(|l, out, raw| {
        raw.push_str(&before);

        // Consume as much whitespace as possible.
        while let Some(c) = l.next() {
            if is_whitespace(c) {
                l.consume();

                raw.push(c);
            } else {
                break;
            }
        }

        // Repeatedly consume the next input code point from the stream:
        loop {
            match l.consume() {
                // U+0029 RIGHT PARENTHESIS ())
                // Return the <url-token>.
                Some(')') => {
                    return Ok(Token::Url {
                        value: l.atoms.atom(&**out),
                        raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
                    });
                }

                // EOF
                // This is a parse error. Return the <url-token>.
                None => {
                    l.emit_error(ErrorKind::UnterminatedUrl);

                    return Ok(Token::Url {
                        value: l.atoms.atom(&**out),
                        raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
                    });
                }

                // whitespace
                Some(c) if is_whitespace(c) => {
                    // Consume as much whitespace as possible.
                    // The sub buffer is used because `buf` and `raw_buf` are
                    // currently lent out as `out` and `raw`.
                    let whitespaces: String = l.with_sub_buf(|l, buf| {
                        buf.push(c);

                        while let Some(c) = l.next() {
                            if is_whitespace(c) {
                                l.consume();

                                buf.push(c);
                            } else {
                                break;
                            }
                        }

                        Ok(buf.to_string())
                    })?;

                    // if the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
                    // consume it and return the <url-token> (if EOF was
                    // encountered, this is a parse error);
                    match l.next() {
                        Some(')') => {
                            l.consume();

                            raw.push_str(&whitespaces);

                            return Ok(Token::Url {
                                value: l.atoms.atom(&**out),
                                raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
                            });
                        }
                        None => {
                            l.emit_error(ErrorKind::UnterminatedUrl);

                            raw.push_str(&whitespaces);

                            return Ok(Token::Url {
                                value: l.atoms.atom(&**out),
                                raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
                            });
                        }
                        _ => {}
                    }

                    // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
                    // and return it.
                    raw.push_str(&whitespaces);

                    let remnants = l.read_bad_url_remnants()?;

                    raw.push_str(&remnants);

                    return Ok(Token::BadUrl {
                        raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
                    });
                }

                // U+0022 QUOTATION MARK (")
                // U+0027 APOSTROPHE (')
                // U+0028 LEFT PARENTHESIS (()
                // non-printable code point
                // This is a parse error. Consume the remnants of a bad url, create a
                // <bad-url-token>, and return it.
                Some(c) if c == '"' || c == '\'' || c == '(' || is_non_printable(c) => {
                    l.emit_error(ErrorKind::UnexpectedCharInUrl);

                    let remnants = l.read_bad_url_remnants()?;

                    raw.push(c);
                    raw.push_str(&remnants);

                    return Ok(Token::BadUrl {
                        raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
                    });
                }

                // U+005C REVERSE SOLIDUS (\)
                Some(c) if c == '\\' => {
                    // If the stream starts with a valid escape, consume an escaped code point
                    // and append the returned code point to the
                    // <url-token>’s value.
                    if l.is_valid_escape(None, None) {
                        let escaped = l.read_escape()?;

                        out.push(escaped.0);
                        raw.push(c);
                        raw.push_str(&escaped.1);
                    }
                    // Otherwise, this is a parse error. Consume the remnants of a bad url,
                    // create a <bad-url-token>, and return it.
                    else {
                        l.emit_error(ErrorKind::InvalidEscape);

                        let remnants = l.read_bad_url_remnants()?;

                        raw.push(c);
                        raw.push_str(&remnants);

                        return Ok(Token::BadUrl {
                            raw: Atom::new(format!("{}{}{}", name.1, "(", raw)),
                        });
                    }
                }

                // anything else
                // Append the current input code point to the <url-token>’s value.
                Some(c) => {
                    out.push(c);
                    raw.push(c);
                }
            }
        }
    })
}
947
948 // Consume an escaped code point
949 // This section describes how to consume an escaped code point. It assumes that
950 // the U+005C REVERSE SOLIDUS (\) has already been consumed and that the next
951 // input code point has already been verified to be part of a valid escape. It
952 // will return a code point.
953 fn read_escape(&mut self) -> LexResult<(char, String)> {
954 self.with_sub_buf(|l, buf| {
955 // Consume the next input code point.
956 match l.consume() {
957 // hex digit
958 Some(c) if is_hex_digit(c) => {
959 let mut hex = c.to_digit(16).unwrap();
960
961 buf.push(c);
962
963 // Consume as many hex digits as possible, but no more than 5.
964 // Note that this means 1-6 hex digits have been consumed in total.
965 for _ in 0..5 {
966 let next = l.next();
967 let digit = match next.and_then(|c| c.to_digit(16)) {
968 Some(v) => v,
969 None => break,
970 };
971
972 l.consume();
973
974 buf.push(next.unwrap());
975 hex = hex * 16 + digit;
976 }
977
978 // If the next input code point is whitespace, consume it as well.
979 let next = l.next();
980
981 if let Some(next) = next {
982 if is_whitespace(next) {
983 l.consume();
984
985 buf.push(next);
986 }
987 }
988
989 // Interpret the hex digits as a hexadecimal number. If this number is zero, or
990 // is for a surrogate, or is greater than the maximum allowed code point, return
991 // U+FFFD REPLACEMENT CHARACTER (�).
992 let hex = match hex {
993 // If this number is zero
994 0 => REPLACEMENT_CHARACTER,
995 // or is for a surrogate
996 55296..=57343 => REPLACEMENT_CHARACTER,
997 // or is greater than the maximum allowed code point
998 1114112.. => REPLACEMENT_CHARACTER,
999 _ => char::from_u32(hex).unwrap_or(REPLACEMENT_CHARACTER),
1000 };
1001
1002 // Otherwise, return the code point with that value.
1003 Ok((hex, (&**buf).into()))
1004 }
1005 // EOF
1006 // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
1007 None => {
1008 l.emit_error(ErrorKind::InvalidEscape);
1009
1010 let value = REPLACEMENT_CHARACTER;
1011
1012 buf.push(value);
1013
1014 Ok((value, (&**buf).into()))
1015 }
1016 // anything else
1017 // Return the current input code point.
1018 Some(c) => {
1019 buf.push(c);
1020
1021 Ok((c, (&**buf).into()))
1022 }
1023 }
1024 })
1025 }
1026
1027 // Check if two code points are a valid escape
1028 // This section describes how to check if two code points are a valid escape.
1029 // The algorithm described here can be called explicitly with two code points,
1030 // or can be called with the input stream itself. In the latter case, the two
1031 // code points in question are the current input code point and the next input
1032 // code point, in that order.
1033 fn is_valid_escape(&mut self, maybe_first: Option<char>, maybe_second: Option<char>) -> bool {
1034 // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
1035 if maybe_first.or_else(|| self.cur()) != Some('\\') {
1036 return false;
1037 }
1038
1039 match maybe_second.or_else(|| self.next()) {
1040 // Otherwise, if the second code point is a newline, return false.
1041 Some(second) => !is_newline(second),
1042 // Otherwise, return true.
1043 None => false,
1044 }
1045 }
1046
1047 // Check if three code points would start an identifier
1048 // This section describes how to check if three code points would start an
1049 // identifier. The algorithm described here can be called explicitly with three
1050 // code points, or can be called with the input stream itself. In the latter
1051 // case, the three code points in question are the current input code point and
1052 // the next two input code points, in that order.
1053 fn would_start_ident(
1054 &mut self,
1055 maybe_first: Option<char>,
1056 maybe_second: Option<char>,
1057 maybe_third: Option<char>,
1058 ) -> bool {
1059 // Look at the first code point:
1060 let first = maybe_first.or_else(|| self.cur());
1061
1062 match first {
1063 // U+002D HYPHEN-MINUS
1064 Some('-') => {
1065 let second = maybe_second.or_else(|| self.next());
1066
1067 match second {
1068 // If the second code point is a name-start code point
1069 // return true.
1070 Some(c) if is_name_start(c) => true,
1071 // or a U+002D HYPHEN-MINUS,
1072 // return true.
1073 Some('-') => true,
1074 // or the second and third code points are a valid escape
1075 // return true.
1076 Some(_) => {
1077 let third = maybe_third.or_else(|| self.next_next());
1078
1079 self.is_valid_escape(second, third)
1080 }
1081 // Otherwise, return false.
1082 _ => false,
1083 }
1084 }
1085 // name-start code point
1086 // Return true.
1087 Some(c) if is_name_start(c) => true,
1088 // U+005C REVERSE SOLIDUS (\)
1089 // If the first and second code points are a valid escape, return true. Otherwise,
1090 // return false.
1091 Some('\\') => {
1092 let second = maybe_second.or_else(|| self.next());
1093
1094 self.is_valid_escape(first, second)
1095 }
1096 _ => false,
1097 }
1098 }
1099
1100 // Check if three code points would start a number
1101 // This section describes how to check if three code points would start a
1102 // number. The algorithm described here can be called explicitly with three code
1103 // points, or can be called with the input stream itself. In the latter case,
1104 // the three code points in question are the current input code point and the
1105 // next two input code points, in that order.
1106 #[allow(clippy::needless_return)]
1107 fn would_start_number(
1108 &mut self,
1109 maybe_first: Option<char>,
1110 maybe_second: Option<char>,
1111 maybe_third: Option<char>,
1112 ) -> bool {
1113 // Look at the first code point:
1114 let first = maybe_first.or_else(|| self.cur());
1115
1116 match first {
1117 // U+002B PLUS SIGN (+)
1118 // U+002D HYPHEN-MINUS (-)
1119 Some('+') | Some('-') => {
1120 match maybe_second.or_else(|| self.next()) {
1121 // If the second code point is a digit, return true.
1122 Some(second) if second.is_ascii_digit() => return true,
1123 // Otherwise, if the second code point is a U+002E FULL STOP (.) and the
1124 // third code point is a digit, return true.
1125 Some('.') => {
1126 if let Some(third) = maybe_third.or_else(|| self.next_next()) {
1127 if third.is_ascii_digit() {
1128 return true;
1129 }
1130 }
1131
1132 return false;
1133 }
1134 // Otherwise, return false.
1135 _ => return false,
1136 };
1137 }
1138 // U+002E FULL STOP (.)
1139 Some('.') => {
1140 // If the second code point is a digit, return true.
1141 if let Some(second) = self.next() {
1142 if second.is_ascii_digit() {
1143 return true;
1144 }
1145 }
1146
1147 // Otherwise, return false.
1148 false
1149 }
1150 // digit
1151 // Return true.
1152 Some(first) if first.is_ascii_digit() => true,
1153 // anything else
1154 // Return false.
1155 _ => false,
1156 }
1157 }
1158
1159 // Consume an ident sequence
1160 // This section describes how to consume an ident sequence from a stream of code
1161 // points. It returns a string containing the largest name that can be formed
1162 // from adjacent code points in the stream, starting from the first.
1163 fn read_ident_sequence(&mut self) -> LexResult<(Atom, Atom)> {
1164 self.with_buf_and_raw_buf(|l, buf, raw| {
1165 // Let result initially be an empty string.
1166 // Done above
1167
1168 // Repeatedly consume the next input code point from the stream:
1169 loop {
1170 match l.consume() {
1171 // name code point
1172 // Append the code point to result.
1173 Some(c) if is_name(c) => {
1174 buf.push(c);
1175 raw.push(c);
1176 }
1177 // the stream starts with a valid escape
1178 // Consume an escaped code point. Append the returned code point to result.
1179 Some(c) if l.is_valid_escape(None, None) => {
1180 let escaped = l.read_escape()?;
1181
1182 buf.push(escaped.0);
1183 raw.push(c);
1184 raw.push_str(&escaped.1);
1185 }
1186 // anything else
1187 // Reconsume the current input code point. Return result.
1188 _ => {
1189 l.reconsume();
1190
1191 break;
1192 }
1193 }
1194 }
1195
1196 Ok((l.atoms.atom(&**buf), l.atoms.atom(&**raw)))
1197 })
1198 }
1199
1200 // This section describes how to consume a number from a stream of code points.
1201 // It returns a numeric value, and a type which is either "integer" or "number".
1202 fn read_number(&mut self) -> LexResult<(f64, Atom, NumberType)> {
1203 let parsed: (Atom, NumberType) = self.with_buf(|l, out| {
1204 // Initially set type to "integer". Let repr be the empty string.
1205 let mut type_flag = NumberType::Integer;
1206
1207 // If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
1208 // (-), consume it and append it to repr.
1209 let next = l.next();
1210
1211 if next == Some('+') || next == Some('-') {
1212 l.consume();
1213
1214 out.push(next.unwrap());
1215 }
1216
1217 // While the next input code point is a digit, consume it and append it to repr.
1218 while let Some(c) = l.next() {
1219 if c.is_ascii_digit() {
1220 l.consume();
1221
1222 out.push(c);
1223 } else {
1224 break;
1225 }
1226 }
1227
1228 // If the next 2 input code points are U+002E FULL STOP (.) followed by a digit,
1229 // then:
1230 let next = l.next();
1231
1232 if next == Some('.') {
1233 if let Some(n) = l.next_next() {
1234 if n.is_ascii_digit() {
1235 // Consume them.
1236 l.consume();
1237 l.consume();
1238
1239 // Append them to repr.
1240 out.push(next.unwrap());
1241 out.push(n);
1242
1243 // Set type to "number".
1244 type_flag = NumberType::Number;
1245
1246 // While the next input code point is a digit, consume it and append it to
1247 // repr.
1248 while let Some(c) = l.next() {
1249 if c.is_ascii_digit() {
1250 l.consume();
1251
1252 out.push(c);
1253 } else {
1254 break;
1255 }
1256 }
1257 }
1258 }
1259 }
1260
1261 // If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or
1262 // U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS
1263 // (-) or U+002B PLUS SIGN (+), followed by a digit, then:
1264 let next = l.next();
1265
1266 if next == Some('E') || next == Some('e') {
1267 let next_next = l.next_next();
1268 let next_next_next = l.next_next_next();
1269
1270 if (next_next == Some('-')
1271 || next_next == Some('+')
1272 && next_next_next.is_some()
1273 && next_next_next.unwrap().is_ascii_digit())
1274 || next_next.is_some() && next_next.unwrap().is_ascii_digit()
1275 {
1276 // Consume them.
1277 l.consume();
1278 l.consume();
1279
1280 // Append them to repr.
1281 out.push(next.unwrap());
1282 out.push(next_next.unwrap());
1283
1284 // Set type to "number".
1285 type_flag = NumberType::Number;
1286
1287 // While the next input code point is a digit, consume it and append it
1288 // to repr.
1289 while let Some(c) = l.next() {
1290 if c.is_ascii_digit() {
1291 l.consume();
1292
1293 out.push(c);
1294 } else {
1295 break;
1296 }
1297 }
1298 }
1299 }
1300
1301 // Return value and type.
1302 Ok((l.atoms.atom(&**out), type_flag))
1303 })?;
1304
1305 // Convert repr to a number, and set the value to the returned value.
1306 let value = lexical::parse(&*parsed.0).unwrap_or_else(|err| {
1307 unreachable!("failed to parse `{}` using lexical: {:?}", parsed.0, err)
1308 });
1309
1310 Ok((value, parsed.0, parsed.1))
1311 }
1312
1313 // Consume the remnants of a bad url
1314 // This section describes how to consume the remnants of a bad url from a stream
1315 // of code points, "cleaning up" after the tokenizer realizes that it’s in the
1316 // middle of a <bad-url-token> rather than a <url-token>. It returns nothing;
1317 // its sole use is to consume enough of the input stream to reach a recovery
1318 // point where normal tokenizing can resume. But for recovery purpose we return
1319 // bad URL remnants.
1320 fn read_bad_url_remnants(&mut self) -> LexResult<String> {
1321 self.with_sub_buf(|l, raw| {
1322 // Repeatedly consume the next input code point from the stream:
1323 loop {
1324 match l.consume() {
1325 // U+0029 RIGHT PARENTHESIS ())
1326 // EOF
1327 // Return.
1328 Some(c @ ')') => {
1329 raw.push(c);
1330
1331 break;
1332 }
1333 None => {
1334 break;
1335 }
1336 // the input stream starts with a valid escape
1337 Some(c) if l.is_valid_escape(None, None) => {
1338 // Consume an escaped code point. This allows an escaped right parenthesis
1339 // ("\)") to be encountered without ending the <bad-url-token>.
1340 let escaped = l.read_escape()?;
1341
1342 raw.push(c);
1343 raw.push_str(&escaped.1);
1344 }
1345 // anything else
1346 // Do nothing.
1347 Some(c) => {
1348 raw.push(c);
1349 }
1350 }
1351 }
1352
1353 Ok((&**raw).into())
1354 })
1355 }
1356}
1357
/// Returns `true` for a digit: a code point between U+0030 DIGIT ZERO (0) and
/// U+0039 DIGIT NINE (9) inclusive.
#[inline(always)]
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
1362
1363#[inline(always)]
1364fn is_hex_digit(c: char) -> bool {
1365 match c {
1366 c if is_digit(c) => true,
1367 'A'..='F' => true,
1368 'a'..='f' => true,
1369 _ => false,
1370 }
1371}
1372
/// Returns `true` for an uppercase letter: a code point between U+0041 LATIN
/// CAPITAL LETTER A and U+005A LATIN CAPITAL LETTER Z inclusive.
#[inline(always)]
fn is_uppercase_letter(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
1377
/// Returns `true` for a lowercase letter: a code point between U+0061 LATIN
/// SMALL LETTER A and U+007A LATIN SMALL LETTER Z inclusive.
#[inline(always)]
fn is_lowercase_letter(c: char) -> bool {
    matches!(c, 'a'..='z')
}
1382
1383#[inline(always)]
1384fn is_letter(c: char) -> bool {
1385 is_uppercase_letter(c) || is_lowercase_letter(c)
1386}
1387
/// Returns `true` for a non-ASCII code point: one with a value equal to or
/// greater than U+0080.
#[inline(always)]
fn is_non_ascii(c: char) -> bool {
    !c.is_ascii()
}
1392
1393#[inline(always)]
1394fn is_name_start(c: char) -> bool {
1395 matches!(c, c if is_letter(c) || is_non_ascii(c) || c == '_' || c == '\x00')
1396}
1397
1398#[inline(always)]
1399fn is_name(c: char) -> bool {
1400 is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == '-')
1401}
1402
/// Returns `true` for a non-printable code point: U+0000 through U+0008,
/// U+000B, U+000E through U+001F, or U+007F.
#[inline(always)]
fn is_non_printable(c: char) -> bool {
    matches!(c as u32, 0x00..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F)
}
1407
/// Returns `true` for a newline: U+000A LINE FEED, U+000D CARRIAGE RETURN, or
/// U+000C FORM FEED.
#[inline(always)]
fn is_newline(c: char) -> bool {
    c == '\n' || c == '\r' || c == '\x0C'
}
1412
1413#[inline(always)]
1414fn is_whitespace(c: char) -> bool {
1415 matches!(c, c if c == ' ' || c == '\t' || is_newline(c))
1416}