swc_html_parser/lexer/
mod.rs

1use std::{cell::RefCell, char::REPLACEMENT_CHARACTER, collections::VecDeque, mem::take, rc::Rc};
2
3use rustc_hash::FxHashSet;
4use swc_atoms::{atom, Atom};
5use swc_common::{input::Input, BytePos, Span};
6use swc_html_ast::{AttributeToken, Raw, Token, TokenAndSpan};
7use swc_html_utils::{Entity, HTML_ENTITIES};
8
9use crate::{
10    error::{Error, ErrorKind},
11    parser::input::ParserInput,
12};
13
/// States of the tokenizer state machine driven by [`Lexer`].
///
/// The lexer stores one value of this type as its current state and another as
/// its return state; the parser can also force a state through
/// `ParserInput::set_input_state`.
///
/// NOTE(review): the variant names appear to follow the tokenizer state names
/// of the HTML Standard ("Data state", "RCDATA state", ...) — confirm against
/// the state-handling code, which is outside this view.
// NOTE(review): "Hexademical" below is presumably a misspelling of
// "Hexadecimal"; the variants are public, so renaming them would be a breaking
// change.
#[derive(Debug, Clone)]
pub enum State {
    Data,
    Rcdata,
    Rawtext,
    ScriptData,
    PlainText,
    TagOpen,
    EndTagOpen,
    TagName,
    RcdataLessThanSign,
    RcdataEndTagOpen,
    RcdataEndTagName,
    RawtextLessThanSign,
    RawtextEndTagOpen,
    RawtextEndTagName,
    ScriptDataLessThanSign,
    ScriptDataEndTagOpen,
    ScriptDataEndTagName,
    ScriptDataEscapeStart,
    ScriptDataEscapeStartDash,
    ScriptDataEscaped,
    ScriptDataEscapedDash,
    ScriptDataEscapedDashDash,
    ScriptDataEscapedLessThanSign,
    ScriptDataEscapedEndTagOpen,
    ScriptDataEscapedEndTagName,
    ScriptDataDoubleEscapeStart,
    ScriptDataDoubleEscaped,
    ScriptDataDoubleEscapedDash,
    ScriptDataDoubleEscapedDashDash,
    ScriptDataDoubleEscapedLessThanSign,
    ScriptDataDoubleEscapeEnd,
    BeforeAttributeName,
    AttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    AttributeValueDoubleQuoted,
    AttributeValueSingleQuoted,
    AttributeValueUnquoted,
    AfterAttributeValueQuoted,
    SelfClosingStartTag,
    BogusComment,
    MarkupDeclarationOpen,
    CommentStart,
    CommentStartDash,
    Comment,
    CommentLessThanSign,
    CommentLessThanSignBang,
    CommentLessThanSignBangDash,
    CommentLessThanSignBangDashDash,
    CommentEndDash,
    CommentEnd,
    CommentEndBang,
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypePublicKeyword,
    BeforeDoctypePublicIdentifier,
    DoctypePublicIdentifierDoubleQuoted,
    DoctypePublicIdentifierSingleQuoted,
    AfterDoctypePublicIdentifier,
    BetweenDoctypePublicAndSystemIdentifiers,
    AfterDoctypeSystemKeyword,
    BeforeDoctypeSystemIdentifier,
    DoctypeSystemIdentifierDoubleQuoted,
    DoctypeSystemIdentifierSingleQuoted,
    AfterDoctypeSystemIdentifier,
    BogusDoctype,
    CdataSection,
    CdataSectionBracket,
    CdataSectionEnd,
    CharacterReference,
    NamedCharacterReference,
    AmbiguousAmpersand,
    NumericCharacterReference,
    HexademicalCharacterReferenceStart,
    DecimalCharacterReferenceStart,
    HexademicalCharacterReference,
    DecimalCharacterReference,
    NumericCharacterReferenceEnd,
}
97
/// Result type used inside the lexer; the error side is a bare [`ErrorKind`]
/// rather than a full, span-carrying `Error`.
pub(crate) type LexResult<T> = Result<T, ErrorKind>;
99
/// Tokenizer that turns an [`Input`] into a stream of [`TokenAndSpan`] values.
///
/// Tokens are produced through the `Iterator` implementation; the parser steers
/// the lexer through the `ParserInput` implementation.
pub struct Lexer<'a, I>
where
    I: Input<'a>,
{
    // Underlying character source.
    input: I,
    // Character read by the most recent `consume` call (`None` at end of input).
    cur: Option<char>,
    // Input position recorded by the most recent `consume`, i.e. where `cur`
    // starts; `reconsume` rewinds the input to it.
    cur_pos: BytePos,
    // End position of the last emitted token; the next token's span starts here.
    last_token_pos: BytePos,
    // Set once the EOF token has been handed out; afterwards reading a token
    // fails with `ErrorKind::Eof`.
    finished: bool,
    // Current tokenizer state.
    state: State,
    // Saved state consulted by `is_consumed_as_part_of_an_attribute`.
    // NOTE(review): presumably the state to return to after a character
    // reference — confirm in the state-handling code.
    return_state: State,
    // Errors collected so far; drained by `take_errors`.
    errors: Vec<Error>,
    // Name of the last emitted start tag (or the one set by the parser), used to
    // decide whether an end tag is "appropriate".
    last_start_tag_name: Option<Atom>,
    // Tokens emitted but not yet returned to the caller.
    pending_tokens: VecDeque<TokenAndSpan>,
    // Scratch buffer for the processed value being built (lowercased names,
    // CR-normalized text).
    buf: Rc<RefCell<String>>,
    // Scratch buffer for the raw source text matching `buf`.
    sub_buf: Rc<RefCell<String>>,
    // Doctype or tag token currently under construction.
    current_token: Option<Token>,
    // Attribute names already seen on the current tag, for duplicate detection;
    // cleared whenever a tag token is emitted.
    attributes_validator: FxHashSet<Atom>,
    // Position where the attribute currently being built started.
    attribute_start_position: Option<BytePos>,
    // NOTE(review): not used in the visible part of the file — presumably
    // bookkeeping for numeric character references; confirm at its use sites.
    character_reference_code: Option<Vec<(u8, u32, Option<char>)>>,
    // Buffer of pending code points, flushed either as character tokens or into
    // the current attribute value.
    temporary_buffer: String,
    // Flag supplied by the parser via
    // `set_adjusted_current_node_to_html_namespace`; not read in the visible
    // part of the file.
    is_adjusted_current_node_is_element_in_html_namespace: Option<bool>,
    // Ties the otherwise unused lifetime `'a` to the struct.
    phantom: std::marker::PhantomData<&'a ()>,
}
124
125impl<'a, I> Lexer<'a, I>
126where
127    I: Input<'a>,
128{
129    pub fn new(input: I) -> Self {
130        let start_pos = input.last_pos();
131
132        let mut lexer = Lexer {
133            input,
134            cur: None,
135            cur_pos: start_pos,
136            last_token_pos: start_pos,
137            finished: false,
138            state: State::Data,
139            return_state: State::Data,
140            errors: Vec::new(),
141            last_start_tag_name: None,
142            pending_tokens: VecDeque::with_capacity(16),
143            buf: Rc::new(RefCell::new(String::with_capacity(256))),
144            sub_buf: Rc::new(RefCell::new(String::with_capacity(256))),
145            current_token: None,
146            attributes_validator: Default::default(),
147            attribute_start_position: None,
148            character_reference_code: None,
149            // Do this without a new allocation.
150            temporary_buffer: String::with_capacity(33),
151            is_adjusted_current_node_is_element_in_html_namespace: None,
152            phantom: std::marker::PhantomData,
153        };
154
155        // A leading Byte Order Mark (BOM) causes the character encoding argument to be
156        // ignored and will itself be skipped.
157        if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') {
158            unsafe {
159                // Safety: We know that the current character is '\u{feff}'.
160                lexer.input.bump();
161            }
162        }
163
164        lexer
165    }
166}
167
168impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
169    type Item = TokenAndSpan;
170
171    fn next(&mut self) -> Option<Self::Item> {
172        let token_and_span = self.read_token_and_span();
173
174        match token_and_span {
175            Ok(token_and_span) => {
176                return Some(token_and_span);
177            }
178            Err(..) => {
179                return None;
180            }
181        }
182    }
183}
184
185impl<'a, I> ParserInput for Lexer<'a, I>
186where
187    I: Input<'a>,
188{
189    fn start_pos(&mut self) -> BytePos {
190        self.input.cur_pos()
191    }
192
193    fn last_pos(&mut self) -> BytePos {
194        self.input.last_pos()
195    }
196
197    fn take_errors(&mut self) -> Vec<Error> {
198        take(&mut self.errors)
199    }
200
201    fn set_last_start_tag_name(&mut self, tag_name: &Atom) {
202        self.last_start_tag_name = Some(tag_name.clone());
203    }
204
205    fn set_adjusted_current_node_to_html_namespace(&mut self, value: bool) {
206        self.is_adjusted_current_node_is_element_in_html_namespace = Some(value);
207    }
208
209    fn set_input_state(&mut self, state: State) {
210        self.state = state;
211    }
212}
213
214impl<'a, I> Lexer<'a, I>
215where
216    I: Input<'a>,
217{
218    #[inline(always)]
219    fn next(&mut self) -> Option<char> {
220        self.input.cur()
221    }
222
223    // Any occurrences of surrogates are surrogate-in-input-stream parse errors. Any
224    // occurrences of noncharacters are noncharacter-in-input-stream parse errors
225    // and any occurrences of controls other than ASCII whitespace and U+0000 NULL
226    // characters are control-character-in-input-stream parse errors.
227    //
228    // Postpone validation for each character for perf reasons and do it in
229    // `anything else`
230    #[inline(always)]
231    fn validate_input_stream_character(&mut self, c: char) {
232        let code = c as u32;
233
234        if is_surrogate(code) {
235            self.emit_error(ErrorKind::SurrogateInInputStream);
236        } else if is_allowed_control_character(code) {
237            self.emit_error(ErrorKind::ControlCharacterInInputStream);
238        } else if is_noncharacter(code) {
239            self.emit_error(ErrorKind::NoncharacterInInputStream);
240        }
241    }
242
243    #[inline(always)]
244    fn consume(&mut self) {
245        self.cur = self.input.cur();
246        self.cur_pos = self.input.cur_pos();
247
248        if self.cur.is_some() {
249            unsafe {
250                // Safety: self.cur is Some()
251                self.input.bump();
252            }
253        }
254    }
255
256    #[inline(always)]
257    fn reconsume(&mut self) {
258        unsafe {
259            // Safety: self.cur_pos is valid position because we got it from self.input
260            self.input.reset_to(self.cur_pos);
261        }
262    }
263
264    #[inline(always)]
265    fn reconsume_in_state(&mut self, state: State) {
266        self.state = state;
267        self.reconsume();
268    }
269
270    #[inline(always)]
271    fn consume_next_char(&mut self) -> Option<char> {
272        // The next input character is the first character in the input stream that has
273        // not yet been consumed or explicitly ignored by the requirements in this
274        // section. Initially, the next input character is the first character in the
275        // input. The current input character is the last character to have been
276        // consumed.
277        let c = self.next();
278
279        self.consume();
280
281        c
282    }
283
284    #[cold]
285    fn emit_error(&mut self, kind: ErrorKind) {
286        self.errors.push(Error::new(
287            Span::new(self.cur_pos, self.input.cur_pos()),
288            kind,
289        ));
290    }
291
292    #[inline(always)]
293    fn emit_token(&mut self, token: Token) {
294        let cur_pos = self.input.cur_pos();
295
296        let span = Span::new(self.last_token_pos, cur_pos);
297
298        self.last_token_pos = cur_pos;
299        self.pending_tokens.push_back(TokenAndSpan { span, token });
300    }
301
302    #[inline(always)]
303    fn is_consumed_as_part_of_an_attribute(&mut self) -> bool {
304        matches!(
305            self.return_state,
306            State::AttributeValueSingleQuoted
307                | State::AttributeValueDoubleQuoted
308                | State::AttributeValueUnquoted
309        )
310    }
311
312    // An appropriate end tag token is an end tag token whose tag name matches the
313    // tag name of the last start tag to have been emitted from this tokenizer, if
314    // any. If no start tag has been emitted from this tokenizer, then no end tag
315    // token is appropriate.
316    #[inline(always)]
317    fn current_end_tag_token_is_an_appropriate_end_tag_token(&mut self) -> bool {
318        if let Some(last_start_tag_name) = &self.last_start_tag_name {
319            let b = self.buf.clone();
320            let buf = b.borrow();
321
322            return *last_start_tag_name == *buf;
323        }
324
325        false
326    }
327
328    #[inline(always)]
329    fn emit_temporary_buffer_as_character_tokens(&mut self) {
330        for c in take(&mut self.temporary_buffer).chars() {
331            self.emit_token(Token::Character {
332                value: c,
333                raw: Some(Raw::Same),
334            });
335        }
336    }
337
338    fn flush_code_points_consumed_as_character_reference(&mut self, raw: Option<String>) {
339        if self.is_consumed_as_part_of_an_attribute() {
340            let b = self.buf.clone();
341            let mut buf = b.borrow_mut();
342            let b = self.sub_buf.clone();
343            let mut sub_buf = b.borrow_mut();
344
345            // When the length of raw is more than the length of temporary buffer we emit a
346            // raw character in the first character token
347            let mut once_raw = raw;
348            let mut once_emitted = false;
349
350            for c in take(&mut self.temporary_buffer).chars() {
351                buf.push(c);
352
353                let raw = match once_raw {
354                    Some(_) => {
355                        once_emitted = true;
356                        once_raw.take()
357                    }
358                    _ => {
359                        if once_emitted {
360                            None
361                        } else {
362                            Some(String::from(c))
363                        }
364                    }
365                };
366
367                if let Some(raw) = raw {
368                    sub_buf.push_str(&raw);
369                }
370            }
371        } else {
372            // When the length of raw is more than the length of temporary buffer we emit a
373            // raw character in the first character token
374            let mut once_raw = raw;
375
376            let is_value_eq_raw = if let Some(raw) = &once_raw {
377                *raw == self.temporary_buffer
378            } else {
379                true
380            };
381
382            for c in take(&mut self.temporary_buffer).chars() {
383                self.emit_token(Token::Character {
384                    value: c,
385                    raw: if is_value_eq_raw {
386                        Some(Raw::Same)
387                    } else {
388                        once_raw.take().map(|x| Raw::Atom(Atom::new(x)))
389                    },
390                });
391            }
392        }
393    }
394
395    #[inline(always)]
396    fn create_doctype_token(&mut self) {
397        self.current_token = Some(Token::Doctype {
398            name: None,
399            force_quirks: false,
400            public_id: None,
401            system_id: None,
402            raw: None,
403        });
404    }
405
406    fn append_raw_to_doctype_token(&mut self, c: char) {
407        let b = self.sub_buf.clone();
408        let mut sub_buf = b.borrow_mut();
409
410        let is_cr = c == '\r';
411
412        if is_cr {
413            sub_buf.push(c);
414
415            if self.input.cur() == Some('\n') {
416                unsafe {
417                    // Safety: cur() is Some('\n')
418                    self.input.bump();
419                }
420
421                sub_buf.push('\n');
422            }
423        } else {
424            sub_buf.push(c);
425        }
426    }
427
428    fn append_to_doctype_token(
429        &mut self,
430        name: Option<char>,
431        public_id: Option<char>,
432        system_id: Option<char>,
433    ) {
434        let b = self.buf.clone();
435        let mut buf = b.borrow_mut();
436
437        if let Some(name) = name {
438            buf.push(name);
439        }
440
441        if let Some(public_id) = public_id {
442            buf.push(public_id);
443        }
444
445        if let Some(system_id) = system_id {
446            buf.push(system_id);
447        }
448    }
449
450    fn consume_and_append_to_doctype_token_name<F>(&mut self, c: char, f: F)
451    where
452        F: Fn(char) -> bool,
453    {
454        let b = self.buf.clone();
455        let mut buf = b.borrow_mut();
456        let b = self.sub_buf.clone();
457        let mut sub_buf = b.borrow_mut();
458
459        buf.push(c.to_ascii_lowercase());
460        sub_buf.push(c);
461
462        let value = self.input.uncons_while(f);
463
464        buf.push_str(&value.to_ascii_lowercase());
465        sub_buf.push_str(value);
466    }
467
468    fn consume_and_append_to_doctype_token_public_id<F>(&mut self, c: char, f: F)
469    where
470        F: Fn(char) -> bool,
471    {
472        let b = self.buf.clone();
473        let mut buf = b.borrow_mut();
474        let b = self.sub_buf.clone();
475        let mut sub_buf = b.borrow_mut();
476
477        let is_cr = c == '\r';
478
479        if is_cr {
480            buf.push('\n');
481            sub_buf.push(c);
482
483            if self.input.cur() == Some('\n') {
484                unsafe {
485                    // Safety: cur() is Some('\n')
486                    self.input.bump();
487                }
488
489                sub_buf.push('\n');
490            }
491        } else {
492            buf.push(c);
493            sub_buf.push(c);
494        }
495
496        let value = self.input.uncons_while(f);
497
498        buf.push_str(value);
499        sub_buf.push_str(value);
500    }
501
502    fn consume_and_append_to_doctype_token_system_id<F>(&mut self, c: char, f: F)
503    where
504        F: Fn(char) -> bool,
505    {
506        let b = self.buf.clone();
507        let mut buf = b.borrow_mut();
508        let b = self.sub_buf.clone();
509        let mut sub_buf = b.borrow_mut();
510
511        let is_cr = c == '\r';
512
513        if is_cr {
514            buf.push('\n');
515            sub_buf.push(c);
516
517            if self.input.cur() == Some('\n') {
518                unsafe {
519                    // Safety: cur() is Some('\n')
520                    self.input.bump();
521                }
522
523                sub_buf.push('\n');
524            }
525        } else {
526            buf.push(c);
527            sub_buf.push(c);
528        }
529
530        let value = self.input.uncons_while(f);
531
532        buf.push_str(value);
533        sub_buf.push_str(value);
534    }
535
536    #[inline(always)]
537    fn set_doctype_token_force_quirks(&mut self) {
538        if let Some(Token::Doctype { force_quirks, .. }) = &mut self.current_token {
539            *force_quirks = true;
540        }
541    }
542
543    #[inline(always)]
544    fn set_doctype_token_name(&mut self, c: char) {
545        let b = self.buf.clone();
546        let mut buf = b.borrow_mut();
547
548        buf.push(c);
549    }
550
551    #[inline(always)]
552    fn set_doctype_token_public_id(&mut self) {
553        if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token {
554            *public_id = Some(atom!(""));
555        }
556    }
557
558    #[inline(always)]
559    fn set_doctype_token_system_id(&mut self) {
560        if let Some(Token::Doctype { system_id, .. }) = &mut self.current_token {
561            // The Longest system id is `http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd`
562            *system_id = Some(atom!(""));
563        }
564    }
565
566    fn finish_doctype_token_name(&mut self) {
567        if let Some(Token::Doctype { name, .. }) = &mut self.current_token {
568            let b = self.buf.clone();
569            let mut buf = b.borrow_mut();
570
571            *name = Some(buf.clone().into());
572
573            buf.clear();
574        }
575    }
576
577    fn finish_doctype_token_public_id(&mut self) {
578        if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token {
579            let b = self.buf.clone();
580            let mut buf = b.borrow_mut();
581
582            *public_id = Some(buf.clone().into());
583
584            buf.clear();
585        }
586    }
587
588    fn finish_doctype_token_system_id(&mut self) {
589        if let Some(Token::Doctype { system_id, .. }) = &mut self.current_token {
590            let b = self.buf.clone();
591            let mut buf = b.borrow_mut();
592
593            *system_id = Some(buf.clone().into());
594
595            buf.clear();
596        }
597    }
598
599    fn emit_doctype_token(&mut self) {
600        if let Some(mut token @ Token::Doctype { .. }) = self.current_token.take() {
601            let b = self.sub_buf.clone();
602            let mut sub_buf = b.borrow_mut();
603
604            match &mut token {
605                Token::Doctype { raw, .. } => {
606                    *raw = Some(Atom::new(sub_buf.clone()));
607                }
608                _ => {
609                    unreachable!();
610                }
611            }
612
613            sub_buf.clear();
614
615            self.emit_token(token);
616        }
617    }
618
619    #[inline(always)]
620    fn create_start_tag_token(&mut self) {
621        self.current_token = Some(Token::StartTag {
622            // Maximum known tag is `feComponentTransfer` (SVG)
623            tag_name: atom!(""),
624            raw_tag_name: None,
625            is_self_closing: false,
626            attributes: Vec::new(),
627        });
628    }
629
630    #[inline(always)]
631    fn create_end_tag_token(&mut self) {
632        self.current_token = Some(Token::EndTag {
633            // Maximum known tag is `feComponentTransfer` (SVG)
634            tag_name: atom!(""),
635            raw_tag_name: None,
636            is_self_closing: false,
637            // In valid HTML code closed tags do not have attributes
638            attributes: Vec::new(),
639        });
640    }
641
642    fn append_to_tag_token_name(&mut self, c: char, raw_c: char) {
643        if let Some(Token::StartTag { .. } | Token::EndTag { .. }) = &mut self.current_token {
644            let b = self.buf.clone();
645            let mut buf = b.borrow_mut();
646            let b = self.sub_buf.clone();
647            let mut sub_buf = b.borrow_mut();
648
649            buf.push(c);
650            sub_buf.push(raw_c);
651        }
652    }
653
654    fn consume_and_append_to_tag_token_name<F>(&mut self, c: char, f: F)
655    where
656        F: Fn(char) -> bool,
657    {
658        let b = self.buf.clone();
659        let mut buf = b.borrow_mut();
660        let b = self.sub_buf.clone();
661        let mut sub_buf = b.borrow_mut();
662
663        buf.push(c.to_ascii_lowercase());
664        sub_buf.push(c);
665
666        let value = self.input.uncons_while(f);
667
668        buf.push_str(&value.to_ascii_lowercase());
669        sub_buf.push_str(value);
670    }
671
672    fn finish_tag_token_name(&mut self) {
673        if let Some(
674            Token::StartTag {
675                tag_name,
676                raw_tag_name,
677                ..
678            }
679            | Token::EndTag {
680                tag_name,
681                raw_tag_name,
682                ..
683            },
684        ) = &mut self.current_token
685        {
686            let b = self.buf.clone();
687            let mut buf = b.borrow_mut();
688            let b = self.sub_buf.clone();
689            let mut sub_buf = b.borrow_mut();
690
691            *tag_name = buf.clone().into();
692            *raw_tag_name = Some(Atom::new(sub_buf.clone()));
693
694            buf.clear();
695            sub_buf.clear();
696        }
697    }
698
699    fn start_new_attribute_token(&mut self) {
700        if let Some(Token::StartTag { attributes, .. } | Token::EndTag { attributes, .. }) =
701            &mut self.current_token
702        {
703            attributes.push(AttributeToken {
704                span: Default::default(),
705                name: atom!(""),
706                raw_name: None,
707                value: None,
708                raw_value: None,
709            });
710
711            self.attribute_start_position = Some(self.cur_pos);
712        }
713    }
714
715    fn append_to_attribute_token_name(&mut self, c: char, raw_c: char) {
716        let b = self.buf.clone();
717        let mut buf = b.borrow_mut();
718        let b = self.sub_buf.clone();
719        let mut sub_buf = b.borrow_mut();
720
721        buf.push(c);
722        sub_buf.push(raw_c);
723    }
724
725    fn consume_and_append_to_attribute_token_name<F>(&mut self, c: char, f: F)
726    where
727        F: FnMut(char) -> bool,
728    {
729        let b = self.buf.clone();
730        let mut buf = b.borrow_mut();
731        let b = self.sub_buf.clone();
732        let mut sub_buf = b.borrow_mut();
733
734        buf.push(c.to_ascii_lowercase());
735        sub_buf.push(c);
736
737        let value = self.input.uncons_while(f);
738
739        buf.push_str(&value.to_ascii_lowercase());
740        sub_buf.push_str(value);
741    }
742
743    fn consume_and_append_to_attribute_token_name_and_temp_buf<F>(&mut self, c: char, f: F)
744    where
745        F: FnMut(char) -> bool,
746    {
747        let b = self.buf.clone();
748        let mut buf = b.borrow_mut();
749        let b = self.sub_buf.clone();
750        let mut sub_buf = b.borrow_mut();
751
752        buf.push(c.to_ascii_lowercase());
753        sub_buf.push(c);
754
755        self.temporary_buffer.push(c);
756
757        let value = self.input.uncons_while(f);
758
759        buf.push_str(&value.to_ascii_lowercase());
760        sub_buf.push_str(value);
761
762        self.temporary_buffer.push_str(value);
763    }
764
765    fn finish_attribute_token_name(&mut self) {
766        if let Some(attribute_start_position) = self.attribute_start_position {
767            if let Some(
768                Token::StartTag {
769                    ref mut attributes, ..
770                }
771                | Token::EndTag {
772                    ref mut attributes, ..
773                },
774            ) = self.current_token
775            {
776                if let Some(last) = attributes.last_mut() {
777                    let b = self.buf.clone();
778                    let mut buf = b.borrow_mut();
779                    let b = self.sub_buf.clone();
780                    let mut sub_buf = b.borrow_mut();
781
782                    let name: Atom = buf.clone().into();
783                    let raw_name = Atom::new(sub_buf.clone());
784                    let span = Span::new(attribute_start_position, self.cur_pos);
785
786                    if self.attributes_validator.contains(&name) {
787                        self.errors
788                            .push(Error::new(span, ErrorKind::DuplicateAttribute));
789                    }
790
791                    self.attributes_validator.insert(name.clone());
792
793                    last.name = name;
794                    last.raw_name = Some(raw_name);
795
796                    buf.clear();
797                    sub_buf.clear();
798
799                    last.span = span;
800                }
801            }
802        }
803    }
804
805    fn append_to_attribute_token_value(&mut self, c: Option<char>, raw_c: Option<char>) {
806        let b = self.buf.clone();
807        let mut buf = b.borrow_mut();
808        let b = self.sub_buf.clone();
809        let mut sub_buf = b.borrow_mut();
810
811        let is_cr = raw_c == Some('\r');
812
813        if is_cr {
814            buf.push('\n');
815            sub_buf.push('\r');
816
817            if self.input.cur() == Some('\n') {
818                unsafe {
819                    // Safety: cur() is Some('\n')
820                    self.input.bump();
821                }
822
823                sub_buf.push('\n');
824            }
825        } else {
826            if let Some(c) = c {
827                buf.push(c);
828            }
829
830            if let Some(raw_c) = raw_c {
831                sub_buf.push(raw_c);
832            }
833        }
834    }
835
836    fn consume_and_append_to_attribute_token_value<F>(&mut self, c: char, f: F)
837    where
838        F: FnMut(char) -> bool,
839    {
840        let b = self.buf.clone();
841        let mut buf = b.borrow_mut();
842        let b = self.sub_buf.clone();
843        let mut sub_buf = b.borrow_mut();
844
845        let is_cr = c == '\r';
846
847        if is_cr {
848            buf.push('\n');
849            sub_buf.push(c);
850
851            if self.input.cur() == Some('\n') {
852                unsafe {
853                    // Safety: cur() is Some('\n')
854                    self.input.bump();
855                }
856
857                sub_buf.push('\n');
858            }
859        } else {
860            buf.push(c);
861            sub_buf.push(c);
862        }
863
864        let value = self.input.uncons_while(f);
865
866        buf.push_str(value);
867        sub_buf.push_str(value);
868    }
869
870    fn finish_attribute_token_value(&mut self) {
871        if let Some(attribute_start_position) = self.attribute_start_position {
872            if let Some(
873                Token::StartTag {
874                    ref mut attributes, ..
875                }
876                | Token::EndTag {
877                    ref mut attributes, ..
878                },
879            ) = self.current_token
880            {
881                if let Some(last) = attributes.last_mut() {
882                    let b = self.buf.clone();
883                    let mut buf = b.borrow_mut();
884                    let b = self.sub_buf.clone();
885                    let mut sub_buf = b.borrow_mut();
886
887                    if !buf.is_empty() {
888                        last.value = Some(buf.clone().into());
889                    } else if !sub_buf.is_empty() {
890                        last.value = Some("".into());
891                    }
892
893                    buf.clear();
894
895                    if !sub_buf.is_empty() {
896                        last.raw_value = Some(Atom::new(sub_buf.clone()));
897
898                        sub_buf.clear();
899                    }
900
901                    last.span = Span::new(attribute_start_position, self.cur_pos);
902                }
903            }
904        }
905    }
906
907    fn emit_tag_token(&mut self) {
908        if let Some(current_tag_token) = self.current_token.take() {
909            self.attributes_validator.clear();
910
911            match &current_tag_token {
912                Token::StartTag { ref tag_name, .. } => {
913                    self.last_start_tag_name = Some(tag_name.clone());
914                }
915                Token::EndTag {
916                    ref is_self_closing,
917                    ref attributes,
918                    ..
919                } => {
920                    if !attributes.is_empty() {
921                        self.emit_error(ErrorKind::EndTagWithAttributes);
922                    }
923
924                    if *is_self_closing {
925                        self.emit_error(ErrorKind::EndTagWithTrailingSolidus);
926                    }
927                }
928                _ => {
929                    unreachable!();
930                }
931            }
932
933            self.emit_token(current_tag_token);
934        }
935    }
936
937    #[inline(always)]
938    fn create_comment_token(&mut self, raw_start: &str) {
939        let b = self.sub_buf.clone();
940        let mut sub_buf = b.borrow_mut();
941
942        sub_buf.push_str(raw_start);
943    }
944
945    #[inline(always)]
946    fn create_comment_token_with_cdata(&mut self) {
947        let b = self.buf.clone();
948        let mut buf = b.borrow_mut();
949        let b = self.sub_buf.clone();
950        let mut sub_buf = b.borrow_mut();
951
952        buf.push_str("[CDATA[");
953        sub_buf.push_str("<!");
954        sub_buf.push_str("[CDATA[");
955    }
956
957    fn append_to_comment_token(&mut self, c: char, raw_c: char) {
958        let b = self.buf.clone();
959        let mut buf = b.borrow_mut();
960        let b = self.sub_buf.clone();
961        let mut sub_buf = b.borrow_mut();
962
963        buf.push(c);
964        sub_buf.push(raw_c);
965    }
966
967    fn consume_and_append_to_comment_token<F>(&mut self, c: char, f: F)
968    where
969        F: Fn(char) -> bool,
970    {
971        let b = self.buf.clone();
972        let mut buf = b.borrow_mut();
973        let b = self.sub_buf.clone();
974        let mut sub_buf = b.borrow_mut();
975
976        let is_cr = c == '\r';
977
978        if is_cr {
979            buf.push('\n');
980            sub_buf.push(c);
981
982            if self.input.cur() == Some('\n') {
983                unsafe {
984                    // Safety: cur() is Some('\n')
985                    self.input.bump();
986                }
987
988                sub_buf.push('\n');
989            }
990        } else {
991            buf.push(c);
992            sub_buf.push(c);
993        }
994
995        let value = self.input.uncons_while(f);
996
997        buf.push_str(value);
998        sub_buf.push_str(value);
999    }
1000
1001    fn emit_comment_token(&mut self, raw_end: Option<&str>) {
1002        let b = self.buf.clone();
1003        let mut buf = b.borrow_mut();
1004        let b = self.sub_buf.clone();
1005        let mut sub_buf = b.borrow_mut();
1006
1007        if let Some(raw_end) = raw_end {
1008            sub_buf.push_str(raw_end);
1009        }
1010
1011        self.emit_token(Token::Comment {
1012            data: buf.clone().into(),
1013            raw: Some(Atom::new(sub_buf.clone())),
1014        });
1015
1016        buf.clear();
1017        sub_buf.clear();
1018    }
1019
1020    #[inline(always)]
1021    fn emit_character_token(&mut self, value: char) {
1022        self.emit_token(Token::Character {
1023            value,
1024            raw: Some(Raw::Same),
1025        });
1026    }
1027
1028    #[inline(always)]
1029    fn emit_character_token_with_raw(&mut self, c: char, raw_c: char) {
1030        let b = self.buf.clone();
1031        let mut buf = b.borrow_mut();
1032
1033        buf.push(raw_c);
1034
1035        self.emit_token(Token::Character {
1036            value: c,
1037            raw: Some(Raw::Atom(Atom::new(&**buf))),
1038        });
1039
1040        buf.clear();
1041    }
1042
1043    fn handle_raw_and_emit_character_token(&mut self, c: char) {
1044        let is_cr = c == '\r';
1045
1046        if is_cr {
1047            let b = self.buf.clone();
1048            let mut buf = b.borrow_mut();
1049
1050            buf.push(c);
1051
1052            if self.input.cur() == Some('\n') {
1053                unsafe {
1054                    // Safety: cur() is Some('\n')
1055                    self.input.bump();
1056                }
1057                buf.push('\n');
1058            }
1059
1060            self.emit_token(Token::Character {
1061                value: '\n',
1062                raw: Some(Raw::Atom(Atom::new(&**buf))),
1063            });
1064
1065            buf.clear();
1066        } else {
1067            self.emit_token(Token::Character {
1068                value: c,
1069                raw: Some(Raw::Same),
1070            });
1071        }
1072    }
1073
1074    fn read_token_and_span(&mut self) -> LexResult<TokenAndSpan> {
1075        if self.finished {
1076            return Err(ErrorKind::Eof);
1077        } else {
1078            while self.pending_tokens.is_empty() {
1079                self.run()?;
1080            }
1081        }
1082
1083        let token_and_span = self.pending_tokens.pop_front().unwrap();
1084
1085        match token_and_span.token {
1086            Token::Eof => {
1087                self.finished = true;
1088
1089                return Err(ErrorKind::Eof);
1090            }
1091            _ => {
1092                return Ok(token_and_span);
1093            }
1094        }
1095    }
1096
1097    fn run(&mut self) -> LexResult<()> {
1098        match self.state {
1099            // https://html.spec.whatwg.org/multipage/parsing.html#data-state
1100            State::Data => {
1101                // Consume the next input character:
1102                match self.consume_next_char() {
1103                    // U+0026 AMPERSAND (&)
1104                    // Set the return state to the data state. Switch to the character reference
1105                    // state.
1106                    Some('&') => {
1107                        self.return_state = State::Data;
1108                        self.state = State::CharacterReference;
1109                    }
1110                    // U+003C LESS-THAN SIGN (<)
1111                    // Switch to the tag open state.
1112                    Some('<') => {
1113                        self.state = State::TagOpen;
1114                    }
1115                    // U+0000 NULL
1116                    // This is an unexpected-null-character parse error. Emit the current input
1117                    // character as a character token.
1118                    Some(c @ '\x00') => {
1119                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1120                        self.emit_character_token(c);
1121                    }
1122                    // EOF
1123                    // Emit an end-of-file token.
1124                    None => {
1125                        self.emit_token(Token::Eof);
1126
1127                        return Ok(());
1128                    }
1129                    // Anything else
1130                    // Emit the current input character as a character token.
1131                    Some(c) => {
1132                        self.validate_input_stream_character(c);
1133                        self.handle_raw_and_emit_character_token(c);
1134                    }
1135                }
1136            }
1137            // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1138            State::Rcdata => {
1139                // Consume the next input character:
1140                match self.consume_next_char() {
1141                    // U+0026 AMPERSAND (&)
1142                    // Set the return state to the RCDATA state. Switch to the character
1143                    // reference state.
1144                    Some('&') => {
1145                        self.return_state = State::Rcdata;
1146                        self.state = State::CharacterReference;
1147                    }
1148                    // U+003C LESS-THAN SIGN (<)
1149                    // Switch to the RCDATA less-than sign state.
1150                    Some('<') => {
1151                        self.state = State::RcdataLessThanSign;
1152                    }
1153                    // U+0000 NULL
1154                    // This is an unexpected-null-character parse error. Emit a U+FFFD
1155                    // REPLACEMENT CHARACTER character token.
1156                    Some(c @ '\x00') => {
1157                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1158                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1159                    }
1160                    // EOF
1161                    // Emit an end-of-file token.
1162                    None => {
1163                        self.emit_token(Token::Eof);
1164
1165                        return Ok(());
1166                    }
1167                    // Anything else
1168                    // Emit the current input character as a character token.
1169                    Some(c) => {
1170                        self.validate_input_stream_character(c);
1171                        self.handle_raw_and_emit_character_token(c);
1172                    }
1173                }
1174            }
1175            // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1176            State::Rawtext => {
1177                // Consume the next input character:
1178                match self.consume_next_char() {
1179                    // U+003C LESS-THAN SIGN (<)
1180                    // Switch to the RAWTEXT less-than sign state.
1181                    Some('<') => self.state = State::RawtextLessThanSign,
1182                    // U+0000 NULL
1183                    // This is an unexpected-null-character parse error. Emit a U+FFFD
1184                    // REPLACEMENT CHARACTER character token.
1185                    Some(c @ '\x00') => {
1186                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1187                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1188                    }
1189                    // EOF
1190                    // Emit an end-of-file token.
1191                    None => {
1192                        self.emit_token(Token::Eof);
1193
1194                        return Ok(());
1195                    }
1196                    // Anything else
1197                    // Emit the current input character as a character token.
1198                    Some(c) => {
1199                        self.validate_input_stream_character(c);
1200                        self.handle_raw_and_emit_character_token(c);
1201                    }
1202                }
1203            }
1204            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1205            State::ScriptData => {
1206                // Consume the next input character:
1207                match self.consume_next_char() {
1208                    // U+003C LESS-THAN SIGN (<)
1209                    // Switch to the script data less-than sign state.
1210                    Some('<') => self.state = State::ScriptDataLessThanSign,
1211                    // U+0000 NULL
1212                    // This is an unexpected-null-character parse error. Emit a U+FFFD
1213                    // REPLACEMENT CHARACTER character token.
1214                    Some(c @ '\x00') => {
1215                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1216                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1217                    }
1218                    // EOF
1219                    // Emit an end-of-file token.
1220                    None => {
1221                        self.emit_token(Token::Eof);
1222
1223                        return Ok(());
1224                    }
1225                    // Anything else
1226                    // Emit the current input character as a character token.
1227                    Some(c) => {
1228                        self.validate_input_stream_character(c);
1229                        self.handle_raw_and_emit_character_token(c);
1230                    }
1231                }
1232            }
1233            // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
1234            State::PlainText => {
1235                // Consume the next input character:
1236                match self.consume_next_char() {
1237                    // U+0000 NULL
1238                    // This is an unexpected-null-character parse error. Emit a U+FFFD
1239                    // REPLACEMENT CHARACTER character token.
1240                    Some(c @ '\x00') => {
1241                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1242                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1243                    }
1244                    // EOF
1245                    // Emit an end-of-file token.
1246                    None => {
1247                        self.emit_token(Token::Eof);
1248
1249                        return Ok(());
1250                    }
1251                    // Anything else
1252                    // Emit the current input character as a character token.
1253                    Some(c) => {
1254                        self.validate_input_stream_character(c);
1255                        self.handle_raw_and_emit_character_token(c);
1256                    }
1257                }
1258            }
1259            // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1260            State::TagOpen => {
1261                // Consume the next input character:
1262                match self.consume_next_char() {
1263                    // U+002F SOLIDUS (/)
1264                    // Switch to the end tag open state.
1265                    Some('/') => {
1266                        self.state = State::EndTagOpen;
1267                    }
1268                    // U+0021 EXCLAMATION MARK (!)
1269                    // Switch to the markup declaration open state.
1270                    Some('!') => {
1271                        self.state = State::MarkupDeclarationOpen;
1272                    }
1273                    // ASCII alpha
1274                    // Create a new start tag token, set its tag name to the empty string.
1275                    // Reconsume in the tag name state.
1276                    Some(c) if is_ascii_alpha(c) => {
1277                        self.create_start_tag_token();
1278                        self.reconsume_in_state(State::TagName);
1279                    }
1280                    // U+003F QUESTION MARK (?)
1281                    // This is an unexpected-question-mark-instead-of-tag-name parse error.
1282                    // Create a comment token whose data is the empty string. Reconsume in the
1283                    // bogus comment state.
1284                    Some('?') => {
1285                        self.emit_error(ErrorKind::UnexpectedQuestionMarkInsteadOfTagName);
1286                        self.create_comment_token("<");
1287                        self.reconsume_in_state(State::BogusComment);
1288                    }
1289                    // EOF
1290                    // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1291                    // character token and an end-of-file token.
1292                    None => {
1293                        self.emit_error(ErrorKind::EofBeforeTagName);
1294                        self.emit_character_token('<');
1295                        self.emit_token(Token::Eof);
1296
1297                        return Ok(());
1298                    }
1299                    // Anything else
1300                    // This is an invalid-first-character-of-tag-name parse error. Emit a U+003C
1301                    // LESS-THAN SIGN character token. Reconsume in the data state.
1302                    _ => {
1303                        self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1304                        self.emit_character_token('<');
1305                        self.reconsume_in_state(State::Data);
1306                    }
1307                }
1308            }
1309            // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1310            State::EndTagOpen => {
1311                // Consume the next input character:
1312                match self.consume_next_char() {
1313                    // ASCII alpha
1314                    // Create a new end tag token, set its tag name to the empty string.
1315                    // Reconsume in the tag name state.
1316                    Some(c) if is_ascii_alpha(c) => {
1317                        self.create_end_tag_token();
1318                        self.reconsume_in_state(State::TagName);
1319                    }
1320                    // U+003E GREATER-THAN SIGN (>)
1321                    // This is a missing-end-tag-name parse error. Switch to the data state.
1322                    Some('>') => {
1323                        self.emit_error(ErrorKind::MissingEndTagName);
1324                        self.state = State::Data;
1325                    }
1326                    // EOF
1327                    // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1328                    // character token, a U+002F SOLIDUS character token and an end-of-file
1329                    // token.
1330                    None => {
1331                        self.emit_error(ErrorKind::EofBeforeTagName);
1332                        self.emit_character_token('<');
1333                        self.emit_character_token('/');
1334                        self.emit_token(Token::Eof);
1335
1336                        return Ok(());
1337                    }
1338                    // Anything else
1339                    // This is an invalid-first-character-of-tag-name parse error. Create a
1340                    // comment token whose data is the empty string. Reconsume in the bogus
1341                    // comment state.
1342                    _ => {
1343                        self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1344                        self.create_comment_token("</");
1345                        self.reconsume_in_state(State::BogusComment);
1346                    }
1347                }
1348            }
1349            // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
1350            State::TagName => {
1351                // Consume the next input character:
1352                match self.consume_next_char() {
1353                    // U+0009 CHARACTER TABULATION (tab)
1354                    // U+000A LINE FEED (LF)
1355                    // U+000C FORM FEED (FF)
1356                    // U+0020 SPACE
1357                    // Switch to the before attribute name state.
1358                    Some(c) if is_spacy(c) => {
1359                        self.finish_tag_token_name();
1360                        self.skip_whitespaces(c);
1361                        self.state = State::BeforeAttributeName;
1362                    }
1363                    // U+002F SOLIDUS (/)
1364                    // Switch to the self-closing start tag state.
1365                    Some('/') => {
1366                        self.finish_tag_token_name();
1367                        self.state = State::SelfClosingStartTag;
1368                    }
1369                    // U+003E GREATER-THAN SIGN (>)
1370                    // Switch to the data state. Emit the current tag token.
1371                    Some('>') => {
1372                        self.finish_tag_token_name();
1373                        self.state = State::Data;
1374                        self.emit_tag_token();
1375                    }
1376                    // ASCII upper alpha
1377                    // Append the lowercase version of the current input character (add 0x0020
1378                    // to the character's code point) to the current tag token's tag name.
1379                    Some(c) if is_ascii_upper_alpha(c) => {
1380                        self.consume_and_append_to_tag_token_name(c, is_ascii_upper_alpha);
1381                    }
1382                    // U+0000 NULL
1383                    // This is an unexpected-null-character parse error. Append a U+FFFD
1384                    // REPLACEMENT CHARACTER character to the current tag token's tag name.
1385                    Some(c @ '\x00') => {
1386                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1387                        self.append_to_tag_token_name(REPLACEMENT_CHARACTER, c);
1388                    }
1389                    // EOF
1390                    // This is an eof-in-tag parse error. Emit an end-of-file token.
1391                    None => {
1392                        self.finish_tag_token_name();
1393                        self.emit_error(ErrorKind::EofInTag);
1394                        self.emit_token(Token::Eof);
1395
1396                        return Ok(());
1397                    }
1398                    // Anything else
1399                    // Append the current input character to the current tag token's tag name.
1400                    Some(c) => {
1401                        self.validate_input_stream_character(c);
1402                        self.consume_and_append_to_tag_token_name(c, |c| {
1403                            if !is_allowed_character(c) {
1404                                return false;
1405                            }
1406
1407                            // List of characters from above to stop consumption and a certain
1408                            // branch took control
1409                            !is_spacy(c)
1410                                && !matches!(c, '/' | '>' | '\x00')
1411                                && !is_ascii_upper_alpha(c)
1412                        });
1413                    }
1414                }
1415            }
1416            // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
1417            State::RcdataLessThanSign => {
1418                // Consume the next input character:
1419                match self.consume_next_char() {
1420                    // U+002F SOLIDUS (/)
1421                    // Set the temporary buffer to the empty string. Switch to the RCDATA end
1422                    // tag open state.
1423                    Some('/') => {
1424                        self.temporary_buffer.clear();
1425                        self.state = State::RcdataEndTagOpen;
1426                    }
1427                    // Anything else
1428                    // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RCDATA
1429                    // state.
1430                    _ => {
1431                        self.emit_character_token('<');
1432                        self.reconsume_in_state(State::Rcdata);
1433                    }
1434                }
1435            }
1436            // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1437            State::RcdataEndTagOpen => {
1438                // Consume the next input character:
1439                match self.consume_next_char() {
1440                    // ASCII alpha
1441                    // Create a new end tag token, set its tag name to the empty string.
1442                    // Reconsume in the RCDATA end tag name state.
1443                    Some(c) if is_ascii_alpha(c) => {
1444                        self.create_end_tag_token();
1445                        self.reconsume_in_state(State::RcdataEndTagName);
1446                    }
1447                    // Anything else
1448                    // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
1449                    // character token. Reconsume in the RCDATA state.
1450                    _ => {
1451                        self.emit_character_token('<');
1452                        self.emit_character_token('/');
1453                        self.reconsume_in_state(State::Rcdata);
1454                    }
1455                }
1456            }
1457            // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
1458            State::RcdataEndTagName => {
1459                let anything_else = |lexer: &mut Lexer<'a, I>| {
1460                    lexer.finish_tag_token_name();
1461                    lexer.emit_character_token('<');
1462                    lexer.emit_character_token('/');
1463                    lexer.emit_temporary_buffer_as_character_tokens();
1464                    lexer.reconsume_in_state(State::Rcdata);
1465                };
1466
1467                // Consume the next input character:
1468                match self.consume_next_char() {
1469                    // U+0009 CHARACTER TABULATION (tab)
1470                    // U+000A LINE FEED (LF)
1471                    // U+000C FORM FEED (FF)
1472                    // U+0020 SPACE
1473                    // If the current end tag token is an appropriate end tag token, then switch
1474                    // to the before attribute name state. Otherwise, treat it as per the
1475                    // "anything else" entry below.
1476                    Some(c) if is_spacy(c) => {
1477                        self.skip_whitespaces(c);
1478
1479                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1480                            self.finish_tag_token_name();
1481                            self.state = State::BeforeAttributeName;
1482                        } else {
1483                            anything_else(self);
1484                        }
1485                    }
1486                    // U+002F SOLIDUS (/)
1487                    // If the current end tag token is an appropriate end tag token, then switch
1488                    // to the self-closing start tag state. Otherwise, treat it as per the
1489                    // "anything else" entry below.
1490                    Some('/') => {
1491                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1492                            self.finish_tag_token_name();
1493                            self.state = State::SelfClosingStartTag;
1494                        } else {
1495                            anything_else(self);
1496                        }
1497                    }
1498                    // U+003E GREATER-THAN SIGN (>)
1499                    // If the current end tag token is an appropriate end tag token, then switch
1500                    // to the data state and emit the current tag token. Otherwise, treat it as
1501                    // per the "anything else" entry below.
1502                    Some('>') => {
1503                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1504                            self.finish_tag_token_name();
1505                            self.state = State::Data;
1506                            self.emit_tag_token();
1507                        } else {
1508                            anything_else(self);
1509                        }
1510                    }
1511                    // ASCII upper alpha
1512                    // Append the lowercase version of the current input character (add 0x0020
1513                    // to the character's code point) to the current tag token's tag name.
1514                    // Append the current input character to the temporary buffer.
1515                    Some(c) if is_ascii_upper_alpha(c) => {
1516                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
1517                            c,
1518                            is_ascii_upper_alpha,
1519                        );
1520                    }
1521                    // ASCII lower alpha
1522                    // Append the current input character to the current tag token's tag name.
1523                    // Append the current input character to the temporary buffer.
1524                    Some(c) if is_ascii_lower_alpha(c) => {
1525                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
1526                            c,
1527                            is_ascii_lower_alpha,
1528                        );
1529                    }
1530                    // Anything else
1531                    // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
1532                    // token, and a character token for each of the characters in the temporary
1533                    // buffer (in the order they were added to the buffer). Reconsume in the
1534                    // RCDATA state.
1535                    _ => {
1536                        anything_else(self);
1537                    }
1538                }
1539            }
1540            // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
1541            State::RawtextLessThanSign => {
1542                // Consume the next input character:
1543                match self.consume_next_char() {
1544                    // U+002F SOLIDUS (/)
1545                    // Set the temporary buffer to the empty string. Switch to the RAWTEXT end
1546                    // tag open state.
1547                    Some('/') => {
1548                        self.temporary_buffer.clear();
1549                        self.state = State::RawtextEndTagOpen;
1550                    }
1551                    // Anything else
1552                    // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RAWTEXT
1553                    // state.
1554                    _ => {
1555                        self.emit_character_token('<');
1556                        self.reconsume_in_state(State::Rawtext);
1557                    }
1558                }
1559            }
1560            // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1561            State::RawtextEndTagOpen => {
1562                // Consume the next input character:
1563                match self.consume_next_char() {
1564                    // ASCII alpha
1565                    // Create a new end tag token, set its tag name to the empty string.
1566                    // Reconsume in the RAWTEXT end tag name state.
1567                    Some(c) if is_ascii_alpha(c) => {
1568                        self.create_end_tag_token();
1569                        self.reconsume_in_state(State::RawtextEndTagName);
1570                    }
1571                    // Anything else
1572                    // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
1573                    // character token. Reconsume in the RAWTEXT state.
1574                    _ => {
1575                        self.emit_character_token('<');
1576                        self.emit_character_token('/');
1577                        self.reconsume_in_state(State::Rawtext);
1578                    }
1579                }
1580            }
1581            // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
1582            State::RawtextEndTagName => {
1583                let anything_else = |lexer: &mut Lexer<'a, I>| {
1584                    lexer.finish_tag_token_name();
1585                    lexer.emit_character_token('<');
1586                    lexer.emit_character_token('/');
1587                    lexer.emit_temporary_buffer_as_character_tokens();
1588                    lexer.reconsume_in_state(State::Rawtext);
1589                };
1590
1591                // Consume the next input character:
1592                match self.consume_next_char() {
1593                    // U+0009 CHARACTER TABULATION (tab)
1594                    // U+000A LINE FEED (LF)
1595                    // U+000C FORM FEED (FF)
1596                    // U+0020 SPACE
1597                    // If the current end tag token is an appropriate end tag token, then switch
1598                    // to the before attribute name state. Otherwise, treat it as per the
1599                    // "anything else" entry below.
1600                    Some(c) if is_spacy(c) => {
1601                        self.skip_whitespaces(c);
1602
1603                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1604                            self.finish_tag_token_name();
1605                            self.state = State::BeforeAttributeName;
1606                        } else {
1607                            anything_else(self);
1608                        }
1609                    }
1610                    // U+002F SOLIDUS (/)
1611                    // If the current end tag token is an appropriate end tag token, then switch
1612                    // to the self-closing start tag state. Otherwise, treat it as per the
1613                    // "anything else" entry below.
1614                    Some('/') => {
1615                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1616                            self.finish_tag_token_name();
1617                            self.state = State::SelfClosingStartTag;
1618                        } else {
1619                            anything_else(self);
1620                        }
1621                    }
1622                    // U+003E GREATER-THAN SIGN (>)
1623                    // If the current end tag token is an appropriate end tag token, then switch
1624                    // to the data state and emit the current tag token. Otherwise, treat it as
1625                    // per the "anything else" entry below.
1626                    Some('>') => {
1627                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1628                            self.finish_tag_token_name();
1629                            self.state = State::Data;
1630                            self.emit_tag_token();
1631                        } else {
1632                            anything_else(self);
1633                        }
1634                    }
1635                    // ASCII upper alpha
1636                    // Append the lowercase version of the current input character (add 0x0020
1637                    // to the character's code point) to the current tag token's tag name.
1638                    // Append the current input character to the temporary buffer.
1639                    Some(c) if is_ascii_upper_alpha(c) => {
1640                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
1641                            c,
1642                            is_ascii_upper_alpha,
1643                        );
1644                    }
1645                    // ASCII lower alpha
1646                    // Append the current input character to the current tag token's tag name.
1647                    // Append the current input character to the temporary buffer.
1648                    Some(c) if is_ascii_lower_alpha(c) => {
1649                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
1650                            c,
1651                            is_ascii_lower_alpha,
1652                        );
1653                    }
1654                    // Anything else
1655                    // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
1656                    // token, and a character token for each of the characters in the temporary
1657                    // buffer (in the order they were added to the buffer). Reconsume in the
1658                    // RAWTEXT state.
1659                    _ => {
1660                        anything_else(self);
1661                    }
1662                }
1663            }
1664            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
                // Decides what follows a `<` in script data. The `!` and fallback branches
                // emit the pending `<` themselves; the `/` branch emits nothing and leaves
                // `<` and `/` to the fallback paths of the end-tag states it leads to.
1665            State::ScriptDataLessThanSign => {
1666                // Consume the next input character:
1667                match self.consume_next_char() {
1668                    // U+002F SOLIDUS (/)
1669                    // Set the temporary buffer to the empty string. Switch to the script data
1670                    // end tag open state.
1671                    Some('/') => {
                            // Start from an empty buffer: the end-tag-name state replays its
                            // contents as character tokens if the candidate tag is rejected.
1672                        self.temporary_buffer.clear();
1673                        self.state = State::ScriptDataEndTagOpen;
1674                    }
1675                    // U+0021 EXCLAMATION MARK (!)
1676                    // Switch to the script data escape start state. Emit a U+003C LESS-THAN
1677                    // SIGN character token and a U+0021 EXCLAMATION MARK character token.
1678                    Some('!') => {
1679                        self.state = State::ScriptDataEscapeStart;
1680                        self.emit_character_token('<');
1681                        self.emit_character_token('!');
1682                    }
1683                    // Anything else
1684                    // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script
1685                    // data state.
1686                    _ => {
                            // Covers EOF (`None`) as well; the character is not treated as
                            // consumed here but handed back to the script data state.
1687                        self.emit_character_token('<');
1688                        self.reconsume_in_state(State::ScriptData);
1689                    }
1690                }
1691            }
1692            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
                // Runs after `</` in script data. An ASCII letter starts a candidate end
                // tag; anything else (including EOF) turns the `</` back into plain text.
1693            State::ScriptDataEndTagOpen => {
1694                // Consume the next input character:
1695                match self.consume_next_char() {
1696                    // ASCII alpha
1697                    // Create a new end tag token, set its tag name to the empty string.
1698                    // Reconsume in the script data end tag name state.
1699                    Some(c) if is_ascii_alpha(c) => {
                            // The letter is only used as a trigger: it is reconsumed so that
                            // the end-tag-name state is the one that records it.
1700                        self.create_end_tag_token();
1701                        self.reconsume_in_state(State::ScriptDataEndTagName);
1702                    }
1703                    // Anything else
1704                    // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
1705                    // character token. Reconsume in the script data state.
1706                    _ => {
1707                        self.emit_character_token('<');
1708                        self.emit_character_token('/');
1709                        self.reconsume_in_state(State::ScriptData);
1710                    }
1711                }
1712            }
1713            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
                // Accumulates the name of a candidate end tag in script data. The tag is
                // accepted only if `current_end_tag_token_is_an_appropriate_end_tag_token()`
                // holds when a delimiter (whitespace, `/`, `>`) is reached; in every other
                // case the consumed text is replayed as character tokens by `anything_else`.
1714            State::ScriptDataEndTagName => {
                    // Shared reject path. It receives the lexer as an argument rather than
                    // capturing `self`, so `self` stays freely usable in the match below.
1715                let anything_else = |lexer: &mut Lexer<'a, I>| {
                        // NOTE(review): the tag name is finished even though the candidate is
                        // being discarded; the helper's effect is not visible here — confirm
                        // it is needed on this path.
1716                    lexer.finish_tag_token_name();
1717                    lexer.emit_character_token('<');
1718                    lexer.emit_character_token('/');
1719                    lexer.emit_temporary_buffer_as_character_tokens();
1720                    lexer.reconsume_in_state(State::ScriptData);
1721                };
1722
1723                // Consume the next input character:
1724                match self.consume_next_char() {
1725                    // U+0009 CHARACTER TABULATION (tab)
1726                    // U+000A LINE FEED (LF)
1727                    // U+000C FORM FEED (FF)
1728                    // U+0020 SPACE
1729                    // If the current end tag token is an appropriate end tag token, then switch
1730                    // to the before attribute name state. Otherwise, treat it as per the
1731                    // "anything else" entry below.
1732                    Some(c) if is_spacy(c) => {
                            // NOTE(review): not part of the spec step quoted above; presumably
                            // consumes the rest of the whitespace run — confirm.
1733                        self.skip_whitespaces(c);
1734
1735                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1736                            self.finish_tag_token_name();
1737                            self.state = State::BeforeAttributeName;
1738                        } else {
1739                            anything_else(self);
1740                        }
1741                    }
1742                    // U+002F SOLIDUS (/)
1743                    // If the current end tag token is an appropriate end tag token, then switch
1744                    // to the self-closing start tag state. Otherwise, treat it as per the
1745                    // "anything else" entry below.
1746                    Some('/') => {
1747                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1748                            self.finish_tag_token_name();
1749                            self.state = State::SelfClosingStartTag;
1750                        } else {
1751                            anything_else(self);
1752                        }
1753                    }
1754                    // U+003E GREATER-THAN SIGN (>)
1755                    // If the current end tag token is an appropriate end tag token, then switch
1756                    // to the data state and emit the current tag token. Otherwise, treat it as
1757                    // per the "anything else" entry below.
1758                    Some('>') => {
1759                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1760                            self.finish_tag_token_name();
1761                            self.state = State::Data;
1762                            self.emit_tag_token();
1763                        } else {
1764                            anything_else(self);
1765                        }
1766                    }
1767                    // ASCII upper alpha
1768                    // Append the lowercase version of the current input character (add 0x0020
1769                    // to the character's code point) to the current tag token's tag name.
1770                    // Append the current input character to the temporary buffer.
1771                    Some(c) if is_ascii_upper_alpha(c) => {
                            // NOTE(review): the helper is named after the *attribute* token name
                            // although this step targets the tag name; lowercasing, if any,
                            // happens inside the helper — confirm it writes to the intended buffer.
1772                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
1773                            c,
1774                            is_ascii_upper_alpha,
1775                        );
1776                    }
1777                    // ASCII lower alpha
1778                    // Append the current input character to the current tag token's tag name.
1779                    // Append the current input character to the temporary buffer.
1780                    Some(c) if is_ascii_lower_alpha(c) => {
                            // Same helper as the upper-alpha branch; only the predicate differs.
1781                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
1782                            c,
1783                            is_ascii_lower_alpha,
1784                        );
1785                    }
1786                    // Anything else
1787                    // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
1788                    // token, and a character token for each of the characters in the temporary
1789                    // buffer (in the order they were added to the buffer). Reconsume in the
1790                    // script data state.
1791                    _ => {
1792                        anything_else(self);
1793                    }
1794                }
1795            }
1796            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
                // Entered from the script data less-than sign state once `<` and `!` have
                // been emitted. A `-` continues towards `<!--`; anything else (including
                // EOF) is handed back to the script data state.
1797            State::ScriptDataEscapeStart => {
1798                // Consume the next input character:
1799                match self.consume_next_char() {
1800                    // U+002D HYPHEN-MINUS (-)
1801                    // Switch to the script data escape start dash state. Emit a U+002D
1802                    // HYPHEN-MINUS character token.
                        // `c` can only be `-` here; the binding just avoids repeating the literal.
1803                    Some(c @ '-') => {
1804                        self.state = State::ScriptDataEscapeStartDash;
1805                        self.emit_character_token(c);
1806                    }
1807                    // Anything else
1808                    // Reconsume in the script data state.
1809                    _ => {
1810                        self.reconsume_in_state(State::ScriptData);
1811                    }
1812                }
1813            }
1814            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
                // Entered after the first `-` following `<!`. A second `-` completes `<!--`
                // and moves straight to the escaped dash dash state; anything else
                // (including EOF) is handed back to the script data state.
1815            State::ScriptDataEscapeStartDash => {
1816                // Consume the next input character:
1817                match self.consume_next_char() {
1818                    // U+002D HYPHEN-MINUS (-)
1819                    // Switch to the script data escaped dash dash state. Emit a U+002D
1820                    // HYPHEN-MINUS character token.
1821                    Some(c @ '-') => {
1822                        self.state = State::ScriptDataEscapedDashDash;
1823                        self.emit_character_token(c);
1824                    }
1825                    // Anything else
1826                    // Reconsume in the script data state.
1827                    _ => {
1828                        self.reconsume_in_state(State::ScriptData);
1829                    }
1830                }
1831            }
1832            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
                // Steady state for escaped script text: ordinary characters are emitted
                // one by one, while `-`, `<`, NUL and EOF get dedicated handling.
1833            State::ScriptDataEscaped => {
1834                // Consume the next input character:
1835                match self.consume_next_char() {
1836                    // U+002D HYPHEN-MINUS (-)
1837                    // Switch to the script data escaped dash state. Emit a U+002D HYPHEN-MINUS
1838                    // character token.
1839                    Some(c @ '-') => {
1840                        self.state = State::ScriptDataEscapedDash;
1841                        self.emit_character_token(c);
1842                    }
1843                    // U+003C LESS-THAN SIGN (<)
1844                    // Switch to the script data escaped less-than sign state.
                        // Nothing is emitted here; the `<` is emitted by the states that follow.
1845                    Some('<') => {
1846                        self.state = State::ScriptDataEscapedLessThanSign;
1847                    }
1848                    // U+0000 NULL
1849                    // This is an unexpected-null-character parse error. Emit a U+FFFD
1850                    // REPLACEMENT CHARACTER character token.
1851                    Some(c @ '\x00') => {
1852                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
                            // The original NUL is passed along with the replacement —
                            // presumably kept as the token's raw text; confirm.
1853                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1854                    }
1855                    // EOF
1856                    // This is an eof-in-script-html-comment-like-text parse error. Emit an
1857                    // end-of-file token.
1858                    None => {
1859                        self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
1860                        self.emit_token(Token::Eof);
1861
                            // Leave the enclosing function: nothing remains to tokenize.
1862                        return Ok(());
1863                    }
1864                    // Anything else
1865                    // Emit the current input character as a character token.
1866                    Some(c) => {
                            // NOTE(review): validation is not part of the spec step quoted
                            // above; presumably it reports input-stream errors — confirm.
1867                        self.validate_input_stream_character(c);
1868                        self.handle_raw_and_emit_character_token(c);
1869                    }
1870                }
1871            }
1872            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
                // Entered after a single `-` in escaped script text. A second `-` moves on
                // to the dash dash state; every other character except `<` and EOF falls
                // back to the plain escaped state.
1873            State::ScriptDataEscapedDash => {
1874                // Consume the next input character:
1875                match self.consume_next_char() {
1876                    // U+002D HYPHEN-MINUS (-)
1877                    // Switch to the script data escaped dash dash state. Emit a U+002D
1878                    // HYPHEN-MINUS character token.
1879                    Some(c @ '-') => {
1880                        self.state = State::ScriptDataEscapedDashDash;
1881                        self.emit_character_token(c);
1882                    }
1883                    // U+003C LESS-THAN SIGN (<)
1884                    // Switch to the script data escaped less-than sign state.
                        // Nothing is emitted here; the `<` is emitted by the states that follow.
1885                    Some('<') => {
1886                        self.state = State::ScriptDataEscapedLessThanSign;
1887                    }
1888                    // U+0000 NULL
1889                    // This is an unexpected-null-character parse error. Switch to the script
1890                    // data escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character token.
1891                    Some(c @ '\x00') => {
1892                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1893                        self.state = State::ScriptDataEscaped;
                            // The original NUL is passed along with the replacement —
                            // presumably kept as the token's raw text; confirm.
1894                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1895                    }
1896                    // EOF
1897                    // This is an eof-in-script-html-comment-like-text parse error. Emit an
1898                    // end-of-file token.
1899                    None => {
1900                        self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
1901                        self.emit_token(Token::Eof);
1902
                            // Leave the enclosing function: nothing remains to tokenize.
1903                        return Ok(());
1904                    }
1905                    // Anything else
1906                    // Switch to the script data escaped state. Emit the current input character
1907                    // as a character token.
1908                    Some(c) => {
                            // NOTE(review): validation is not part of the spec step quoted
                            // above; presumably it reports input-stream errors — confirm.
1909                        self.validate_input_stream_character(c);
1910                        self.state = State::ScriptDataEscaped;
1911                        self.handle_raw_and_emit_character_token(c);
1912                    }
1913                }
1914            }
1915            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
                // Entered after `--` in escaped script text. Further `-` characters keep
                // the lexer here, `>` completes `-->` and returns to script data, and any
                // other character except `<` and EOF falls back to the escaped state.
1916            State::ScriptDataEscapedDashDash => {
1917                // Consume the next input character:
1918                match self.consume_next_char() {
1919                    // U+002D HYPHEN-MINUS (-)
1920                    // Emit a U+002D HYPHEN-MINUS character token.
                        // The state is intentionally left unchanged in this branch.
1921                    Some(c @ '-') => {
1922                        self.emit_character_token(c);
1923                    }
1924                    // U+003C LESS-THAN SIGN (<)
1925                    // Switch to the script data escaped less-than sign state.
                        // Nothing is emitted here; the `<` is emitted by the states that follow.
1926                    Some('<') => {
1927                        self.state = State::ScriptDataEscapedLessThanSign;
1928                    }
1929                    // U+003E GREATER-THAN SIGN (>)
1930                    // Switch to the script data state. Emit a U+003E GREATER-THAN SIGN
1931                    // character token.
1932                    Some(c @ '>') => {
1933                        self.state = State::ScriptData;
1934                        self.emit_character_token(c);
1935                    }
1936                    // U+0000 NULL
1937                    // This is an unexpected-null-character parse error. Switch to the script
1938                    // data escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character token.
1939                    Some(c @ '\x00') => {
1940                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
1941                        self.state = State::ScriptDataEscaped;
                            // The original NUL is passed along with the replacement —
                            // presumably kept as the token's raw text; confirm.
1942                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1943                    }
1944                    // EOF
1945                    // This is an eof-in-script-html-comment-like-text parse error. Emit an
1946                    // end-of-file token.
1947                    None => {
1948                        self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
1949                        self.emit_token(Token::Eof);
1950
                            // Leave the enclosing function: nothing remains to tokenize.
1951                        return Ok(());
1952                    }
1953                    // Anything else
1954                    // Switch to the script data escaped state. Emit the current input character
1955                    // as a character token.
1956                    Some(c) => {
                            // NOTE(review): validation is not part of the spec step quoted
                            // above; presumably it reports input-stream errors — confirm.
1957                        self.validate_input_stream_character(c);
1958                        self.state = State::ScriptDataEscaped;
1959                        self.handle_raw_and_emit_character_token(c);
1960                    }
1961                }
1962            }
1963            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
                // Decides what follows a `<` in escaped script text: `/` starts a candidate
                // end tag, an ASCII letter starts a candidate nested start tag (double
                // escape), and anything else means the `<` was ordinary text.
1964            State::ScriptDataEscapedLessThanSign => {
1965                // Consume the next input character:
1966                match self.consume_next_char() {
1967                    // U+002F SOLIDUS (/)
1968                    // Set the temporary buffer to the empty string. Switch to the script data
1969                    // escaped end tag open state.
1970                    Some('/') => {
                            // Neither `<` nor `/` is emitted here; the end-tag states emit them
                            // on their fallback paths if the candidate is rejected.
1971                        self.temporary_buffer.clear();
1972                        self.state = State::ScriptDataEscapedEndTagOpen;
1973                    }
1974                    // ASCII alpha
1975                    // Set the temporary buffer to the empty string. Emit a U+003C LESS-THAN
1976                    // SIGN character token. Reconsume in the script data double escape start
1977                    // state.
1978                    Some(c) if is_ascii_alpha(c) => {
                            // The buffer is reset because the double escape start state
                            // compares its contents against "script".
1979                        self.temporary_buffer.clear();
1980                        self.emit_character_token('<');
1981                        self.reconsume_in_state(State::ScriptDataDoubleEscapeStart);
1982                    }
1983                    // Anything else
1984                    // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script
1985                    // data escaped state.
1986                    _ => {
1987                        self.emit_character_token('<');
1988                        self.reconsume_in_state(State::ScriptDataEscaped);
1989                    }
1990                }
1991            }
1992            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
                // Runs after `</` in escaped script text. An ASCII letter starts a candidate
                // end tag; anything else (including EOF) turns the `</` back into text.
1993            State::ScriptDataEscapedEndTagOpen => {
1994                // Consume the next input character:
1995                match self.consume_next_char() {
1996                    // ASCII alpha
1997                    // Create a new end tag token, set its tag name to the empty string.
1998                    // Reconsume in the script data escaped end tag name state.
1999                    Some(c) if is_ascii_alpha(c) => {
                            // The letter is only used as a trigger: it is reconsumed so that
                            // the end-tag-name state is the one that records it.
2000                        self.create_end_tag_token();
2001                        self.reconsume_in_state(State::ScriptDataEscapedEndTagName);
2002                    }
2003                    // Anything else
2004                    // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
2005                    // character token. Reconsume in the script data escaped state.
2006                    _ => {
2007                        self.emit_character_token('<');
2008                        self.emit_character_token('/');
2009                        self.reconsume_in_state(State::ScriptDataEscaped);
2010                    }
2011                }
2012            }
2013            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
                // Accumulates the name of a candidate end tag in escaped script text. The
                // tag is accepted only if
                // `current_end_tag_token_is_an_appropriate_end_tag_token()` holds when a
                // delimiter (whitespace, `/`, `>`) is reached; in every other case the
                // consumed text is replayed as character tokens by `anything_else`.
2014            State::ScriptDataEscapedEndTagName => {
                    // Shared reject path. It receives the lexer as an argument rather than
                    // capturing `self`, so `self` stays freely usable in the match below.
2015                let anything_else = |lexer: &mut Lexer<'a, I>| {
                        // NOTE(review): the tag name is finished even though the candidate is
                        // being discarded; the helper's effect is not visible here — confirm
                        // it is needed on this path.
2016                    lexer.finish_tag_token_name();
2017                    lexer.emit_character_token('<');
2018                    lexer.emit_character_token('/');
2019                    lexer.emit_temporary_buffer_as_character_tokens();
2020                    lexer.reconsume_in_state(State::ScriptDataEscaped);
2021                };
2022
2023                // Consume the next input character:
2024                match self.consume_next_char() {
2025                    // U+0009 CHARACTER TABULATION (tab)
2026                    // U+000A LINE FEED (LF)
2027                    // U+000C FORM FEED (FF)
2028                    // U+0020 SPACE
2029                    // If the current end tag token is an appropriate end tag token, then switch
2030                    // to the before attribute name state. Otherwise, treat it as per the
2031                    // "anything else" entry below.
2032                    Some(c) if is_spacy(c) => {
                            // NOTE(review): not part of the spec step quoted above; presumably
                            // consumes the rest of the whitespace run — confirm.
2033                        self.skip_whitespaces(c);
2034
2035                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
2036                            self.finish_tag_token_name();
2037                            self.state = State::BeforeAttributeName;
2038                        } else {
2039                            anything_else(self);
2040                        }
2041                    }
2042                    // U+002F SOLIDUS (/)
2043                    // If the current end tag token is an appropriate end tag token, then switch
2044                    // to the self-closing start tag state. Otherwise, treat it as per the
2045                    // "anything else" entry below.
2046                    Some('/') => {
2047                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
2048                            self.finish_tag_token_name();
2049                            self.state = State::SelfClosingStartTag;
2050                        } else {
2051                            anything_else(self);
2052                        }
2053                    }
2054                    // U+003E GREATER-THAN SIGN (>)
2055                    // If the current end tag token is an appropriate end tag token, then switch
2056                    // to the data state and emit the current tag token. Otherwise, treat it as
2057                    // per the "anything else" entry below.
2058                    Some('>') => {
2059                        if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
2060                            self.finish_tag_token_name();
2061                            self.state = State::Data;
2062                            self.emit_tag_token();
2063                        } else {
2064                            anything_else(self);
2065                        }
2066                    }
2067                    // ASCII upper alpha
2068                    // Append the lowercase version of the current input character (add 0x0020
2069                    // to the character's code point) to the current tag token's tag name.
2070                    // Append the current input character to the temporary buffer.
2071                    Some(c) if is_ascii_upper_alpha(c) => {
                            // NOTE(review): the helper is named after the *attribute* token name
                            // although this step targets the tag name; lowercasing, if any,
                            // happens inside the helper — confirm it writes to the intended buffer.
2072                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
2073                            c,
2074                            is_ascii_upper_alpha,
2075                        );
2076                    }
2077                    // ASCII lower alpha
2078                    // Append the current input character to the current tag token's tag name.
2079                    // Append the current input character to the temporary buffer.
2080                    Some(c) if is_ascii_lower_alpha(c) => {
                            // Same helper as the upper-alpha branch; only the predicate differs.
2081                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
2082                            c,
2083                            is_ascii_lower_alpha,
2084                        );
2085                    }
2086                    // Anything else
2087                    // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
2088                    // token, and a character token for each of the characters in the temporary
2089                    // buffer (in the order they were added to the buffer). Reconsume in the
2090                    // script data escaped state.
2091                    _ => {
2092                        anything_else(self);
2093                    }
2094                }
2095            }
2096            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
                // Collects the letters that follow a `<` in escaped script text into the
                // temporary buffer while emitting them as text. At a delimiter the buffer
                // decides the next state: "script" enters the double escaped state,
                // anything else stays in the escaped state.
2097            State::ScriptDataDoubleEscapeStart => {
2098                // Consume the next input character:
2099                match self.consume_next_char() {
2100                    // U+0009 CHARACTER TABULATION (tab)
2101                    // U+000A LINE FEED (LF)
2102                    // U+000C FORM FEED (FF)
2103                    // U+0020 SPACE
2104                    // U+002F SOLIDUS (/)
2105                    // U+003E GREATER-THAN SIGN (>)
2106                    // If the temporary buffer is the string "script", then switch to the script
2107                    // data double escaped state. Otherwise, switch to the script data escaped
2108                    // state. Emit the current input character as a character token.
                        // The single spec entry is split into two arms that differ only in the
                        // emit helper: whitespace goes through the raw-aware variant.
                        // NOTE(review): presumably because whitespace can need raw-text
                        // handling while `/` and `>` cannot — confirm.
2109                    Some(c) if is_spacy(c) => {
2110                        let is_script = self.temporary_buffer == "script";
2111
2112                        if is_script {
2113                            self.state = State::ScriptDataDoubleEscaped;
2114                        } else {
2115                            self.state = State::ScriptDataEscaped;
2116                        }
2117
2118                        self.handle_raw_and_emit_character_token(c);
2119                    }
2120                    Some(c @ '/' | c @ '>') => {
2121                        let is_script = self.temporary_buffer == "script";
2122
2123                        if is_script {
2124                            self.state = State::ScriptDataDoubleEscaped;
2125                        } else {
2126                            self.state = State::ScriptDataEscaped;
2127                        }
2128
2129                        self.emit_character_token(c);
2130                    }
2131                    // ASCII upper alpha
2132                    // Append the lowercase version of the current input character (add 0x0020
2133                    // to the character's code point) to the temporary buffer. Emit the current
2134                    // input character as a character token.
2135                    Some(c) if is_ascii_upper_alpha(c) => {
                            // Only the buffer copy is lowercased, so the "script" comparison is
                            // ASCII case-insensitive while the emitted text keeps its case.
2136                        self.temporary_buffer.push(c.to_ascii_lowercase());
2137                        self.emit_character_token(c);
2138                    }
2139                    // ASCII lower alpha
2140                    // Append the current input character to the temporary buffer. Emit the
2141                    // current input character as a character token.
2142                    Some(c) if is_ascii_lower_alpha(c) => {
2143                        self.temporary_buffer.push(c);
2144                        self.emit_character_token(c);
2145                    }
2146                    // Anything else
2147                    // Reconsume in the script data escaped state.
2148                    _ => {
2149                        self.reconsume_in_state(State::ScriptDataEscaped);
2150                    }
2151                }
2152            }
2153            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
                // Steady state for double-escaped script text: ordinary characters are
                // emitted one by one, while `-`, `<`, NUL and EOF get dedicated handling.
2154            State::ScriptDataDoubleEscaped => {
2155                // Consume the next input character:
2156                match self.consume_next_char() {
2157                    // U+002D HYPHEN-MINUS (-)
2158                    // Switch to the script data double escaped dash state. Emit a U+002D
2159                    // HYPHEN-MINUS character token.
2160                    Some(c @ '-') => {
2161                        self.state = State::ScriptDataDoubleEscapedDash;
2162                        self.emit_character_token(c);
2163                    }
2164                    // U+003C LESS-THAN SIGN (<)
2165                    // Switch to the script data double escaped less-than sign state. Emit a
2166                    // U+003C LESS-THAN SIGN character token.
                        // Unlike the single-escaped state, the `<` is emitted right away.
2167                    Some(c @ '<') => {
2168                        self.state = State::ScriptDataDoubleEscapedLessThanSign;
2169                        self.emit_character_token(c);
2170                    }
2171                    // U+0000 NULL
2172                    // This is an unexpected-null-character parse error. Emit a U+FFFD
2173                    // REPLACEMENT CHARACTER character token.
2174                    Some(c @ '\x00') => {
2175                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
                            // The original NUL is passed along with the replacement —
                            // presumably kept as the token's raw text; confirm.
2176                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
2177                    }
2178                    // EOF
2179                    // This is an eof-in-script-html-comment-like-text parse error. Emit an
2180                    // end-of-file token.
2181                    None => {
2182                        self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
2183                        self.emit_token(Token::Eof);
2184
                            // Leave the enclosing function: nothing remains to tokenize.
2185                        return Ok(());
2186                    }
2187                    // Anything else
2188                    // Emit the current input character as a character token.
2189                    Some(c) => {
                            // NOTE(review): validation is not part of the spec step quoted
                            // above; presumably it reports input-stream errors — confirm.
2190                        self.validate_input_stream_character(c);
2191                        self.handle_raw_and_emit_character_token(c);
2192                    }
2193                }
2194            }
2195            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
                // Entered after a single `-` in double-escaped script text. A second `-`
                // moves on to the dash dash state; every other character except `<` and
                // EOF falls back to the plain double escaped state.
2196            State::ScriptDataDoubleEscapedDash => {
2197                // Consume the next input character:
2198                match self.consume_next_char() {
2199                    // U+002D HYPHEN-MINUS (-)
2200                    // Switch to the script data double escaped dash dash state. Emit a U+002D
2201                    // HYPHEN-MINUS character token.
2202                    Some(c @ '-') => {
2203                        self.state = State::ScriptDataDoubleEscapedDashDash;
2204                        self.emit_character_token(c);
2205                    }
2206                    // U+003C LESS-THAN SIGN (<)
2207                    // Switch to the script data double escaped less-than sign state. Emit a
2208                    // U+003C LESS-THAN SIGN character token.
2209                    Some(c @ '<') => {
2210                        self.state = State::ScriptDataDoubleEscapedLessThanSign;
2211                        self.emit_character_token(c);
2212                    }
2213                    // U+0000 NULL
2214                    // This is an unexpected-null-character parse error. Switch to the script
2215                    // data double escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character
2216                    // token.
2217                    Some(c @ '\x00') => {
2218                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
2219                        self.state = State::ScriptDataDoubleEscaped;
                            // The original NUL is passed along with the replacement —
                            // presumably kept as the token's raw text; confirm.
2220                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
2221                    }
2222                    // EOF
2223                    // This is an eof-in-script-html-comment-like-text parse error. Emit an
2224                    // end-of-file token.
2225                    None => {
2226                        self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
2227                        self.emit_token(Token::Eof);
2228
                            // Leave the enclosing function: nothing remains to tokenize.
2229                        return Ok(());
2230                    }
2231                    // Anything else
2232                    // Switch to the script data double escaped state. Emit the current input
2233                    // character as a character token.
2234                    Some(c) => {
                            // NOTE(review): validation is not part of the spec step quoted
                            // above; presumably it reports input-stream errors — confirm.
2235                        self.validate_input_stream_character(c);
2236                        self.state = State::ScriptDataDoubleEscaped;
2237                        self.handle_raw_and_emit_character_token(c);
2238                    }
2239                }
2240            }
2241            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
2242            State::ScriptDataDoubleEscapedDashDash => {
2243                // Consume the next input character:
2244                match self.consume_next_char() {
2245                    // U+002D HYPHEN-MINUS (-)
2246                    // Emit a U+002D HYPHEN-MINUS character token.
2247                    Some(c @ '-') => {
2248                        self.emit_character_token(c);
2249                    }
2250                    // U+003C LESS-THAN SIGN (<)
2251                    // Switch to the script data double escaped less-than sign state. Emit a
2252                    // U+003C LESS-THAN SIGN character token.
2253                    Some(c @ '<') => {
2254                        self.state = State::ScriptDataDoubleEscapedLessThanSign;
2255                        self.emit_character_token(c);
2256                    }
2257                    // U+003E GREATER-THAN SIGN (>)
2258                    // Switch to the script data state. Emit a U+003E GREATER-THAN SIGN
2259                    // character token.
2260                    Some(c @ '>') => {
2261                        self.state = State::ScriptData;
2262                        self.emit_character_token(c);
2263                    }
2264                    // U+0000 NULL
2265                    // This is an unexpected-null-character parse error. Switch to the script
2266                    // data double escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character
2267                    // token.
2268                    Some(c @ '\x00') => {
2269                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
2270                        self.state = State::ScriptDataDoubleEscaped;
2271                        self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
2272                    }
2273                    // EOF
2274                    // This is an eof-in-script-html-comment-like-text parse error. Emit an
2275                    // end-of-file token.
2276                    None => {
2277                        self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
2278                        self.emit_token(Token::Eof);
2279
2280                        return Ok(());
2281                    }
2282                    // Anything else
2283                    // Switch to the script data double escaped state. Emit the current input
2284                    // character as a character token.
2285                    Some(c) => {
2286                        self.validate_input_stream_character(c);
2287                        self.state = State::ScriptDataDoubleEscaped;
2288                        self.handle_raw_and_emit_character_token(c);
2289                    }
2290                }
2291            }
2292            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
2293            State::ScriptDataDoubleEscapedLessThanSign => {
2294                // Consume the next input character:
2295                match self.consume_next_char() {
2296                    // U+002F SOLIDUS (/)
2297                    // Set the temporary buffer to the empty string. Switch to the script data
2298                    // double escape end state. Emit a U+002F SOLIDUS character token.
2299                    Some(c @ '/') => {
2300                        self.temporary_buffer.clear();
2301                        self.state = State::ScriptDataDoubleEscapeEnd;
2302                        self.emit_character_token(c);
2303                    }
2304                    // Anything else
2305                    // Reconsume in the script data double escaped state.
2306                    _ => {
2307                        self.reconsume_in_state(State::ScriptDataDoubleEscaped);
2308                    }
2309                }
2310            }
2311            // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
2312            State::ScriptDataDoubleEscapeEnd => {
2313                // Consume the next input character:
2314                match self.consume_next_char() {
2315                    // U+0009 CHARACTER TABULATION (tab)
2316                    // U+000A LINE FEED (LF)
2317                    // U+000C FORM FEED (FF)
2318                    // U+0020 SPACE
2319                    // U+002F SOLIDUS (/)
2320                    // U+003E GREATER-THAN SIGN (>)
2321                    // If the temporary buffer is the string "script", then switch to the script
2322                    // data escaped state. Otherwise, switch to the script data double escaped
2323                    // state. Emit the current input character as a character token.
2324                    Some(c) if is_spacy(c) => {
2325                        let is_script = self.temporary_buffer == "script";
2326
2327                        if is_script {
2328                            self.state = State::ScriptDataEscaped;
2329                        } else {
2330                            self.state = State::ScriptDataDoubleEscaped;
2331                        }
2332
2333                        self.handle_raw_and_emit_character_token(c);
2334                    }
2335                    Some(c @ '/' | c @ '>') => {
2336                        let is_script = self.temporary_buffer == "script";
2337
2338                        if is_script {
2339                            self.state = State::ScriptDataEscaped;
2340                        } else {
2341                            self.state = State::ScriptDataDoubleEscaped;
2342                        }
2343
2344                        self.emit_character_token(c);
2345                    }
2346                    // ASCII upper alpha
2347                    // Append the lowercase version of the current input character (add 0x0020
2348                    // to the character's code point) to the temporary buffer. Emit the current
2349                    // input character as a character token.
2350                    Some(c) if is_ascii_upper_alpha(c) => {
2351                        self.temporary_buffer.push(c.to_ascii_lowercase());
2352                        self.emit_character_token(c);
2353                    }
2354                    // ASCII lower alpha
2355                    // Append the current input character to the temporary buffer. Emit the
2356                    // current input character as a character token.
2357                    Some(c) if is_ascii_lower_alpha(c) => {
2358                        self.temporary_buffer.push(c);
2359
2360                        self.emit_character_token(c);
2361                    }
2362                    // Anything else
2363                    // Reconsume in the script data double escaped state.
2364                    _ => {
2365                        self.reconsume_in_state(State::ScriptDataDoubleEscaped);
2366                    }
2367                }
2368            }
2369            // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
2370            State::BeforeAttributeName => {
2371                // Consume the next input character:
2372                match self.consume_next_char() {
2373                    // U+0009 CHARACTER TABULATION (tab)
2374                    // U+000A LINE FEED (LF)
2375                    // U+000C FORM FEED (FF)
2376                    // U+0020 SPACE
2377                    // Ignore the character.
2378                    Some(c) if is_spacy(c) => {
2379                        self.skip_whitespaces(c);
2380                    }
2381                    // U+002F SOLIDUS (/)
2382                    // U+003E GREATER-THAN SIGN (>)
2383                    // EOF
2384                    // Reconsume in the after attribute name state.
2385                    Some('/') | Some('>') | None => {
2386                        self.reconsume_in_state(State::AfterAttributeName);
2387                    }
2388                    // U+003D EQUALS SIGN (=)
2389                    // This is an unexpected-equals-sign-before-attribute-name parse error.
2390                    // Start a new attribute in the current tag token. Set that attribute's name
2391                    // to the current input character, and its value to the empty string. Switch
2392                    // to the attribute name state.
2393                    // We set `None` for `value` to support boolean attributes in AST
2394                    Some(c @ '=') => {
2395                        self.emit_error(ErrorKind::UnexpectedEqualsSignBeforeAttributeName);
2396                        self.start_new_attribute_token();
2397                        self.append_to_attribute_token_name(c, c);
2398                        self.state = State::AttributeName;
2399                    }
2400                    // Anything else
2401                    // Start a new attribute in the current tag token. Set that attribute name
2402                    // and value to the empty string. Reconsume in the attribute name state.
2403                    // We set `None` for `value` to support boolean attributes in AST
2404                    _ => {
2405                        self.start_new_attribute_token();
2406                        self.reconsume_in_state(State::AttributeName);
2407                    }
2408                }
2409            }
2410            // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
2411            State::AttributeName => {
2412                let anything_else = |lexer: &mut Lexer<'a, I>, c: char| {
2413                    lexer.append_to_attribute_token_name(c, c);
2414                };
2415
2416                // Consume the next input character:
2417                match self.consume_next_char() {
2418                    // U+0009 CHARACTER TABULATION (tab)
2419                    // U+000A LINE FEED (LF)
2420                    // U+000C FORM FEED (FF)
2421                    // U+0020 SPACE
2422                    // U+002F SOLIDUS (/)
2423                    // U+003E GREATER-THAN SIGN (>)
2424                    // EOF
2425                    // Reconsume in the after attribute name state.
2426                    Some(c) if is_spacy(c) => {
2427                        self.finish_attribute_token_name();
2428                        self.skip_whitespaces(c);
2429                        self.reconsume_in_state(State::AfterAttributeName);
2430                    }
2431                    Some('/' | '>') | None => {
2432                        self.finish_attribute_token_name();
2433                        self.reconsume_in_state(State::AfterAttributeName);
2434                    }
2435                    // U+003D EQUALS SIGN (=)
2436                    // Switch to the before attribute value state.
2437                    Some('=') => {
2438                        self.finish_attribute_token_name();
2439                        self.state = State::BeforeAttributeValue;
2440                    }
2441                    // ASCII upper alpha
2442                    // Append the lowercase version of the current input character (add 0x0020
2443                    // to the character's code point) to the current attribute's name.
2444                    Some(c) if is_ascii_upper_alpha(c) => {
2445                        self.consume_and_append_to_attribute_token_name(c, |c| {
2446                            is_ascii_upper_alpha(c)
2447                        });
2448                    }
2449                    // U+0000 NULL
2450                    // This is an unexpected-null-character parse error. Append a U+FFFD
2451                    // REPLACEMENT CHARACTER character to the current attribute's name.
2452                    Some(c @ '\x00') => {
2453                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
2454                        self.append_to_attribute_token_name(REPLACEMENT_CHARACTER, c);
2455                    }
2456                    // U+0022 QUOTATION MARK (")
2457                    // U+0027 APOSTROPHE (')
2458                    // U+003C LESS-THAN SIGN (<)
2459                    // This is an unexpected-character-in-attribute-name parse error. Treat it
2460                    // as per the "anything else" entry below.
2461                    Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') => {
2462                        self.emit_error(ErrorKind::UnexpectedCharacterInAttributeName);
2463
2464                        anything_else(self, c);
2465                    }
2466                    // Anything else
2467                    // Append the current input character to the current attribute's name.
2468                    Some(c) => {
2469                        self.validate_input_stream_character(c);
2470                        self.consume_and_append_to_attribute_token_name(c, |c| {
2471                            if !is_allowed_character(c) {
2472                                return false;
2473                            }
2474
2475                            // Stop consuming at any character that one of the branches above
2476                            // handles, so that branch can take control
2477                            !is_spacy(c)
2478                                && !matches!(c, '/' | '>' | '=' | '\x00' | '"' | '\'' | '<')
2479                                && !is_ascii_upper_alpha(c)
2480                        });
2481                    }
2482                }
2483
2484                // When the user agent leaves the attribute name state (and
2485                // before emitting the tag token, if appropriate), the
2486                // complete attribute's name must be compared to the other
2487                // attributes on the same token; if there is already an
2488                // attribute on the token with the exact same name, then
2489                // this is a duplicate-attribute parse error and the new
2490                // attribute must be removed from the token.
2491                //
2492                // We postpone this check until the current tag token is emitted
2493            }
2494            // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
2495            State::AfterAttributeName => {
2496                // Consume the next input character:
2497                match self.consume_next_char() {
2498                    // U+0009 CHARACTER TABULATION (tab)
2499                    // U+000A LINE FEED (LF)
2500                    // U+000C FORM FEED (FF)
2501                    // U+0020 SPACE
2502                    // Ignore the character.
2503                    Some(c) if is_spacy(c) => {
2504                        self.skip_whitespaces(c);
2505                    }
2506                    // U+002F SOLIDUS (/)
2507                    // Switch to the self-closing start tag state.
2508                    Some('/') => {
2509                        self.state = State::SelfClosingStartTag;
2510                    }
2511                    // U+003D EQUALS SIGN (=)
2512                    // Switch to the before attribute value state.
2513                    Some('=') => {
2514                        self.state = State::BeforeAttributeValue;
2515                    }
2516                    // U+003E GREATER-THAN SIGN (>)
2517                    // Switch to the data state. Emit the current tag token.
2518                    Some('>') => {
2519                        self.state = State::Data;
2520                        self.emit_tag_token();
2521                    }
2522                    // EOF
2523                    // This is an eof-in-tag parse error. Emit an end-of-file token.
2524                    None => {
2525                        self.emit_error(ErrorKind::EofInTag);
2526                        self.emit_token(Token::Eof);
2527
2528                        return Ok(());
2529                    }
2530                    // Anything else
2531                    // Start a new attribute in the current tag token. Set that attribute name
2532                    // and value to the empty string. Reconsume in the attribute name state.
2533                    // We set `None` for `value` to support boolean attributes in AST
2534                    _ => {
2535                        self.start_new_attribute_token();
2536                        self.reconsume_in_state(State::AttributeName);
2537                    }
2538                }
2539            }
2540            // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
2541            State::BeforeAttributeValue => {
2542                // Consume the next input character:
2543                match self.consume_next_char() {
2544                    // U+0009 CHARACTER TABULATION (tab)
2545                    // U+000A LINE FEED (LF)
2546                    // U+000C FORM FEED (FF)
2547                    // U+0020 SPACE
2548                    // Ignore the character.
2549                    Some(c) if is_spacy(c) => {
2550                        self.skip_whitespaces(c);
2551                    }
2552                    // U+0022 QUOTATION MARK (")
2553                    // Switch to the attribute value (double-quoted) state.
2554                    Some(c @ '"') => {
2555                        self.append_to_attribute_token_value(None, Some(c));
2556                        self.state = State::AttributeValueDoubleQuoted;
2557                    }
2558                    // U+0027 APOSTROPHE (')
2559                    // Switch to the attribute value (single-quoted) state.
2560                    Some(c @ '\'') => {
2561                        self.append_to_attribute_token_value(None, Some(c));
2562                        self.state = State::AttributeValueSingleQuoted;
2563                    }
2564                    // U+003E GREATER-THAN SIGN (>)
2565                    // This is a missing-attribute-value parse error. Switch to the data state.
2566                    // Emit the current tag token.
2567                    Some('>') => {
2568                        self.emit_error(ErrorKind::MissingAttributeValue);
2569                        self.state = State::Data;
2570                        self.emit_tag_token();
2571                    }
2572                    // Anything else
2573                    // Reconsume in the attribute value (unquoted) state.
2574                    _ => {
2575                        self.reconsume_in_state(State::AttributeValueUnquoted);
2576                    }
2577                }
2578            }
2579            // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
2580            State::AttributeValueDoubleQuoted => {
2581                // Consume the next input character:
2582                match self.consume_next_char() {
2583                    // U+0022 QUOTATION MARK (")
2584                    // Switch to the after attribute value (quoted) state.
2585                    // We set value to support empty attributes (i.e. `attr=""`)
2586                    Some(c @ '"') => {
2587                        self.append_to_attribute_token_value(None, Some(c));
2588                        self.state = State::AfterAttributeValueQuoted;
2589                    }
2590                    // U+0026 AMPERSAND (&)
2591                    // Set the return state to the attribute value (double-quoted) state. Switch
2592                    // to the character reference state.
2593                    Some('&') => {
2594                        self.return_state = State::AttributeValueDoubleQuoted;
2595                        self.state = State::CharacterReference;
2596                    }
2597                    // U+0000 NULL
2598                    // This is an unexpected-null-character parse error. Append a U+FFFD
2599                    // REPLACEMENT CHARACTER character to the current attribute's value.
2600                    Some(c @ '\x00') => {
2601                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
2602                        self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
2603                    }
2604                    // EOF
2605                    // This is an eof-in-tag parse error. Emit an end-of-file token.
2606                    None => {
2607                        self.emit_error(ErrorKind::EofInTag);
2608                        self.emit_token(Token::Eof);
2609
2610                        return Ok(());
2611                    }
2612                    // Anything else
2613                    // Append the current input character to the current attribute's value.
2614                    Some(c) => {
2615                        self.validate_input_stream_character(c);
2616                        self.consume_and_append_to_attribute_token_value(c, |c| {
2617                            if !is_allowed_character(c) {
2618                                return false;
2619                            }
2620
2621                            // Stop consuming at any character that one of the branches above
2622                            // handles; `\r` is also in the list because of newline normalization
2623                            !matches!(c, '"' | '&' | '\x00' | '\r')
2624                        });
2625                    }
2626                }
2627            }
2628            // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
2629            State::AttributeValueSingleQuoted => {
2630                // Consume the next input character:
2631                match self.consume_next_char() {
2632                    // U+0027 APOSTROPHE (')
2633                    // Switch to the after attribute value (quoted) state.
2634                    // We set value to support empty attributes (i.e. `attr=''`)
2635                    Some(c @ '\'') => {
2636                        self.append_to_attribute_token_value(None, Some(c));
2637                        self.state = State::AfterAttributeValueQuoted;
2638                    }
2639                    // U+0026 AMPERSAND (&)
2640                    // Set the return state to the attribute value (single-quoted) state. Switch
2641                    // to the character reference state.
2642                    Some('&') => {
2643                        self.return_state = State::AttributeValueSingleQuoted;
2644                        self.state = State::CharacterReference;
2645                    }
2646                    // U+0000 NULL
2647                    // This is an unexpected-null-character parse error. Append a U+FFFD
2648                    // REPLACEMENT CHARACTER character to the current attribute's value.
2649                    Some(c @ '\x00') => {
2650                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
2651                        self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
2652                    }
2653                    // EOF
2654                    // This is an eof-in-tag parse error. Emit an end-of-file token.
2655                    None => {
2656                        self.emit_error(ErrorKind::EofInTag);
2657                        self.emit_token(Token::Eof);
2658
2659                        return Ok(());
2660                    }
2661                    // Anything else
2662                    // Append the current input character to the current attribute's value.
2663                    Some(c) => {
2664                        self.validate_input_stream_character(c);
2665                        self.consume_and_append_to_attribute_token_value(c, |c| {
2666                            if !is_allowed_character(c) {
2667                                return false;
2668                            }
2669
2670                            // Stop consuming at any character that one of the branches above
2671                            // handles; `\r` is also in the list because of newline normalization
2672                            !matches!(c, '\'' | '&' | '\x00' | '\r')
2673                        });
2674                    }
2675                }
2676            }
2677            // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
2678            State::AttributeValueUnquoted => {
2679                let anything_else = |lexer: &mut Lexer<'a, I>, c: char| {
2680                    lexer.append_to_attribute_token_value(Some(c), Some(c));
2681                };
2682
2683                // Consume the next input character:
2684                match self.consume_next_char() {
2685                    // U+0009 CHARACTER TABULATION (tab)
2686                    // U+000A LINE FEED (LF)
2687                    // U+000C FORM FEED (FF)
2688                    // U+0020 SPACE
2689                    // Switch to the before attribute name state.
2690                    Some(c) if is_spacy(c) => {
2691                        self.finish_attribute_token_value();
2692                        self.skip_whitespaces(c);
2693                        self.state = State::BeforeAttributeName;
2694                    }
2695                    // U+0026 AMPERSAND (&)
2696                    // Set the return state to the attribute value (unquoted) state. Switch to
2697                    // the character reference state.
2698                    Some('&') => {
2699                        self.return_state = State::AttributeValueUnquoted;
2700                        self.state = State::CharacterReference;
2701                    }
2702                    // U+003E GREATER-THAN SIGN (>)
2703                    // Switch to the data state. Emit the current tag token.
2704                    Some('>') => {
2705                        self.finish_attribute_token_value();
2706                        self.state = State::Data;
2707                        self.emit_tag_token();
2708                    }
2709                    // U+0000 NULL
2710                    // This is an unexpected-null-character parse error. Append a U+FFFD
2711                    // REPLACEMENT CHARACTER character to the current attribute's value.
2712                    Some(c @ '\x00') => {
2713                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
2714                        self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
2715                    }
2716                    // U+0022 QUOTATION MARK (")
2717                    // U+0027 APOSTROPHE (')
2718                    // U+003C LESS-THAN SIGN (<)
2719                    // U+003D EQUALS SIGN (=)
2720                    // U+0060 GRAVE ACCENT (`)
2721                    // This is an unexpected-character-in-unquoted-attribute-value parse error.
2722                    // Treat it as per the "anything else" entry below.
2723                    Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') | Some(c @ '=')
2724                    | Some(c @ '`') => {
2725                        self.emit_error(ErrorKind::UnexpectedCharacterInUnquotedAttributeValue);
2726
2727                        anything_else(self, c);
2728                    }
2729                    // EOF
2730                    // This is an eof-in-tag parse error. Emit an end-of-file token.
2731                    None => {
2732                        self.finish_attribute_token_value();
2733                        self.emit_error(ErrorKind::EofInTag);
2734                        self.emit_token(Token::Eof);
2735
2736                        return Ok(());
2737                    }
2738                    // Anything else
2739                    // Append the current input character to the current attribute's value.
2740                    Some(c) => {
2741                        self.validate_input_stream_character(c);
2742                        self.consume_and_append_to_attribute_token_value(c, |c| {
2743                            if !is_allowed_character(c) {
2744                                return false;
2745                            }
2746
2747                            // Stop consuming at any character that one of the branches above
2748                            // handles; `\r` is also in the list because of newline normalization
2749                            !is_spacy(c)
2750                                && !matches!(
2751                                    c,
2752                                    '&' | '>' | '\x00' | '"' | '\'' | '<' | '=' | '`' | '\r'
2753                                )
2754                        });
2755                    }
2756                }
2757            }
2758            // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
2759            State::AfterAttributeValueQuoted => {
2760                // Consume the next input character:
2761                match self.consume_next_char() {
2762                    // U+0009 CHARACTER TABULATION (tab)
2763                    // U+000A LINE FEED (LF)
2764                    // U+000C FORM FEED (FF)
2765                    // U+0020 SPACE
2766                    // Switch to the before attribute name state.
2767                    Some(c) if is_spacy(c) => {
2768                        self.finish_attribute_token_value();
2769                        self.skip_whitespaces(c);
2770                        self.state = State::BeforeAttributeName;
2771                    }
2772                    // U+002F SOLIDUS (/)
2773                    // Switch to the self-closing start tag state.
2774                    Some('/') => {
2775                        self.finish_attribute_token_value();
2776                        self.state = State::SelfClosingStartTag;
2777                    }
2778                    // U+003E GREATER-THAN SIGN (>)
2779                    // Switch to the data state. Emit the current tag token.
2780                    Some('>') => {
2781                        self.finish_attribute_token_value();
2782                        self.state = State::Data;
2783                        self.emit_tag_token();
2784                    }
2785                    // EOF
2786                    // This is an eof-in-tag parse error. Emit an end-of-file token.
2787                    None => {
2788                        self.finish_attribute_token_value();
2789                        self.emit_error(ErrorKind::EofInTag);
2790                        self.emit_token(Token::Eof);
2791
2792                        return Ok(());
2793                    }
2794                    // Anything else
2795                    // This is a missing-whitespace-between-attributes parse error. Reconsume in
2796                    // the before attribute name state.
2797                    _ => {
2798                        self.finish_attribute_token_value();
2799                        self.emit_error(ErrorKind::MissingWhitespaceBetweenAttributes);
2800                        self.reconsume_in_state(State::BeforeAttributeName);
2801                    }
2802                }
2803            }
2804            // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
2805            State::SelfClosingStartTag => {
2806                // Consume the next input character:
2807                match self.consume_next_char() {
2808                    // U+003E GREATER-THAN SIGN (>)
2809                    // Set the self-closing flag of the current tag token. Switch to the data
2810                    // state. Emit the current tag token.
2811                    Some('>') => {
2812                        if let Some(
2813                            Token::StartTag {
2814                                is_self_closing, ..
2815                            }
2816                            | Token::EndTag {
2817                                is_self_closing, ..
2818                            },
2819                        ) = &mut self.current_token
2820                        {
2821                            *is_self_closing = true;
2822                        }
2823
2824                        self.state = State::Data;
2825                        self.emit_tag_token();
2826                    }
2827                    // EOF
2828                    // This is an eof-in-tag parse error. Emit an end-of-file token.
2829                    None => {
2830                        self.emit_error(ErrorKind::EofInTag);
2831                        self.emit_token(Token::Eof);
2832
2833                        return Ok(());
2834                    }
2835                    // Anything else
2836                    // This is an unexpected-solidus-in-tag parse error. Reconsume in the before
2837                    // attribute name state.
2838                    _ => {
2839                        self.emit_error(ErrorKind::UnexpectedSolidusInTag);
2840                        self.reconsume_in_state(State::BeforeAttributeName);
2841                    }
2842                }
2843            }
2844            // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
2845            State::BogusComment => {
2846                // Consume the next input character:
2847                match self.consume_next_char() {
2848                    // U+003E GREATER-THAN SIGN (>)
2849                    // Switch to the data state. Emit the current comment token.
2850                    Some('>') => {
2851                        self.state = State::Data;
2852                        self.emit_comment_token(Some(">"));
2853                    }
2854                    // EOF
2855                    // Emit the comment. Emit an end-of-file token.
2856                    None => {
2857                        self.emit_comment_token(None);
2858                        self.emit_token(Token::Eof);
2859
2860                        return Ok(());
2861                    }
2862                    // U+0000 NULL
2863                    // This is an unexpected-null-character parse error. Append a U+FFFD
2864                    // REPLACEMENT CHARACTER character to the comment token's data.
2865                    Some(c @ '\x00') => {
2866                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
2867                        self.append_to_comment_token(REPLACEMENT_CHARACTER, c);
2868                    }
2869                    // Anything else
2870                    // Append the current input character to the comment token's data.
2871                    Some(c) => {
2872                        self.validate_input_stream_character(c);
2873                        self.consume_and_append_to_comment_token(c, |c| {
2874                            if !is_allowed_character(c) {
2875                                return false;
2876                            }
2877
2878                            // List of characters from above to stop consumption and a certain
2879                            // branch took control, `\r` is in list because of newline normalization
2880                            !matches!(c, '>' | '\x00' | '\r')
2881                        });
2882                    }
2883                }
2884            }
2885            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2886            State::MarkupDeclarationOpen => {
2887                let cur_pos = self.input.cur_pos();
2888                let anything_else = |lexer: &mut Lexer<'a, I>| {
2889                    lexer.emit_error(ErrorKind::IncorrectlyOpenedComment);
2890                    lexer.create_comment_token("<!");
2891                    lexer.state = State::BogusComment;
2892                    lexer.cur_pos = cur_pos;
2893                    // We don't validate input here because we reset position
2894                    unsafe {
2895                        // Safety: We reset position to the previous one
2896                        lexer.input.reset_to(cur_pos);
2897                    }
2898                };
2899
2900                // If the next few characters are:
2901                match self.consume_next_char() {
2902                    // Two U+002D HYPHEN-MINUS characters (-)
2903                    // Consume those two characters, create a comment token whose data
2904                    // is the empty string, and switch to the comment start state.
2905                    Some('-') => match self.consume_next_char() {
2906                        Some('-') => {
2907                            self.create_comment_token("<!--");
2908                            self.state = State::CommentStart;
2909                        }
2910                        _ => {
2911                            anything_else(self);
2912                        }
2913                    },
2914                    // ASCII case-insensitive match for the word "DOCTYPE"
2915                    // Consume those characters and switch to the DOCTYPE state.
2916                    Some(d @ 'd' | d @ 'D') => match self.consume_next_char() {
2917                        Some(o @ 'o' | o @ 'O') => match self.consume_next_char() {
2918                            Some(c @ 'c' | c @ 'C') => match self.consume_next_char() {
2919                                Some(t @ 't' | t @ 'T') => match self.consume_next_char() {
2920                                    Some(y @ 'y' | y @ 'Y') => match self.consume_next_char() {
2921                                        Some(p @ 'p' | p @ 'P') => match self.consume_next_char() {
2922                                            Some(e @ 'e' | e @ 'E') => {
2923                                                self.state = State::Doctype;
2924
2925                                                let b = self.sub_buf.clone();
2926                                                let mut sub_buf = b.borrow_mut();
2927
2928                                                sub_buf.push('<');
2929                                                sub_buf.push('!');
2930                                                sub_buf.push(d);
2931                                                sub_buf.push(o);
2932                                                sub_buf.push(c);
2933                                                sub_buf.push(t);
2934                                                sub_buf.push(y);
2935                                                sub_buf.push(p);
2936                                                sub_buf.push(e);
2937                                            }
2938                                            _ => {
2939                                                anything_else(self);
2940                                            }
2941                                        },
2942                                        _ => {
2943                                            anything_else(self);
2944                                        }
2945                                    },
2946                                    _ => {
2947                                        anything_else(self);
2948                                    }
2949                                },
2950                                _ => {
2951                                    anything_else(self);
2952                                }
2953                            },
2954                            _ => {
2955                                anything_else(self);
2956                            }
2957                        },
2958                        _ => {
2959                            anything_else(self);
2960                        }
2961                    },
2962                    // The string "[CDATA[" (the five uppercase letters "CDATA" with a
2963                    // U+005B LEFT SQUARE BRACKET character before and after)
2964                    // Consume those characters. If there is an adjusted current node and it
2965                    // is not an element in the HTML namespace, then switch to the CDATA
2966                    // section state. Otherwise, this is a cdata-in-html-content parse
2967                    // error. Create a comment token whose data is the "[CDATA[" string.
2968                    // Switch to the bogus comment state.
2969                    Some('[') => match self.consume_next_char() {
2970                        Some('C') => match self.consume_next_char() {
2971                            Some('D') => match self.consume_next_char() {
2972                                Some('A') => match self.consume_next_char() {
2973                                    Some('T') => match self.consume_next_char() {
2974                                        Some('A') => match self.consume_next_char() {
2975                                            Some('[') => {
2976                                                if let Some(false) = self.is_adjusted_current_node_is_element_in_html_namespace {
2977                                                    self.state = State::CdataSection;
2978                                                } else {
2979                                                    self.emit_error(
2980                                                        ErrorKind::CdataInHtmlContent,
2981                                                    );
2982                                                    self.create_comment_token_with_cdata();
2983
2984                                                    self.state = State::BogusComment;
2985                                                }
2986                                            }
2987                                            _ => {
2988                                                anything_else(self);
2989                                            }
2990                                        }
2991                                        _ => {
2992                                            anything_else(self);
2993                                        }
2994                                    },
2995                                    _ => {
2996                                        anything_else(self);
2997                                    }
2998                                },
2999                                _ => {
3000                                    anything_else(self);
3001                                }
3002                            }
3003                            _ => {
3004                                anything_else(self);
3005                            }
3006                        }
3007                        _ => {
3008                            anything_else(self);
3009                        }
3010                    }
3011                    // Anything else
3012                    // This is an incorrectly-opened-comment parse error. Create a comment token
3013                    // whose data is the empty string. Switch to the bogus comment state (don't
3014                    // consume anything in the current state).
3015                    _ => {
3016                        anything_else(self);
3017                    }
3018                }
3019            }
3020            // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
3021            State::CommentStart => {
3022                // Consume the next input character:
3023                match self.consume_next_char() {
3024                    // U+002D HYPHEN-MINUS (-)
3025                    // Switch to the comment start dash state.
3026                    Some('-') => {
3027                        self.state = State::CommentStartDash;
3028                    }
3029                    // U+003E GREATER-THAN SIGN (>)
3030                    // This is an abrupt-closing-of-empty-comment parse error. Switch to the
3031                    // data state. Emit the current comment token.
3032                    Some('>') => {
3033                        self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
3034                        self.state = State::Data;
3035                        self.emit_comment_token(Some(">"));
3036                    }
3037                    // Anything else
3038                    // Reconsume in the comment state.
3039                    _ => {
3040                        self.reconsume_in_state(State::Comment);
3041                    }
3042                }
3043            }
3044            // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
3045            State::CommentStartDash => {
3046                // Consume the next input character:
3047                match self.consume_next_char() {
3048                    // U+002D HYPHEN-MINUS (-)
3049                    // Switch to the comment end state.
3050                    Some('-') => {
3051                        self.state = State::CommentEnd;
3052                    }
3053                    // U+003E GREATER-THAN SIGN (>)
3054                    // This is an abrupt-closing-of-empty-comment parse error. Switch to the
3055                    // data state. Emit the current comment token.
3056                    Some('>') => {
3057                        self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
3058                        self.state = State::Data;
3059                        self.emit_comment_token(Some("->"));
3060                    }
3061                    // EOF
3062                    // This is an eof-in-comment parse error. Emit the current comment token.
3063                    // Emit an end-of-file token.
3064                    None => {
3065                        self.emit_error(ErrorKind::EofInComment);
3066                        self.emit_comment_token(None);
3067                        self.emit_token(Token::Eof);
3068
3069                        return Ok(());
3070                    }
3071                    // Anything else
3072                    // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
3073                    // Reconsume in the comment state.
3074                    _ => {
3075                        self.append_to_comment_token('-', '-');
3076                        self.reconsume_in_state(State::Comment);
3077                    }
3078                }
3079            }
3080            // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
3081            State::Comment => {
3082                // Consume the next input character:
3083                match self.consume_next_char() {
3084                    // U+003C LESS-THAN SIGN (<)
3085                    // Append the current input character to the comment token's data. Switch to
3086                    // the comment less-than sign state.
3087                    Some(c @ '<') => {
3088                        self.append_to_comment_token(c, c);
3089                        self.state = State::CommentLessThanSign;
3090                    }
3091                    // U+002D HYPHEN-MINUS (-)
3092                    // Switch to the comment end dash state.
3093                    Some('-') => {
3094                        self.state = State::CommentEndDash;
3095                    }
3096                    // U+0000 NULL
3097                    // This is an unexpected-null-character parse error. Append a U+FFFD
3098                    // REPLACEMENT CHARACTER character to the comment token's data.
3099                    Some(c @ '\x00') => {
3100                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
3101                        self.append_to_comment_token(REPLACEMENT_CHARACTER, c);
3102                    }
3103                    // EOF
3104                    // This is an eof-in-comment parse error. Emit the current comment token.
3105                    // Emit an end-of-file token.
3106                    None => {
3107                        self.emit_error(ErrorKind::EofInComment);
3108                        self.emit_comment_token(None);
3109                        self.emit_token(Token::Eof);
3110
3111                        return Ok(());
3112                    }
3113                    // Anything else
3114                    // Append the current input character to the comment token's data.
3115                    Some(c) => {
3116                        self.validate_input_stream_character(c);
3117                        self.consume_and_append_to_comment_token(c, |c| {
3118                            if !is_allowed_character(c) {
3119                                return false;
3120                            }
3121
3122                            // List of characters from above to stop consumption and a certain
3123                            // branch took control, `\r` is in list because of newline normalization
3124                            !matches!(c, '<' | '-' | '\x00' | '\r')
3125                        });
3126                    }
3127                }
3128            }
3129            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
3130            State::CommentLessThanSign => {
3131                // Consume the next input character:
3132                match self.consume_next_char() {
3133                    // U+0021 EXCLAMATION MARK (!)
3134                    // Append the current input character to the comment token's data. Switch to
3135                    // the comment less-than sign bang state.
3136                    Some(c @ '!') => {
3137                        self.append_to_comment_token(c, c);
3138                        self.state = State::CommentLessThanSignBang;
3139                    }
3140                    // U+003C LESS-THAN SIGN (<)
3141                    // Append the current input character to the comment token's data.
3142                    Some(c @ '<') => {
3143                        self.append_to_comment_token(c, c);
3144                    }
3145                    // Anything else
3146                    // Reconsume in the comment state.
3147                    _ => {
3148                        self.reconsume_in_state(State::Comment);
3149                    }
3150                }
3151            }
3152            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
3153            State::CommentLessThanSignBang => {
3154                // Consume the next input character:
3155                match self.consume_next_char() {
3156                    // U+002D HYPHEN-MINUS (-)
3157                    // Switch to the comment less-than sign bang dash state.
3158                    Some('-') => {
3159                        self.state = State::CommentLessThanSignBangDash;
3160                    }
3161                    // Anything else
3162                    // Reconsume in the comment state.
3163                    _ => {
3164                        self.reconsume_in_state(State::Comment);
3165                    }
3166                }
3167            }
3168            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
3169            State::CommentLessThanSignBangDash => {
3170                // Consume the next input character:
3171                match self.consume_next_char() {
3172                    // U+002D HYPHEN-MINUS (-)
3173                    // Switch to the comment less-than sign bang dash dash state.
3174                    Some('-') => {
3175                        self.state = State::CommentLessThanSignBangDashDash;
3176                    }
3177                    // Anything else
3178                    // Reconsume in the comment end dash state.
3179                    _ => {
3180                        self.reconsume_in_state(State::CommentEndDash);
3181                    }
3182                }
3183            }
3184            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
3185            State::CommentLessThanSignBangDashDash => {
3186                // Consume the next input character:
3187                match self.consume_next_char() {
3188                    // U+003E GREATER-THAN SIGN (>)
3189                    // EOF
3190                    // Reconsume in the comment end state.
3191                    Some('>') | None => {
3192                        self.reconsume_in_state(State::CommentEnd);
3193                    }
3194                    // Anything else
3195                    // This is a nested-comment parse error. Reconsume in the comment end state.
3196                    _ => {
3197                        self.emit_error(ErrorKind::NestedComment);
3198                        self.reconsume_in_state(State::CommentEnd);
3199                    }
3200                }
3201            }
3202            // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
3203            State::CommentEndDash => {
3204                // Consume the next input character:
3205                match self.consume_next_char() {
3206                    // U+002D HYPHEN-MINUS (-)
3207                    // Switch to the comment end state.
3208                    Some('-') => {
3209                        self.state = State::CommentEnd;
3210                    }
3211                    // EOF
3212                    // This is an eof-in-comment parse error. Emit the current comment token.
3213                    // Emit an end-of-file token.
3214                    None => {
3215                        self.emit_error(ErrorKind::EofInComment);
3216                        self.emit_comment_token(None);
3217                        self.emit_token(Token::Eof);
3218
3219                        return Ok(());
3220                    }
3221                    // Anything else
3222                    // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
3223                    // Reconsume in the comment state.
3224                    _ => {
3225                        self.append_to_comment_token('-', '-');
3226                        self.reconsume_in_state(State::Comment);
3227                    }
3228                }
3229            }
3230            // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
3231            State::CommentEnd => {
3232                // Consume the next input character:
3233                match self.consume_next_char() {
3234                    // U+003E GREATER-THAN SIGN (>)
3235                    // Switch to the data state. Emit the current comment token.
3236                    Some('>') => {
3237                        self.state = State::Data;
3238                        self.emit_comment_token(Some("-->"));
3239                    }
3240                    // U+0021 EXCLAMATION MARK (!)
3241                    // Switch to the comment end bang state.
3242                    Some('!') => {
3243                        self.state = State::CommentEndBang;
3244                    }
3245                    // U+002D HYPHEN-MINUS (-)
3246                    // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
3247                    Some(c @ '-') => {
3248                        self.append_to_comment_token(c, c);
3249                    }
3250                    // EOF
3251                    // This is an eof-in-comment parse error. Emit the current comment token.
3252                    // Emit an end-of-file token.
3253                    None => {
3254                        self.emit_error(ErrorKind::EofInComment);
3255                        self.emit_comment_token(None);
3256                        self.emit_token(Token::Eof);
3257
3258                        return Ok(());
3259                    }
3260                    // Anything else
3261                    // Append two U+002D HYPHEN-MINUS characters (-) to the comment token's
3262                    // data. Reconsume in the comment state.
3263                    _ => {
3264                        self.append_to_comment_token('-', '-');
3265                        self.append_to_comment_token('-', '-');
3266                        self.reconsume_in_state(State::Comment);
3267                    }
3268                }
3269            }
3270            // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
3271            State::CommentEndBang => {
3272                // Consume the next input character:
3273                match self.consume_next_char() {
3274                    // U+002D HYPHEN-MINUS (-)
3275                    // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION
3276                    // MARK character (!) to the comment token's data. Switch to the comment end
3277                    // dash state.
3278                    Some(c @ '-') => {
3279                        self.append_to_comment_token(c, c);
3280                        self.append_to_comment_token('-', '-');
3281                        self.append_to_comment_token('!', '!');
3282                        self.state = State::CommentEndDash;
3283                    }
3284                    // U+003E GREATER-THAN SIGN (>)
3285                    // This is an incorrectly-closed-comment parse error. Switch to the data
3286                    // state. Emit the current comment token.
3287                    Some('>') => {
3288                        self.emit_error(ErrorKind::IncorrectlyClosedComment);
3289                        self.state = State::Data;
3290                        self.emit_comment_token(Some(">"));
3291                    }
3292                    // EOF
3293                    // This is an eof-in-comment parse error. Emit the current comment token.
3294                    // Emit an end-of-file token.
3295                    None => {
3296                        self.emit_error(ErrorKind::EofInComment);
3297                        self.emit_comment_token(None);
3298                        self.emit_token(Token::Eof);
3299
3300                        return Ok(());
3301                    }
3302                    // Anything else
3303                    // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION
3304                    // MARK character (!) to the comment token's data. Reconsume in the comment
3305                    // state.
3306                    _ => {
3307                        self.append_to_comment_token('-', '-');
3308                        self.append_to_comment_token('-', '-');
3309                        self.append_to_comment_token('!', '!');
3310                        self.reconsume_in_state(State::Comment);
3311                    }
3312                }
3313            }
3314            // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
3315            State::Doctype => {
3316                // Consume the next input character:
3317                match self.consume_next_char() {
3318                    // U+0009 CHARACTER TABULATION (tab)
3319                    // U+000A LINE FEED (LF)
3320                    // U+000C FORM FEED (FF)
3321                    // U+0020 SPACE
3322                    // Switch to the before DOCTYPE name state.
3323                    Some(c) if is_spacy(c) => {
3324                        self.append_raw_to_doctype_token(c);
3325                        self.state = State::BeforeDoctypeName;
3326                    }
3327                    // U+003E GREATER-THAN SIGN (>)
3328                    // Reconsume in the before DOCTYPE name state.
3329                    Some('>') => {
3330                        self.reconsume_in_state(State::BeforeDoctypeName);
3331                    }
3332                    // EOF
3333                    // This is an eof-in-doctype parse error. Create a new DOCTYPE token. Set
3334                    // its force-quirks flag to on. Emit the current token. Emit an end-of-file
3335                    // token.
3336                    None => {
3337                        self.emit_error(ErrorKind::EofInDoctype);
3338                        self.create_doctype_token();
3339                        self.set_doctype_token_force_quirks();
3340                        self.emit_doctype_token();
3341                        self.emit_token(Token::Eof);
3342
3343                        return Ok(());
3344                    }
3345                    // Anything else
3346                    // This is a missing-whitespace-before-doctype-name parse error. Reconsume
3347                    // in the before DOCTYPE name state.
3348                    _ => {
3349                        self.emit_error(ErrorKind::MissingWhitespaceBeforeDoctypeName);
3350                        self.reconsume_in_state(State::BeforeDoctypeName);
3351                    }
3352                }
3353            }
3354            // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
                //
                // Runs before the DOCTYPE name starts: whitespace is skipped and any other
                // input creates the DOCTYPE token. Every consumed character is also passed
                // to `append_raw_to_doctype_token`, a step the quoted spec text does not
                // mention.
3355            State::BeforeDoctypeName => {
3356                // Consume the next input character:
3357                match self.consume_next_char() {
3358                    // U+0009 CHARACTER TABULATION (tab)
3359                    // U+000A LINE FEED (LF)
3360                    // U+000C FORM FEED (FF)
3361                    // U+0020 SPACE
3362                    // Ignore the character.
3363                    Some(c) if is_spacy(c) => {
                            // "Ignored" only as far as the token is concerned; the state is
                            // left unchanged, so following whitespace lands here again.
3364                        self.append_raw_to_doctype_token(c);
3365                    }
3366                    // ASCII upper alpha
3367                    // Create a new DOCTYPE token. Set the token's name to the lowercase version
3368                    // of the current input character (add 0x0020 to the character's code
3369                    // point). Switch to the DOCTYPE name state.
3370                    Some(c) if is_ascii_upper_alpha(c) => {
                            // NOTE(review): the raw character is appended before the token is
                            // created - presumably the raw text is kept outside the token;
                            // confirm in `create_doctype_token`.
3371                        self.append_raw_to_doctype_token(c);
3372                        self.create_doctype_token();
3373                        self.set_doctype_token_name(c.to_ascii_lowercase());
3374                        self.state = State::DoctypeName;
3375                    }
3376                    // U+0000 NULL
3377                    // This is an unexpected-null-character parse error. Create a new DOCTYPE
3378                    // token. Set the token's name to a U+FFFD REPLACEMENT CHARACTER character.
3379                    // Switch to the DOCTYPE name state.
3380                    Some(c @ '\x00') => {
                            // The raw helper receives the original NUL; only the name gets the
                            // replacement character.
3381                        self.append_raw_to_doctype_token(c);
3382                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
3383                        self.create_doctype_token();
3384                        self.set_doctype_token_name(REPLACEMENT_CHARACTER);
3385                        self.state = State::DoctypeName;
3386                    }
3387                    // U+003E GREATER-THAN SIGN (>)
3388                    // This is a missing-doctype-name parse error. Create a new DOCTYPE token.
3389                    // Set its force-quirks flag to on. Switch to the data state. Emit the
3390                    // current token.
3391                    Some(c @ '>') => {
3392                        self.append_raw_to_doctype_token(c);
3393                        self.emit_error(ErrorKind::MissingDoctypeName);
3394                        self.create_doctype_token();
3395                        self.set_doctype_token_force_quirks();
3396                        self.state = State::Data;
3397                        self.emit_doctype_token();
3398                    }
3399                    // EOF
3400                    // This is an eof-in-doctype parse error. Create a new DOCTYPE token. Set
3401                    // its force-quirks flag to on. Emit the current token. Emit an end-of-file
3402                    // token.
3403                    None => {
3404                        self.emit_error(ErrorKind::EofInDoctype);
3405                        self.create_doctype_token();
3406                        self.set_doctype_token_force_quirks();
3407                        self.emit_doctype_token();
3408                        self.emit_token(Token::Eof);
3409
                            // Input is exhausted, so leave the enclosing function.
3410                        return Ok(());
3411                    }
3412                    // Anything else
3413                    // Create a new DOCTYPE token. Set the token's name to the current input
3414                    // character. Switch to the DOCTYPE name state.
3415                    Some(c) => {
                            // Unlike the arms above, this one first hands the character to
                            // `validate_input_stream_character`.
3416                        self.validate_input_stream_character(c);
3417                        self.append_raw_to_doctype_token(c);
3418                        self.create_doctype_token();
3419                        self.set_doctype_token_name(c);
3420                        self.state = State::DoctypeName;
3421                    }
3422                }
3423            }
3424            // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
                //
                // Collects the DOCTYPE name. Whitespace, `>` and EOF end the name; each of
                // those paths calls `finish_doctype_token_name` before leaving the state.
3425            State::DoctypeName => {
3426                // Consume the next input character:
3427                match self.consume_next_char() {
3428                    // U+0009 CHARACTER TABULATION (tab)
3429                    // U+000A LINE FEED (LF)
3430                    // U+000C FORM FEED (FF)
3431                    // U+0020 SPACE
3432                    // Switch to the after DOCTYPE name state.
3433                    Some(c) if is_spacy(c) => {
3434                        self.append_raw_to_doctype_token(c);
3435                        self.finish_doctype_token_name();
3436                        self.state = State::AfterDoctypeName;
3437                    }
3438                    // U+003E GREATER-THAN SIGN (>)
3439                    // Switch to the data state. Emit the current DOCTYPE token.
3440                    Some(c @ '>') => {
3441                        self.append_raw_to_doctype_token(c);
3442                        self.finish_doctype_token_name();
3443                        self.state = State::Data;
3444                        self.emit_doctype_token();
3445                    }
3446                    // ASCII upper alpha
3447                    // Append the lowercase version of the current input character (add 0x0020
3448                    // to the character's code point) to the current DOCTYPE token's name.
3449                    Some(c) if is_ascii_upper_alpha(c) => {
                            // NOTE(review): no lowercasing and no raw append are visible here;
                            // both are presumably done inside the helper, and the predicate
                            // presumably lets it take the rest of an upper-case run in one
                            // call - confirm against the helper.
3450                        self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha);
3451                    }
3452                    // U+0000 NULL
3453                    // This is an unexpected-null-character parse error. Append a U+FFFD
3454                    // REPLACEMENT CHARACTER character to the current DOCTYPE token's name.
3455                    Some(c @ '\x00') => {
3456                        self.append_raw_to_doctype_token(c);
3457                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
                            // Only the first argument is set here; the public identifier
                            // states fill the second one instead.
3458                        self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER), None, None);
3459                    }
3460                    // EOF
3461                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3462                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3463                    // end-of-file token.
3464                    None => {
3465                        self.emit_error(ErrorKind::EofInDoctype);
3466                        self.set_doctype_token_force_quirks();
3467                        self.finish_doctype_token_name();
3468                        self.emit_doctype_token();
3469                        self.emit_token(Token::Eof);
3470
                            // Input is exhausted, so leave the enclosing function.
3471                        return Ok(());
3472                    }
3473                    // Anything else
3474                    // Append the current input character to the current DOCTYPE token's name.
3475                    Some(c) => {
3476                        self.validate_input_stream_character(c);
                            // The predicate is false for every character another arm of this
                            // state matches (whitespace, `>`, NUL, ASCII upper alpha) and for
                            // anything `is_allowed_character` rejects.
3477                        self.consume_and_append_to_doctype_token_name(c, |c| {
3478                            if !is_allowed_character(c) {
3479                                return false;
3480                            }
3481
3482                            !is_spacy(c) && !matches!(c, '>' | '\x00') && !is_ascii_upper_alpha(c)
3483                        });
3484                    }
3485                }
3486            }
3487            // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
                //
                // Runs after the DOCTYPE name: skips whitespace, finishes the token on `>` or
                // EOF, and otherwise looks ahead for the `PUBLIC` / `SYSTEM` keyword.
3488            State::AfterDoctypeName => {
                    // Remember where this state started so the keyword lookahead below can be
                    // undone when no keyword is found.
3489                let cur_pos = self.input.cur_pos();
3490
3491                // Consume the next input character:
3492                match self.consume_next_char() {
3493                    // U+0009 CHARACTER TABULATION (tab)
3494                    // U+000A LINE FEED (LF)
3495                    // U+000C FORM FEED (FF)
3496                    // U+0020 SPACE
3497                    // Ignore the character.
3498                    Some(c) if is_spacy(c) => {
3499                        self.append_raw_to_doctype_token(c);
3500                    }
3501                    // U+003E GREATER-THAN SIGN (>)
3502                    // Switch to the data state. Emit the current DOCTYPE token.
3503                    Some(c @ '>') => {
3504                        self.append_raw_to_doctype_token(c);
3505                        self.state = State::Data;
3506                        self.emit_doctype_token();
3507                    }
3508                    // EOF
3509                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3510                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3511                    // end-of-file token.
3512                    None => {
3513                        self.emit_error(ErrorKind::EofInDoctype);
3514                        self.set_doctype_token_force_quirks();
3515                        self.emit_doctype_token();
3516                        self.emit_token(Token::Eof);
3517
                            // Input is exhausted, so leave the enclosing function.
3518                        return Ok(());
3519                    }
3520                    // Anything else
3521                    // If the six characters starting from the current input character are an
3522                    // ASCII case-insensitive match for the word "PUBLIC", then consume those
3523                    // characters and switch to the after DOCTYPE public keyword state.
3524                    //
3525                    // Otherwise, if the six characters starting from the current input
3526                    // character are an ASCII case-insensitive match for the word "SYSTEM", then
3527                    // consume those characters and switch to the after DOCTYPE system keyword
3528                    // state.
3529                    //
3530                    // Otherwise, this is an invalid-character-sequence-after-doctype-name parse
3531                    // error. Set the current DOCTYPE token's force-quirks flag to on. Reconsume
3532                    // in the bogus DOCTYPE state.
3533                    Some(c) => {
                            // NOTE(review): the comparison below looks at the whole buffer, so
                            // `self.buf` is assumed to be empty on entry - confirm.
3534                        let b = self.buf.clone();
3535                        let mut buf = b.borrow_mut();
3536
3537                        buf.push(c);
3538
                            // Pull in up to five more characters, stopping early at end of
                            // input.
3539                        for _ in 0..5 {
3540                            match self.consume_next_char() {
3541                                Some(c) => {
3542                                    buf.push(c);
3543                                }
3544                                _ => {
3545                                    break;
3546                                }
3547                            }
3548                        }
3549
                            // The spec asks for an ASCII case-insensitive match, so compare in
                            // place instead of allocating a Unicode-lowercased copy of the
                            // buffer.
                            if buf.eq_ignore_ascii_case("public") {
                                self.state = State::AfterDoctypePublicKeyword;

                                let b = self.sub_buf.clone();
                                let mut sub_buf = b.borrow_mut();

                                // Carry the keyword over exactly as written, then empty the
                                // lookahead buffer.
                                sub_buf.push_str(&buf);
                                buf.clear();
                            } else if buf.eq_ignore_ascii_case("system") {
                                self.state = State::AfterDoctypeSystemKeyword;

                                let b = self.sub_buf.clone();
                                let mut sub_buf = b.borrow_mut();

                                // Same handling as for `PUBLIC`.
                                sub_buf.push_str(&buf);
                                buf.clear();
                            } else {
                                // No keyword: drop the lookahead and rewind the input to where
                                // this state began before switching to the bogus DOCTYPE state.
                                buf.clear();
                                self.cur_pos = cur_pos;
                                unsafe {
                                    // Safety: We got cur_pos from self.input.cur_pos() above, so
                                    // it's a valid position.
                                    self.input.reset_to(cur_pos);
                                }
                                self.emit_error(
                                    ErrorKind::InvalidCharacterSequenceAfterDoctypeName,
                                );
                                self.set_doctype_token_force_quirks();
                                self.reconsume_in_state(State::BogusDoctype);
3583                        }
3584                    }
3585                }
3586            }
3587            // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
                //
                // Entered from the after DOCTYPE name state once the `PUBLIC` keyword has been
                // matched. Decides how the public identifier starts, or gives up on the
                // DOCTYPE.
3588            State::AfterDoctypePublicKeyword => {
3589                // Consume the next input character:
3590                match self.consume_next_char() {
3591                    // U+0009 CHARACTER TABULATION (tab)
3592                    // U+000A LINE FEED (LF)
3593                    // U+000C FORM FEED (FF)
3594                    // U+0020 SPACE
3595                    // Switch to the before DOCTYPE public identifier state.
3596                    Some(c) if is_spacy(c) => {
3597                        self.append_raw_to_doctype_token(c);
3598                        self.state = State::BeforeDoctypePublicIdentifier;
3599                    }
3600                    // U+0022 QUOTATION MARK (")
3601                    // This is a missing-whitespace-after-doctype-public-keyword parse error.
3602                    // Set the current DOCTYPE token's public identifier to the empty string
3603                    // (not missing), then switch to the DOCTYPE public identifier
3604                    // (double-quoted) state.
3605                    Some(c @ '"') => {
3606                        self.append_raw_to_doctype_token(c);
3607                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
3608                        self.set_doctype_token_public_id();
3609                        self.state = State::DoctypePublicIdentifierDoubleQuoted;
3610                    }
3611                    // U+0027 APOSTROPHE (')
3612                    // This is a missing-whitespace-after-doctype-public-keyword parse error.
3613                    // Set the current DOCTYPE token's public identifier to the empty string
3614                    // (not missing), then switch to the DOCTYPE public identifier
3615                    // (single-quoted) state.
3616                    Some(c @ '\'') => {
                            // Same steps as the double-quote arm; only the target state
                            // differs.
3617                        self.append_raw_to_doctype_token(c);
3618                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
3619                        self.set_doctype_token_public_id();
3620                        self.state = State::DoctypePublicIdentifierSingleQuoted;
3621                    }
3622                    // U+003E GREATER-THAN SIGN (>)
3623                    // This is a missing-doctype-public-identifier parse error. Set the current
3624                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3625                    // the current DOCTYPE token.
3626                    Some(c @ '>') => {
3627                        self.append_raw_to_doctype_token(c);
3628                        self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
3629                        self.set_doctype_token_force_quirks();
3630                        self.state = State::Data;
3631                        self.emit_doctype_token();
3632                    }
3633                    // EOF
3634                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3635                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3636                    // end-of-file token.
3637                    None => {
3638                        self.emit_error(ErrorKind::EofInDoctype);
3639                        self.set_doctype_token_force_quirks();
3640                        self.emit_doctype_token();
3641                        self.emit_token(Token::Eof);
3642
                            // Input is exhausted, so leave the enclosing function.
3643                        return Ok(());
3644                    }
3645                    // Anything else
3646                    // This is a missing-quote-before-doctype-public-identifier parse error. Set
3647                    // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3648                    // bogus DOCTYPE state.
                        //
                        // `None` is matched above, so `_` only sees the remaining characters.
                        // Nothing is appended to the raw text here - presumably because the
                        // character is reconsumed by the next state; confirm.
3649                    _ => {
3650                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier);
3651                        self.set_doctype_token_force_quirks();
3652                        self.reconsume_in_state(State::BogusDoctype);
3653                    }
3654                }
3655            }
3656            // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
                //
                // Entered from the after DOCTYPE public keyword state on whitespace. Skips any
                // further whitespace and waits for the quote that opens the public
                // identifier.
3657            State::BeforeDoctypePublicIdentifier => {
3658                // Consume the next input character:
3659                match self.consume_next_char() {
3660                    // U+0009 CHARACTER TABULATION (tab)
3661                    // U+000A LINE FEED (LF)
3662                    // U+000C FORM FEED (FF)
3663                    // U+0020 SPACE
3664                    // Ignore the character.
3665                    Some(c) if is_spacy(c) => {
                            // "Ignored" only as far as the token is concerned; the character
                            // is still passed to the raw helper and the state is unchanged.
3666                        self.append_raw_to_doctype_token(c);
3667                    }
3668                    // U+0022 QUOTATION MARK (")
3669                    // Set the current DOCTYPE token's public identifier to the empty string
3670                    // (not missing), then switch to the DOCTYPE public identifier
3671                    // (double-quoted) state.
3672                    Some(c @ '"') => {
3673                        self.append_raw_to_doctype_token(c);
3674                        self.set_doctype_token_public_id();
3675                        self.state = State::DoctypePublicIdentifierDoubleQuoted;
3676                    }
3677                    // U+0027 APOSTROPHE (')
3678                    // Set the current DOCTYPE token's public identifier to the empty string
3679                    // (not missing), then switch to the DOCTYPE public identifier
3680                    // (single-quoted) state.
3681                    Some(c @ '\'') => {
3682                        self.append_raw_to_doctype_token(c);
3683                        self.set_doctype_token_public_id();
3684                        self.state = State::DoctypePublicIdentifierSingleQuoted;
3685                    }
3686                    // U+003E GREATER-THAN SIGN (>)
3687                    // This is a missing-doctype-public-identifier parse error. Set the current
3688                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3689                    // the current DOCTYPE token.
3690                    Some(c @ '>') => {
3691                        self.append_raw_to_doctype_token(c);
3692                        self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
3693                        self.set_doctype_token_force_quirks();
3694                        self.state = State::Data;
3695                        self.emit_doctype_token();
3696                    }
3697                    // EOF
3698                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3699                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3700                    // end-of-file token.
3701                    None => {
3702                        self.emit_error(ErrorKind::EofInDoctype);
3703                        self.set_doctype_token_force_quirks();
3704                        self.emit_doctype_token();
3705                        self.emit_token(Token::Eof);
3706
                            // Input is exhausted, so leave the enclosing function.
3707                        return Ok(());
3708                    }
3709                    // Anything else
3710                    // This is a missing-quote-before-doctype-public-identifier parse error. Set
3711                    // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3712                    // bogus DOCTYPE state.
                        //
                        // `None` is matched above, so `_` only sees the remaining characters;
                        // the character is not appended to the raw text before it is
                        // reconsumed.
3713                    _ => {
3714                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier);
3715                        self.set_doctype_token_force_quirks();
3716                        self.reconsume_in_state(State::BogusDoctype);
3717                    }
3718                }
3719            }
3720            // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
                //
                // Collects the public identifier that follows an opening `"`. The closing
                // quote, `>` and EOF each call `finish_doctype_token_public_id` before the
                // state is left.
3721            State::DoctypePublicIdentifierDoubleQuoted => {
3722                // Consume the next input character:
3723                match self.consume_next_char() {
3724                    // U+0022 QUOTATION MARK (")
3725                    // Switch to the after DOCTYPE public identifier state.
3726                    Some(c @ '"') => {
3727                        self.append_raw_to_doctype_token(c);
3728                        self.finish_doctype_token_public_id();
3729                        self.state = State::AfterDoctypePublicIdentifier;
3730                    }
3731                    // U+0000 NULL
3732                    // This is an unexpected-null-character parse error. Append a U+FFFD
3733                    // REPLACEMENT CHARACTER character to the current DOCTYPE token's public
3734                    // identifier.
3735                    Some(c @ '\x00') => {
                            // The raw helper receives the original NUL; the identifier gets
                            // the replacement character (second argument slot).
3736                        self.append_raw_to_doctype_token(c);
3737                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
3738                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None);
3739                    }
3740                    // U+003E GREATER-THAN SIGN (>)
3741                    // This is an abrupt-doctype-public-identifier parse error. Set the current
3742                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3743                    // the current DOCTYPE token.
3744                    Some(c @ '>') => {
3745                        self.append_raw_to_doctype_token(c);
3746                        self.finish_doctype_token_public_id();
3747                        self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
3748                        self.set_doctype_token_force_quirks();
3749                        self.state = State::Data;
3750                        self.emit_doctype_token();
3751                    }
3752                    // EOF
3753                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3754                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3755                    // end-of-file token.
3756                    None => {
3757                        self.finish_doctype_token_public_id();
3758                        self.emit_error(ErrorKind::EofInDoctype);
3759                        self.set_doctype_token_force_quirks();
3760                        self.emit_doctype_token();
3761                        self.emit_token(Token::Eof);
3762
                            // Input is exhausted, so leave the enclosing function.
3763                        return Ok(());
3764                    }
3765                    // Anything else
3766                    // Append the current input character to the current DOCTYPE token's public
3767                    // identifier.
3768                    Some(c) => {
3769                        self.validate_input_stream_character(c);
                            // The predicate is false for `"`, NUL and `>` (matched by the arms
                            // above), for `\r`, and for anything `is_allowed_character`
                            // rejects.
                            // NOTE(review): no arm of this state matches `\r`; it is
                            // presumably excluded so that it is handled one character at a
                            // time - confirm against the helper.
3770                        self.consume_and_append_to_doctype_token_public_id(c, |c| {
3771                            if !is_allowed_character(c) {
3772                                return false;
3773                            }
3774
3775                            !matches!(c, '"' | '\x00' | '>' | '\r')
3776                        });
3777                    }
3778                }
3779            }
3780            // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
                //
                // Collects the public identifier that follows an opening `'`. The closing
                // quote, `>` and EOF each call `finish_doctype_token_public_id` before the
                // state is left.
3781            State::DoctypePublicIdentifierSingleQuoted => {
3782                // Consume the next input character:
3783                match self.consume_next_char() {
3784                    // U+0027 APOSTROPHE (')
3785                    // Switch to the after DOCTYPE public identifier state.
3786                    Some(c @ '\'') => {
                            // NOTE(review): `finish_doctype_token_public_id` runs before
                            // `append_raw_to_doctype_token` here, the reverse of the
                            // double-quoted state - confirm the order does not matter.
3787                        self.finish_doctype_token_public_id();
3788                        self.append_raw_to_doctype_token(c);
3789                        self.state = State::AfterDoctypePublicIdentifier;
3790                    }
3791                    // U+0000 NULL
3792                    // This is an unexpected-null-character parse error. Append a U+FFFD
3793                    // REPLACEMENT CHARACTER character to the current DOCTYPE token's public
3794                    // identifier.
3795                    Some(c @ '\x00') => {
                            // The raw helper receives the original NUL; the identifier gets
                            // the replacement character (second argument slot).
3796                        self.append_raw_to_doctype_token(c);
3797                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
3798                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None);
3799                    }
3800                    // U+003E GREATER-THAN SIGN (>)
3801                    // This is an abrupt-doctype-public-identifier parse error. Set the current
3802                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3803                    // the current DOCTYPE token.
3804                    Some(c @ '>') => {
                            // NOTE(review): same reversed helper order as in the closing
                            // quote arm above.
3805                        self.finish_doctype_token_public_id();
3806                        self.append_raw_to_doctype_token(c);
3807                        self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
3808                        self.set_doctype_token_force_quirks();
3809                        self.state = State::Data;
3810                        self.emit_doctype_token();
3811                    }
3812                    // EOF
3813                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3814                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3815                    // end-of-file token.
3816                    None => {
3817                        self.finish_doctype_token_public_id();
3818                        self.emit_error(ErrorKind::EofInDoctype);
3819                        self.set_doctype_token_force_quirks();
3820                        self.emit_doctype_token();
3821                        self.emit_token(Token::Eof);
3822
                            // Input is exhausted, so leave the enclosing function.
3823                        return Ok(());
3824                    }
3825                    // Anything else
3826                    // Append the current input character to the current DOCTYPE token's public
3827                    // identifier.
3828                    Some(c) => {
3829                        self.validate_input_stream_character(c);
                            // The predicate is false for `'`, NUL and `>` (matched by the arms
                            // above), for `\r`, and for anything `is_allowed_character`
                            // rejects.
                            // NOTE(review): no arm of this state matches `\r`; it is
                            // presumably excluded so that it is handled one character at a
                            // time - confirm against the helper.
3830                        self.consume_and_append_to_doctype_token_public_id(c, |c| {
3831                            if !is_allowed_character(c) {
3832                                return false;
3833                            }
3834
3835                            !matches!(c, '\'' | '\x00' | '>' | '\r')
3836                        });
3837                    }
3838                }
3839            }
3840            // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
                //
                // Entered once the public identifier's closing quote has been consumed.
                // Either finishes the DOCTYPE or moves on towards the system identifier.
3841            State::AfterDoctypePublicIdentifier => {
3842                // Consume the next input character:
3843                match self.consume_next_char() {
3844                    // U+0009 CHARACTER TABULATION (tab)
3845                    // U+000A LINE FEED (LF)
3846                    // U+000C FORM FEED (FF)
3847                    // U+0020 SPACE
3848                    // Switch to the between DOCTYPE public and system identifiers state.
3849                    Some(c) if is_spacy(c) => {
3850                        self.append_raw_to_doctype_token(c);
3851                        self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
3852                    }
3853                    // U+003E GREATER-THAN SIGN (>)
3854                    // Switch to the data state. Emit the current DOCTYPE token.
3855                    Some(c @ '>') => {
3856                        self.append_raw_to_doctype_token(c);
3857                        self.state = State::Data;
3858                        self.emit_doctype_token();
3859                    }
3860                    // U+0022 QUOTATION MARK (")
3861                    // This is a missing-whitespace-between-doctype-public-and-system-identifiers
3862                    // parse error. Set the current DOCTYPE token's system
3863                    // identifier to the empty string (not missing), then switch
3864                    // to the DOCTYPE system identifier (double-quoted) state.
3865                    Some(c @ '"') => {
3866                        self.append_raw_to_doctype_token(c);
3867                        self.emit_error(
3868                            ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
3869                        );
3870                        self.set_doctype_token_system_id();
3871                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
3872                    }
3873                    // U+0027 APOSTROPHE (')
3874                    // This is a missing-whitespace-between-doctype-public-and-system-identifiers
3875                    // parse error. Set the current DOCTYPE token's system
3876                    // identifier to the empty string (not missing), then switch
3877                    // to the DOCTYPE system identifier (single-quoted) state.
3878                    Some(c @ '\'') => {
                            // Same steps as the double-quote arm; only the target state
                            // differs.
3879                        self.append_raw_to_doctype_token(c);
3880                        self.emit_error(
3881                            ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
3882                        );
3883                        self.set_doctype_token_system_id();
3884                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
3885                    }
3886                    // EOF
3887                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3888                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3889                    // end-of-file token.
3890                    None => {
3891                        self.emit_error(ErrorKind::EofInDoctype);
3892                        self.set_doctype_token_force_quirks();
3893                        self.emit_doctype_token();
3894                        self.emit_token(Token::Eof);
3895
                            // Input is exhausted, so leave the enclosing function.
3896                        return Ok(());
3897                    }
3898                    // Anything else
3899                    // This is a missing-quote-before-doctype-system-identifier parse error. Set
3900                    // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3901                    // bogus DOCTYPE state.
                        //
                        // `None` is matched above, so `_` only sees the remaining characters;
                        // the character is not appended to the raw text before it is
                        // reconsumed.
3902                    _ => {
3903                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
3904                        self.set_doctype_token_force_quirks();
3905                        self.reconsume_in_state(State::BogusDoctype);
3906                    }
3907                }
3908            }
3909            // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
                //
                // Entered from the after DOCTYPE public identifier state on whitespace. Skips
                // further whitespace, then either finishes the DOCTYPE or starts the system
                // identifier. Unlike that state, a quote here is not a parse error.
3910            State::BetweenDoctypePublicAndSystemIdentifiers => {
3911                // Consume the next input character:
3912                match self.consume_next_char() {
3913                    // U+0009 CHARACTER TABULATION (tab)
3914                    // U+000A LINE FEED (LF)
3915                    // U+000C FORM FEED (FF)
3916                    // U+0020 SPACE
3917                    // Ignore the character.
3918                    Some(c) if is_spacy(c) => {
                            // "Ignored" only as far as the token is concerned; the character
                            // is still passed to the raw helper and the state is unchanged.
3919                        self.append_raw_to_doctype_token(c);
3920                    }
3921                    // U+003E GREATER-THAN SIGN (>)
3922                    // Switch to the data state. Emit the current DOCTYPE token.
3923                    Some(c @ '>') => {
3924                        self.append_raw_to_doctype_token(c);
3925                        self.state = State::Data;
3926                        self.emit_doctype_token();
3927                    }
3928                    // U+0022 QUOTATION MARK (")
3929                    // Set the current DOCTYPE token's system identifier to the empty string
3930                    // (not missing), then switch to the DOCTYPE system identifier
3931                    // (double-quoted) state.
3932                    Some(c @ '"') => {
3933                        self.append_raw_to_doctype_token(c);
3934                        self.set_doctype_token_system_id();
3935                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
3936                    }
3937                    // U+0027 APOSTROPHE (')
3938                    // Set the current DOCTYPE token's system identifier to the empty string
3939                    // (not missing), then switch to the DOCTYPE system identifier
3940                    // (single-quoted) state.
3941                    Some(c @ '\'') => {
3942                        self.append_raw_to_doctype_token(c);
3943                        self.set_doctype_token_system_id();
3944                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
3945                    }
3946                    // EOF
3947                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3948                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3949                    // end-of-file token.
3950                    None => {
3951                        self.emit_error(ErrorKind::EofInDoctype);
3952                        self.set_doctype_token_force_quirks();
3953                        self.emit_doctype_token();
3954                        self.emit_token(Token::Eof);
3955
                            // Input is exhausted, so leave the enclosing function.
3956                        return Ok(());
3957                    }
3958                    // Anything else
3959                    // This is a missing-quote-before-doctype-system-identifier parse error. Set
3960                    // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3961                    // bogus DOCTYPE state.
                        //
                        // `None` is matched above, so `_` only sees the remaining characters;
                        // the character is not appended to the raw text before it is
                        // reconsumed.
3962                    _ => {
3963                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
3964                        self.set_doctype_token_force_quirks();
3965                        self.reconsume_in_state(State::BogusDoctype);
3966                    }
3967                }
3968            }
3969            // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
3970            State::AfterDoctypeSystemKeyword => {
3971                // Consume the next input character:
3972                match self.consume_next_char() {
3973                    // U+0009 CHARACTER TABULATION (tab)
3974                    // U+000A LINE FEED (LF)
3975                    // U+000C FORM FEED (FF)
3976                    // U+0020 SPACE
3977                    // Switch to the before DOCTYPE system identifier state.
3978                    Some(c) if is_spacy(c) => {
3979                        self.append_raw_to_doctype_token(c);
3980                        self.state = State::BeforeDoctypeSystemIdentifier;
3981                    }
3982                    // U+0022 QUOTATION MARK (")
3983                    // This is a missing-whitespace-after-doctype-system-keyword parse error.
3984                    // Set the current DOCTYPE token's system identifier to the empty string
3985                    // (not missing), then switch to the DOCTYPE system identifier
3986                    // (double-quoted) state.
3987                    Some(c @ '"') => {
3988                        self.append_raw_to_doctype_token(c);
3989                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
3990                        self.set_doctype_token_system_id();
3991                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
3992                    }
3993                    // U+0027 APOSTROPHE (')
3994                    // This is a missing-whitespace-after-doctype-system-keyword parse error.
3995                    // Set the current DOCTYPE token's system identifier to the empty string
3996                    // (not missing), then switch to the DOCTYPE system identifier
3997                    // (single-quoted) state.
3998                    Some(c @ '\'') => {
3999                        self.append_raw_to_doctype_token(c);
4000                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
4001                        self.set_doctype_token_system_id();
4002                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
4003                    }
4004                    // U+003E GREATER-THAN SIGN (>)
4005                    // This is a missing-doctype-system-identifier parse error. Set the current
4006                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4007                    // the current DOCTYPE token.
4008                    Some(c @ '>') => {
4009                        self.append_raw_to_doctype_token(c);
4010                        self.emit_error(ErrorKind::MissingDoctypeSystemIdentifier);
4011                        self.set_doctype_token_force_quirks();
4012                        self.state = State::Data;
4013                        self.emit_doctype_token();
4014                    }
4015                    // EOF
4016                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4017                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4018                    // end-of-file token.
4019                    None => {
4020                        self.emit_error(ErrorKind::EofInDoctype);
4021                        self.set_doctype_token_force_quirks();
4022                        self.emit_doctype_token();
4023                        self.emit_token(Token::Eof);
4024
4025                        return Ok(());
4026                    }
4027                    // Anything else
4028                    // This is a missing-quote-before-doctype-system-identifier parse error. Set
4029                    // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
4030                    // bogus DOCTYPE state.
4031                    _ => {
4032                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
4033                        self.set_doctype_token_force_quirks();
4034                        self.reconsume_in_state(State::BogusDoctype);
4035                    }
4036                }
4037            }
4038            // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
4039            State::BeforeDoctypeSystemIdentifier => {
4040                // Consume the next input character:
4041                match self.consume_next_char() {
4042                    // U+0009 CHARACTER TABULATION (tab)
4043                    // U+000A LINE FEED (LF)
4044                    // U+000C FORM FEED (FF)
4045                    // U+0020 SPACE
4046                    // Ignore the character.
4047                    Some(c) if is_spacy(c) => {
4048                        self.append_raw_to_doctype_token(c);
4049                    }
4050                    // U+0022 QUOTATION MARK (")
4051                    // Set the current DOCTYPE token's system identifier to the empty string
4052                    // (not missing), then switch to the DOCTYPE system identifier
4053                    // (double-quoted) state.
4054                    Some(c @ '"') => {
4055                        self.append_raw_to_doctype_token(c);
4056                        self.set_doctype_token_system_id();
4057                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
4058                    }
4059                    // U+0027 APOSTROPHE (')
4060                    // Set the current DOCTYPE token's system identifier to the empty string
4061                    // (not missing), then switch to the DOCTYPE system identifier
4062                    // (single-quoted) state.
4063                    Some(c @ '\'') => {
4064                        self.append_raw_to_doctype_token(c);
4065                        self.set_doctype_token_system_id();
4066                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
4067                    }
4068                    // U+003E GREATER-THAN SIGN (>)
4069                    // This is a missing-doctype-system-identifier parse error. Set the current
4070                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4071                    // the current DOCTYPE token.
4072                    Some(c @ '>') => {
4073                        self.append_raw_to_doctype_token(c);
4074                        self.emit_error(ErrorKind::EofInDoctype);
4075                        self.set_doctype_token_force_quirks();
4076                        self.state = State::Data;
4077                        self.emit_doctype_token();
4078                    }
4079                    // EOF
4080                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4081                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4082                    // end-of-file token.
4083                    None => {
4084                        self.emit_error(ErrorKind::EofInDoctype);
4085                        self.set_doctype_token_force_quirks();
4086                        self.emit_doctype_token();
4087                        self.emit_token(Token::Eof);
4088
4089                        return Ok(());
4090                    }
4091                    // Anything else
4092                    // This is a missing-quote-before-doctype-system-identifier parse error. Set
4093                    // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
4094                    // bogus DOCTYPE state.
4095                    _ => {
4096                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
4097                        self.set_doctype_token_force_quirks();
4098                        self.reconsume_in_state(State::BogusDoctype);
4099                    }
4100                }
4101            }
4102            // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
4103            State::DoctypeSystemIdentifierDoubleQuoted => {
4104                // Consume the next input character:
4105                match self.consume_next_char() {
                    // U+0022 QUOTATION MARK (")
4107                    // Switch to the after DOCTYPE system identifier state.
4108                    Some(c @ '"') => {
4109                        self.finish_doctype_token_system_id();
4110                        self.append_raw_to_doctype_token(c);
4111                        self.state = State::AfterDoctypeSystemIdentifier;
4112                    }
4113                    // U+0000 NULL
4114                    // This is an unexpected-null-character parse error. Append a U+FFFD
4115                    // REPLACEMENT CHARACTER character to the current DOCTYPE token's system
4116                    // identifier.
4117                    Some(c @ '\x00') => {
4118                        self.append_raw_to_doctype_token(c);
4119                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
4120                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER));
4121                    }
4122                    // U+003E GREATER-THAN SIGN (>)
4123                    // This is an abrupt-doctype-system-identifier parse error. Set the current
4124                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4125                    // the current DOCTYPE token.
4126                    Some(c @ '>') => {
4127                        self.finish_doctype_token_system_id();
4128                        self.append_raw_to_doctype_token(c);
4129                        self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
4130                        self.set_doctype_token_force_quirks();
4131                        self.state = State::Data;
4132                        self.emit_doctype_token();
4133                    }
4134                    // EOF
4135                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4136                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4137                    // end-of-file token.
4138                    None => {
4139                        self.finish_doctype_token_system_id();
4140                        self.emit_error(ErrorKind::EofInDoctype);
4141                        self.set_doctype_token_force_quirks();
4142                        self.emit_doctype_token();
4143                        self.emit_token(Token::Eof);
4144
4145                        return Ok(());
4146                    }
4147                    // Anything else
4148                    // Append the current input character to the current DOCTYPE token's system
4149                    // identifier.
4150                    Some(c) => {
4151                        self.validate_input_stream_character(c);
4152                        self.consume_and_append_to_doctype_token_system_id(c, |c| {
4153                            if !is_allowed_character(c) {
4154                                return false;
4155                            }
4156
4157                            !matches!(c, '"' | '\x00' | '>' | '\r')
4158                        });
4159                    }
4160                }
4161            }
4162            // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
4163            State::DoctypeSystemIdentifierSingleQuoted => {
4164                // Consume the next input character:
4165                match self.consume_next_char() {
4166                    // U+0027 APOSTROPHE (')
4167                    // Switch to the after DOCTYPE system identifier state.
4168                    Some(c @ '\'') => {
4169                        self.finish_doctype_token_system_id();
4170                        self.append_raw_to_doctype_token(c);
4171                        self.state = State::AfterDoctypeSystemIdentifier;
4172                    }
4173                    // U+0000 NULL
4174                    // This is an unexpected-null-character parse error. Append a U+FFFD
4175                    // REPLACEMENT CHARACTER character to the current DOCTYPE token's system
4176                    // identifier.
4177                    Some(c @ '\x00') => {
4178                        self.append_raw_to_doctype_token(c);
4179                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
4180                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER));
4181                    }
4182                    // U+003E GREATER-THAN SIGN (>)
4183                    // This is an abrupt-doctype-system-identifier parse error. Set the current
4184                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4185                    // the current DOCTYPE token.
4186                    Some(c @ '>') => {
4187                        self.finish_doctype_token_system_id();
4188                        self.append_raw_to_doctype_token(c);
4189                        self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
4190                        self.set_doctype_token_force_quirks();
4191                        self.state = State::Data;
4192                        self.emit_doctype_token();
4193                    }
4194                    // EOF
4195                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4196                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4197                    // end-of-file token.
4198                    None => {
4199                        self.finish_doctype_token_system_id();
4200                        self.emit_error(ErrorKind::EofInDoctype);
4201                        self.set_doctype_token_force_quirks();
4202                        self.emit_doctype_token();
4203                        self.emit_token(Token::Eof);
4204
4205                        return Ok(());
4206                    }
4207                    // Anything else
4208                    // Append the current input character to the current DOCTYPE token's system
4209                    // identifier.
4210                    Some(c) => {
4211                        self.validate_input_stream_character(c);
4212                        self.consume_and_append_to_doctype_token_system_id(c, |c| {
4213                            if !is_allowed_character(c) {
4214                                return false;
4215                            }
4216
4217                            !matches!(c, '\'' | '\x00' | '>' | '\r')
4218                        });
4219                    }
4220                }
4221            }
4222            // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
4223            State::AfterDoctypeSystemIdentifier => {
4224                // Consume the next input character:
4225                match self.consume_next_char() {
4226                    // U+0009 CHARACTER TABULATION (tab)
4227                    // U+000A LINE FEED (LF)
4228                    // U+000C FORM FEED (FF)
4229                    // U+0020 SPACE
4230                    // Ignore the character.
4231                    Some(c) if is_spacy(c) => {
4232                        self.append_raw_to_doctype_token(c);
4233                    }
4234                    // U+003E GREATER-THAN SIGN (>)
4235                    // Switch to the data state. Emit the current DOCTYPE token.
4236                    Some(c @ '>') => {
4237                        self.append_raw_to_doctype_token(c);
4238                        self.state = State::Data;
4239                        self.emit_doctype_token();
4240                    }
4241                    // EOF
4242                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4243                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4244                    // end-of-file token.
4245                    None => {
4246                        self.emit_error(ErrorKind::EofInDoctype);
4247                        self.set_doctype_token_force_quirks();
4248                        self.emit_doctype_token();
4249                        self.emit_token(Token::Eof);
4250
4251                        return Ok(());
4252                    }
4253                    // Anything else
4254                    // This is an unexpected-character-after-doctype-system-identifier parse
4255                    // error. Reconsume in the bogus DOCTYPE state. (This does not set the
4256                    // current DOCTYPE token's force-quirks flag to on.)
4257                    _ => {
4258                        self.emit_error(ErrorKind::UnexpectedCharacterAfterDoctypeSystemIdentifier);
4259                        self.reconsume_in_state(State::BogusDoctype);
4260                    }
4261                }
4262            }
4263            // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
4264            State::BogusDoctype => {
4265                // Consume the next input character:
4266                match self.consume_next_char() {
4267                    // U+003E GREATER-THAN SIGN (>)
4268                    // Switch to the data state. Emit the DOCTYPE token.
4269                    Some(c @ '>') => {
4270                        self.append_raw_to_doctype_token(c);
4271                        self.state = State::Data;
4272                        self.emit_doctype_token();
4273                    }
4274                    // U+0000 NULL
4275                    // This is an unexpected-null-character parse error. Ignore the character.
4276                    Some(c @ '\x00') => {
4277                        self.append_raw_to_doctype_token(c);
4278                        self.emit_error(ErrorKind::UnexpectedNullCharacter);
4279                    }
4280                    // EOF
4281                    // Emit the DOCTYPE token. Emit an end-of-file token.
4282                    None => {
4283                        self.emit_doctype_token();
4284                        self.emit_token(Token::Eof);
4285
4286                        return Ok(());
4287                    }
4288                    // Anything else
4289                    // Ignore the character.
4290                    Some(c) => {
4291                        self.validate_input_stream_character(c);
4292                        self.append_raw_to_doctype_token(c);
4293                    }
4294                }
4295            }
4296            // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
4297            State::CdataSection => {
4298                // Consume the next input character:
4299                match self.consume_next_char() {
4300                    // U+005D RIGHT SQUARE BRACKET (])
4301                    // Switch to the CDATA section bracket state.
4302                    Some(']') => {
4303                        self.state = State::CdataSectionBracket;
4304                    }
4305                    // EOF
4306                    // This is an eof-in-cdata parse error. Emit an end-of-file token.
4307                    None => {
4308                        self.emit_error(ErrorKind::EofInCdata);
4309                        self.emit_token(Token::Eof);
4310
4311                        return Ok(());
4312                    }
4313                    // Anything else
4314                    // Emit the current input character as a character token.
4315                    Some(c) => {
4316                        self.validate_input_stream_character(c);
4317                        self.handle_raw_and_emit_character_token(c);
4318                    }
4319                }
4320            }
4321            // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
4322            State::CdataSectionBracket => {
4323                // Consume the next input character:
4324                match self.consume_next_char() {
4325                    // U+005D RIGHT SQUARE BRACKET (])
4326                    // Switch to the CDATA section end state.
4327                    Some(']') => {
4328                        self.state = State::CdataSectionEnd;
4329                    }
4330                    // Anything else
4331                    // Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
4332                    // CDATA section state.
4333                    _ => {
4334                        self.emit_character_token(']');
4335                        self.reconsume_in_state(State::CdataSection);
4336                    }
4337                }
4338            }
4339            // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
4340            State::CdataSectionEnd => {
4341                // Consume the next input character:
4342                match self.consume_next_char() {
4343                    // U+005D RIGHT SQUARE BRACKET (])
4344                    // Emit a U+005D RIGHT SQUARE BRACKET character token.
4345                    Some(c @ ']') => {
4346                        self.emit_character_token_with_raw(']', c);
4347                    }
4348                    // U+003E GREATER-THAN SIGN character
4349                    // Switch to the data state.
4350                    Some('>') => {
4351                        self.state = State::Data;
4352                    }
4353                    // Anything else
4354                    // Emit two U+005D RIGHT SQUARE BRACKET character tokens. Reconsume in the
4355                    // CDATA section state.
4356                    _ => {
4357                        self.emit_character_token(']');
4358                        self.emit_character_token(']');
4359                        self.reconsume_in_state(State::CdataSection);
4360                    }
4361                }
4362            }
4363            // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
4364            State::CharacterReference => {
4365                // Set the temporary buffer to the empty string. Append a U+0026 AMPERSAND (&)
4366                // character to the temporary buffer.
4367                self.temporary_buffer.clear();
4368                self.temporary_buffer.push('&');
4369
4370                // Consume the next input character:
4371                match self.consume_next_char() {
4372                    // ASCII alphanumeric
4373                    // Reconsume in the named character reference state.
4374                    Some(c) if c.is_ascii_alphanumeric() => {
4375                        self.reconsume_in_state(State::NamedCharacterReference);
4376                    }
4377                    // U+0023 NUMBER SIGN (#)
4378                    // Append the current input character to the temporary buffer. Switch to the
4379                    // numeric character reference state.
4380                    Some(c @ '#') => {
4381                        self.temporary_buffer.push(c);
4382                        self.state = State::NumericCharacterReference;
4383                    }
4384                    // Anything else
4385                    // Flush code points consumed as a character reference. Reconsume in the
4386                    // return state.
4387                    _ => {
4388                        self.flush_code_points_consumed_as_character_reference(None);
4389                        self.reconsume_in_state(self.return_state.clone());
4390                    }
4391                }
4392            }
4393            // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
4394            State::NamedCharacterReference => {
4395                // Consume the maximum number of characters possible, where the consumed
4396                // characters are one of the identifiers in the first column of the named
4397                // character references table. Append each character to the temporary buffer
4398                // when it's consumed.
4399                // The shortest entity - `&GT`
4400                // The longest entity - `&CounterClockwiseContourIntegral;`
4401                let initial_cur_pos = self.input.cur_pos();
4402
4403                let mut entity: Option<&Entity> = None;
4404                let mut entity_cur_pos: Option<BytePos> = None;
4405                let mut entity_temporary_buffer =
4406                    String::with_capacity(self.temporary_buffer.capacity());
4407
4408                entity_temporary_buffer.push_str(&self.temporary_buffer);
4409
4410                // No need to validate input, because we reset position if nothing was found
4411                while let Some(c) = &self.consume_next_char() {
4412                    entity_temporary_buffer.push(*c);
4413
4414                    if let Some(found_entity) = HTML_ENTITIES.get(&entity_temporary_buffer) {
4415                        entity = Some(found_entity);
4416                        entity_cur_pos = Some(self.input.cur_pos());
4417
4418                        self.temporary_buffer
4419                            .replace_range(1.., &entity_temporary_buffer[1..]);
4420                    } else {
4421                        // We stop when:
4422                        //
4423                        // - not ascii alphanumeric
4424                        // - we consume more characters than the longest entity
4425                        if !c.is_ascii_alphanumeric() || entity_temporary_buffer.len() > 32 {
4426                            break;
4427                        }
4428                    }
4429                }
4430
4431                if entity.is_some() {
4432                    self.cur_pos = entity_cur_pos.unwrap();
4433                    unsafe {
4434                        // Safety: We got entity_cur_pos from the input, so it's valid
4435                        self.input.reset_to(entity_cur_pos.unwrap());
4436                    }
4437                } else {
4438                    self.cur_pos = initial_cur_pos;
4439                    unsafe {
4440                        // Safety: We got initial_cur_pos from the input, so it's valid
4441                        self.input.reset_to(initial_cur_pos);
4442                    }
4443                }
4444
4445                let is_last_semicolon = self.temporary_buffer.ends_with(';');
4446
4447                // If there is a match
4448                match entity {
4449                    Some(entity) => {
4450                        let is_next_equals_sign_or_ascii_alphanumeric = match self.next() {
4451                            Some('=') => true,
4452                            Some(c) if c.is_ascii_alphanumeric() => true,
4453                            _ => false,
4454                        };
4455
4456                        // If the character reference was consumed as part of an attribute, and
4457                        // the last character matched is not a
4458                        // U+003B SEMICOLON character (;), and the next input
4459                        // character is either a U+003D EQUALS SIGN character (=) or an ASCII
4460                        // alphanumeric, then, for historical reasons, flush code points
4461                        // consumed as a character reference and
4462                        // switch to the return state.
4463                        if self.is_consumed_as_part_of_an_attribute()
4464                            && !is_last_semicolon
4465                            && is_next_equals_sign_or_ascii_alphanumeric
4466                        {
4467                            self.flush_code_points_consumed_as_character_reference(None);
4468                            self.state = self.return_state.clone();
4469                        }
4470                        // Otherwise:
4471                        //
4472                        // If the last character matched is not a U+003B SEMICOLON character
4473                        // (;), then this is a missing-semicolon-after-character-reference parse
4474                        // error.
4475                        //
4476                        // Set the temporary buffer to the empty string. Append one or two
4477                        // characters corresponding to the character reference name (as given by
4478                        // the second column of the named character references table) to the
4479                        // temporary buffer.
4480                        //
4481                        // Flush code points consumed as a character reference. Switch to the
4482                        // return state.
4483                        else {
4484                            if !is_last_semicolon {
4485                                self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
4486                            }
4487
4488                            let old_temporary_buffer = self.temporary_buffer.clone();
4489
4490                            self.temporary_buffer.clear();
4491                            self.temporary_buffer.push_str(&entity.characters);
4492                            self.flush_code_points_consumed_as_character_reference(Some(
4493                                old_temporary_buffer,
4494                            ));
4495                            self.state = self.return_state.clone();
4496                        }
4497                    }
4498                    // Otherwise
4499                    // Flush code points consumed as a character reference. Switch to the
4500                    // ambiguous ampersand state.
4501                    _ => {
4502                        self.flush_code_points_consumed_as_character_reference(None);
4503                        self.state = State::AmbiguousAmpersand;
4504                    }
4505                }
4506            }
4507            // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
4508            State::AmbiguousAmpersand => {
4509                // Consume the next input character:
4510                match self.consume_next_char() {
4511                    // ASCII alphanumeric
4512                    // If the character reference was consumed as part of an attribute, then
4513                    // append the current input character to the current attribute's value.
4514                    // Otherwise, emit the current input character as a character token.
4515                    Some(c) if c.is_ascii_alphanumeric() => {
4516                        if self.is_consumed_as_part_of_an_attribute() {
4517                            self.append_to_attribute_token_value(Some(c), Some(c));
4518                        } else {
4519                            self.emit_character_token(c);
4520                        }
4521                    }
4522                    // U+003B SEMICOLON (;)
4523                    // This is an unknown-named-character-reference parse error. Reconsume in
4524                    // the return state.
4525                    Some(';') => {
4526                        self.emit_error(ErrorKind::UnknownNamedCharacterReference);
4527                        self.reconsume_in_state(self.return_state.clone());
4528                    }
4529                    // Anything else
4530                    // Reconsume in the return state.
4531                    _ => {
4532                        self.reconsume_in_state(self.return_state.clone());
4533                    }
4534                }
4535            }
4536            // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
4537            State::NumericCharacterReference => {
4538                self.character_reference_code = Some(vec![(0, 0, None)]);
4539
4540                // Consume the next input character:
4541                match self.consume_next_char() {
4542                    // U+0078 LATIN SMALL LETTER X
4543                    // U+0058 LATIN CAPITAL LETTER X
4544                    // Append the current input character to the temporary buffer. Switch to the
4545                    // hexadecimal character reference start state.
4546                    Some(c @ 'x' | c @ 'X') => {
4547                        self.temporary_buffer.push(c);
4548                        self.state = State::HexademicalCharacterReferenceStart;
4549                    }
4550                    // Anything else
4551                    // Reconsume in the decimal character reference start state.
4552                    _ => {
4553                        self.reconsume_in_state(State::DecimalCharacterReferenceStart);
4554                    }
4555                }
4556            }
4557            // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
4558            State::HexademicalCharacterReferenceStart => {
4559                // Consume the next input character:
4560                match self.consume_next_char() {
4561                    // ASCII hex digit
4562                    // Reconsume in the hexadecimal character reference state.
4563                    Some(c) if is_ascii_hex_digit(c) => {
4564                        self.reconsume_in_state(State::HexademicalCharacterReference);
4565                    }
4566                    // Anything else
4567                    // This is an absence-of-digits-in-numeric-character-reference parse error.
4568                    // Flush code points consumed as a character reference. Reconsume in the
4569                    // return state.
4570                    _ => {
4571                        self.emit_error(ErrorKind::AbsenceOfDigitsInNumericCharacterReference);
4572                        self.flush_code_points_consumed_as_character_reference(None);
4573                        self.reconsume_in_state(self.return_state.clone());
4574                    }
4575                }
4576            }
4577            // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
4578            State::DecimalCharacterReferenceStart => {
4579                // Consume the next input character:
4580                match self.consume_next_char() {
4581                    // ASCII digit
4582                    // Reconsume in the decimal character reference state.
4583                    Some(c) if c.is_ascii_digit() => {
4584                        self.reconsume_in_state(State::DecimalCharacterReference);
4585                    }
4586                    // Anything else
4587                    // This is an absence-of-digits-in-numeric-character-reference parse error.
4588                    // Flush code points consumed as a character reference. Reconsume in the
4589                    // return state.
4590                    _ => {
4591                        self.emit_error(ErrorKind::AbsenceOfDigitsInNumericCharacterReference);
4592                        self.flush_code_points_consumed_as_character_reference(None);
4593                        self.reconsume_in_state(self.return_state.clone());
4594                    }
4595                }
4596            }
4597            // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
4598            State::HexademicalCharacterReference => {
4599                // Consume the next input character:
4600                match self.consume_next_char() {
4601                    // ASCII digit
4602                    // Multiply the character reference code by 16. Add a numeric version of the
4603                    // current input character (subtract 0x0030 from the character's code point)
4604                    // to the character reference code.
4605                    Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code {
4606                        Some(character_reference_code) => {
4607                            character_reference_code.push((16, c as u32 - 0x30, Some(c)));
4608                        }
4609                        _ => {
4610                            unreachable!();
4611                        }
4612                    },
4613                    // ASCII upper hex digit
4614                    // Multiply the character reference code by 16. Add a numeric version of the
4615                    // current input character as a hexadecimal digit (subtract 0x0037 from the
4616                    // character's code point) to the character reference code.
4617                    Some(c) if is_upper_hex_digit(c) => match &mut self.character_reference_code {
4618                        Some(character_reference_code) => {
4619                            character_reference_code.push((16, c as u32 - 0x37, Some(c)));
4620                        }
4621                        _ => {
4622                            unreachable!();
4623                        }
4624                    },
4625                    // ASCII lower hex digit
4626                    // Multiply the character reference code by 16. Add a numeric version of the
4627                    // current input character as a hexadecimal digit (subtract 0x0057 from the
4628                    // character's code point) to the character reference code.
4629                    Some(c) if is_lower_hex_digit(c) => match &mut self.character_reference_code {
4630                        Some(character_reference_code) => {
4631                            character_reference_code.push((16, c as u32 - 0x57, Some(c)));
4632                        }
4633                        _ => {
4634                            unreachable!();
4635                        }
4636                    },
4637                    // U+003B SEMICOLON
4638                    // Switch to the numeric character reference end state.
4639                    Some(';') => {
4640                        self.state = State::NumericCharacterReferenceEnd;
4641                    }
4642                    // Anything else
4643                    // This is a missing-semicolon-after-character-reference parse error.
4644                    // Reconsume in the numeric character reference end state.
4645                    _ => {
4646                        self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
4647                        self.reconsume_in_state(State::NumericCharacterReferenceEnd);
4648                    }
4649                }
4650            }
4651            // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
4652            State::DecimalCharacterReference => {
4653                // Consume the next input character:
4654                match self.consume_next_char() {
4655                    // ASCII digit
4656                    // Multiply the character reference code by 10. Add a numeric version of the
4657                    // current input character (subtract 0x0030 from the character's code point)
4658                    // to the character reference code.
4659                    Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code {
4660                        Some(character_reference_code) => {
4661                            character_reference_code.push((10, c as u32 - 0x30, Some(c)));
4662                        }
4663                        _ => {
4664                            unreachable!();
4665                        }
4666                    },
4667                    // U+003B SEMICOLON
4668                    // Switch to the numeric character reference end state.
4669                    Some(';') => self.state = State::NumericCharacterReferenceEnd,
4670                    // Anything else
4671                    // This is a missing-semicolon-after-character-reference parse error.
4672                    // Reconsume in the numeric character reference end state.
4673                    _ => {
4674                        self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
4675                        self.reconsume_in_state(State::NumericCharacterReferenceEnd);
4676                    }
4677                }
4678            }
4679            // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
4680            State::NumericCharacterReferenceEnd => {
4681                let (value, raw_char_ref) =
4682                    if let Some(chars) = self.character_reference_code.take() {
4683                        let mut raw = String::with_capacity(8);
4684                        let mut i: u32 = 0;
4685                        let mut overflowed = false;
4686
4687                        for (base, value, c) in chars.iter() {
4688                            if let Some(c) = c {
4689                                raw.push(*c);
4690                            }
4691
4692                            if !overflowed {
4693                                if let Some(result) = i.checked_mul(*base as u32) {
4694                                    i = result;
4695
4696                                    if let Some(result) = i.checked_add(*value) {
4697                                        i = result;
4698                                    } else {
4699                                        i = 0x110000;
4700
4701                                        overflowed = true;
4702                                    }
4703                                } else {
4704                                    i = 0x110000;
4705
4706                                    overflowed = true;
4707                                }
4708                            }
4709                        }
4710
4711                        (i, raw)
4712                    } else {
4713                        unreachable!();
4714                    };
4715
4716                // Check the character reference code:
4717                let cr = match value {
4718                    // If the number is 0x00, then this is a null-character-reference
4719                    // parse error. Set the character
4720                    // reference code to 0xFFFD.
4721                    0 => {
4722                        self.emit_error(ErrorKind::NullCharacterReference);
4723
4724                        0xfffd
4725                    }
4726                    // If the number is greater than 0x10FFFF, then this is a
4727                    // character-reference-outside-unicode-range parse error. Set the
4728                    // character reference code to
4729                    // 0xFFFD.
4730                    cr if cr > 0x10ffff => {
4731                        self.emit_error(ErrorKind::CharacterReferenceOutsideUnicodeRange);
4732
4733                        0xfffd
4734                    }
4735                    // If the number is a surrogate, then this is a
4736                    // surrogate-character-reference parse error. Set the character
4737                    // reference code to 0xFFFD.
4738                    cr if is_surrogate(cr) => {
4739                        self.emit_error(ErrorKind::SurrogateCharacterReference);
4740
4741                        0xfffd
4742                    }
4743                    // If the number is a noncharacter, then this is a
4744                    // noncharacter-character-reference parse error.
4745                    cr if is_noncharacter(cr) => {
4746                        self.emit_error(ErrorKind::NoncharacterCharacterReference);
4747
4748                        cr
4749                    }
4750                    // If the number is 0x0D, or a control that's not ASCII whitespace,
4751                    // then
4752                    // this is a control-character-reference parse error. If the number
4753                    // is one of the numbers in the
4754                    // first column of the following table, then find the
4755                    // row with that number in the first column, and set the character
4756                    // reference code to the number in
4757                    // the second column of that row.
4758                    cr if cr == 0x0d || is_control(cr) => {
4759                        self.emit_error(ErrorKind::ControlCharacterReference);
4760
4761                        match cr {
4762                            // 0x80	0x20AC	EURO SIGN (€)
4763                            0x80 => 0x20ac,
4764                            // 0x82	0x201A	SINGLE LOW-9 QUOTATION MARK (‚)
4765                            0x82 => 0x201a,
4766                            // 0x83	0x0192	LATIN SMALL LETTER F WITH HOOK (ƒ)
4767                            0x83 => 0x0192,
4768                            // 0x84	0x201E	DOUBLE LOW-9 QUOTATION MARK („)
4769                            0x84 => 0x201e,
4770                            // 0x85	0x2026	HORIZONTAL ELLIPSIS (…)
4771                            0x85 => 0x2026,
4772                            // 0x86	0x2020	DAGGER (†)
4773                            0x86 => 0x2020,
4774                            // 0x87	0x2021	DOUBLE DAGGER (‡)
4775                            0x87 => 0x2021,
4776                            // 0x88	0x02C6	MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
4777                            0x88 => 0x02c6,
4778                            // 0x89	0x2030	PER MILLE SIGN (‰)
4779                            0x89 => 0x2030,
4780                            // 0x8A	0x0160	LATIN CAPITAL LETTER S WITH CARON (Š)
4781                            0x8a => 0x0160,
4782                            // 0x8B	0x2039	SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
4783                            0x8b => 0x2039,
4784                            // 0x8C	0x0152	LATIN CAPITAL LIGATURE OE (Œ)
4785                            0x8c => 0x0152,
4786                            // 0x8E	0x017D	LATIN CAPITAL LETTER Z WITH CARON (Ž)
4787                            0x8e => 0x017d,
4788                            // 0x91	0x2018	LEFT SINGLE QUOTATION MARK (‘)
4789                            0x91 => 0x2018,
                            // 0x92	0x2019	RIGHT SINGLE QUOTATION MARK (’)
4791                            0x92 => 0x2019,
4792                            // 0x93	0x201C	LEFT DOUBLE QUOTATION MARK (“)
4793                            0x93 => 0x201c,
4794                            // 0x94	0x201D	RIGHT DOUBLE QUOTATION MARK (”)
4795                            0x94 => 0x201d,
4796                            // 0x95	0x2022	BULLET (•)
4797                            0x95 => 0x2022,
4798                            // 0x96	0x2013	EN DASH (–)
4799                            0x96 => 0x2013,
4800                            // 0x97	0x2014	EM DASH (—)
4801                            0x97 => 0x2014,
4802                            // 0x98	0x02DC	SMALL TILDE (˜)
4803                            0x98 => 0x02dc,
4804                            // 0x99	0x2122	TRADE MARK SIGN (™)
4805                            0x99 => 0x2122,
4806                            // 0x9A	0x0161	LATIN SMALL LETTER S WITH CARON (š)
4807                            0x9a => 0x0161,
4808                            // 0x9B	0x203A	SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
4809                            0x9b => 0x203a,
4810                            // 0x9C	0x0153	LATIN SMALL LIGATURE OE (œ)
4811                            0x9c => 0x0153,
4812                            // 0x9E	0x017E	LATIN SMALL LETTER Z WITH CARON (ž)
4813                            0x9e => 0x017e,
4814                            // 0x9F	0x0178	LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
4815                            0x9f => 0x0178,
4816                            _ => cr,
4817                        }
4818                    }
4819                    _ => value,
4820                };
4821
4822                // Set the temporary buffer to the empty string.
4823                // Append a code point equal to the character reference code to the temporary
4824                // buffer.
4825                // Flush code points consumed as a character reference.
4826                // Switch to the return state.
4827                let old_temporary_buffer = self.temporary_buffer.clone();
4828
4829                let mut raw =
4830                    String::with_capacity(old_temporary_buffer.len() + raw_char_ref.len() + 1);
4831
4832                raw.push_str(&old_temporary_buffer);
4833                raw.push_str(&raw_char_ref);
4834
4835                if self.cur == Some(';') {
4836                    raw.push(';');
4837                }
4838
4839                self.temporary_buffer.clear();
4840
4841                let c = match char::from_u32(cr) {
4842                    Some(c) => c,
4843                    _ => {
4844                        unreachable!();
4845                    }
4846                };
4847
4848                self.temporary_buffer.push(c);
4849                self.flush_code_points_consumed_as_character_reference(Some(raw));
4850                self.state = self.return_state.clone();
4851            }
4852        }
4853
4854        Ok(())
4855    }
4856
4857    #[inline(always)]
4858    fn skip_whitespaces(&mut self, c: char) {
4859        if c == '\r' && self.input.cur() == Some('\n') {
4860            unsafe {
4861                // Safety: cur() is Some
4862                self.input.bump();
4863            }
4864        }
4865    }
4866}
4867
/// Returns `true` for tab, line feed, form feed, carriage return and space.
///
/// By spec, `\r` is removed before the tokenizer runs, but it is kept here to
/// produce a better AST and to avoid breaking the logic that ignores
/// characters, so it is counted as whitespace as well.
#[inline(always)]
fn is_spacy(c: char) -> bool {
    match c {
        '\t' | '\n' | '\x0c' | '\r' | ' ' => true,
        _ => false,
    }
}
4874
/// Returns `true` when `c` lies in `0x00..=0x1f` or `0x7f..=0x9f` and is not
/// one of the whitespace code points tab (0x09), line feed (0x0a), form feed
/// (0x0c) or carriage return (0x0d).
#[inline(always)]
fn is_control(c: u32) -> bool {
    match c {
        // Whitespace inside the C0 range is deliberately not reported.
        0x09 | 0x0a | 0x0c | 0x0d => false,
        0x00..=0x1f | 0x7f..=0x9f => true,
        _ => false,
    }
}
4879
/// Returns `true` when `c` falls in the surrogate range
/// `0xd800..=0xdfff`.
#[inline(always)]
fn is_surrogate(c: u32) -> bool {
    (0xd800..=0xdfff).contains(&c)
}
4884
/// Returns `true` when `c` is a noncharacter.
///
/// A noncharacter is a code point in the range U+FDD0 to U+FDEF, inclusive,
/// or one of the last two code points of any plane from 0 through 16: U+FFFE,
/// U+FFFF, U+1FFFE, U+1FFFF, and so on up to U+10FFFE and U+10FFFF.
#[inline(always)]
fn is_noncharacter(c: u32) -> bool {
    // Largest code point; values above it are never noncharacters.
    const LAST_CODE_POINT: u32 = 0x10ffff;

    let in_contiguous_range = (0xfdd0..=0xfdef).contains(&c);
    // The low 16 bits are 0xFFFE or 0xFFFF for the last two code points of a
    // plane.
    let ends_a_plane = c <= LAST_CODE_POINT && (c & 0xffff) >= 0xfffe;

    in_contiguous_range || ends_a_plane
}
4933
/// Returns `true` for an ASCII digit (`0`-`9`) or an uppercase hexadecimal
/// letter (`A`-`F`).
#[inline(always)]
fn is_upper_hex_digit(c: char) -> bool {
    c.is_ascii_digit() || ('A'..='F').contains(&c)
}
4938
/// Returns `true` for an ASCII digit (`0`-`9`) or a lowercase hexadecimal
/// letter (`a`-`f`).
#[inline(always)]
fn is_lower_hex_digit(c: char) -> bool {
    c.is_ascii_digit() || ('a'..='f').contains(&c)
}
4943
/// Returns `true` for any ASCII hexadecimal digit: `0`-`9`, `A`-`F` or
/// `a`-`f`.
#[inline(always)]
fn is_ascii_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
4948
/// Returns `true` for an ASCII uppercase letter (`A`-`Z`).
#[inline(always)]
fn is_ascii_upper_alpha(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
4953
/// Returns `true` for an ASCII lowercase letter (`a`-`z`).
#[inline(always)]
fn is_ascii_lower_alpha(c: char) -> bool {
    matches!(c, 'a'..='z')
}
4958
4959#[inline(always)]
4960fn is_ascii_alpha(c: char) -> bool {
4961    is_ascii_upper_alpha(c) || is_ascii_lower_alpha(c)
4962}
4963
4964#[inline(always)]
4965fn is_allowed_control_character(c: u32) -> bool {
4966    c != 0x00 && is_control(c)
4967}
4968
4969#[inline(always)]
4970fn is_allowed_character(c: char) -> bool {
4971    let c = c as u32;
4972
4973    if is_surrogate(c) || is_allowed_control_character(c) || is_noncharacter(c) {
4974        return false;
4975    }
4976
4977    return true;
4978}