swc_xml_parser/lexer/
mod.rs

1use std::{collections::VecDeque, mem::take};
2
3use rustc_hash::FxHashSet;
4use swc_atoms::Atom;
5use swc_common::{input::Input, BytePos, Span};
6use swc_xml_ast::{AttributeToken, Token, TokenAndSpan};
7
8use crate::{
9    error::{Error, ErrorKind},
10    parser::input::ParserInput,
11};
12
/// States of the lexer's state machine.
///
/// [`Lexer::new`] starts in [`State::Data`]. The current state is stored in the
/// lexer's `state` field, and `return_state` can hold a second one.
// NOTE(review): the variant names look like the tokenizer states of the XML
// parsing algorithm this lexer follows — confirm against that document before
// documenting individual variants.
#[derive(Debug, Clone)]
pub enum State {
    Data,
    CharacterReferenceInData,
    Pi,
    PiTarget,
    PiTargetQuestion,
    PiTargetAfter,
    PiData,
    PiEnd,
    MarkupDeclaration,
    CommentStart,
    CommentStartDash,
    Comment,
    CommentLessThanSign,
    CommentLessThanSignBang,
    CommentLessThanSignBangDash,
    CommentLessThanSignBangDashDash,
    CommentEndDash,
    CommentEnd,
    CommentEndBang,
    Cdata,
    CdataBracket,
    CdataEnd,
    TagOpen,
    EndTagOpen,
    TagName,
    EmptyTag,
    TagAttributeNameBefore,
    TagAttributeName,
    TagAttributeNameAfter,
    TagAttributeValueBefore,
    TagAttributeValueDoubleQuoted,
    TagAttributeValueSingleQuoted,
    TagAttributeValueUnquoted,
    TagAttributeValueAfter,
    CharacterReferenceInAttributeValue,
    BogusComment,
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypePublicKeyword,
    AfterDoctypeSystemKeyword,
    BeforeDoctypeSystemIdentifier,
    BeforeDoctypePublicIdentifier,
    DoctypePublicIdentifierSingleQuoted,
    DoctypePublicIdentifierDoubleQuoted,
    AfterDoctypePublicIdentifier,
    BetweenDoctypePublicAndSystemIdentifiers,
    DoctypeSystemIdentifierSingleQuoted,
    DoctypeSystemIdentifierDoubleQuoted,
    AfterDoctypeSystemIdentifier,
    DoctypeTypeInternalSubSet,
    BogusDoctype,
}
69
70// TODO implement `raw` for all tokens
71
/// Scratch buffer for the DOCTYPE token currently being lexed.
///
/// Created by `create_doctype_token`, grown by `append_to_doctype_token`, and
/// turned into `Token::Doctype` by `emit_doctype_token`.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Doctype {
    /// `None` when the token was created without a first name character.
    name: Option<String>,
    /// `None` until `set_doctype_token_public_id` installs an empty buffer.
    public_id: Option<String>,
    /// `None` until `set_doctype_token_system_id` installs an empty buffer.
    system_id: Option<String>,
}
78
/// Selects which token `emit_tag_token` produces for the tag being built.
#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
enum TagKind {
    /// Emitted as `Token::StartTag`.
    Start,
    /// Emitted as `Token::EndTag`.
    End,
    /// Emitted as `Token::EmptyTag`.
    Empty,
}
85
/// Scratch buffer for the tag token currently being lexed; consumed by
/// `emit_tag_token`.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Tag {
    kind: TagKind,
    /// Grown one character at a time by `append_to_tag_token_name`.
    tag_name: String,
    /// In source order; the last entry is the attribute currently being built.
    attributes: Vec<Attribute>,
}
92
/// One attribute of the tag under construction; converted into an
/// `AttributeToken` by `emit_tag_token`.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Attribute {
    /// Default span until `update_attribute_span` fills it in.
    span: Span,
    name: String,
    /// Always `Some` for attributes created by `start_new_attribute`.
    raw_name: Option<String>,
    /// `None` until a value character (or a quote) has been appended.
    value: Option<String>,
    /// `None` until a raw value character has been appended.
    raw_value: Option<String>,
}
101
/// Scratch buffer for the comment token currently being lexed; emitted as
/// `Token::Comment` by `emit_comment_token`.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Comment {
    /// Comment text (a `\r` or `\r\n` is stored as a single `\n`).
    data: String,
    /// Source text, including the opening delimiter passed to
    /// `create_comment_token` and any closing text passed to
    /// `emit_comment_token`.
    raw: String,
}
107
/// Scratch buffer for the processing instruction currently being lexed;
/// allocated by `create_processing_instruction_token`.
#[derive(PartialEq, Eq, Clone, Debug)]
struct ProcessingInstruction {
    target: String,
    data: String,
}
113
/// Scratch buffer for the CDATA section currently being lexed; emitted as
/// `Token::Cdata` by `emit_cdata_token`.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Cdata {
    /// Characters appended through the first argument of `append_to_cdata_token`.
    data: String,
    /// Characters appended through the second argument of `append_to_cdata_token`.
    raw: String,
}
119
/// Result alias for lexer operations whose error type is [`ErrorKind`].
pub(crate) type LexResult<T> = Result<T, ErrorKind>;
121
/// Lexer that turns an [`Input`] character stream into [`TokenAndSpan`]s.
///
/// Tokens are produced through the [`Iterator`] implementation; recoverable
/// problems are collected in `errors` and handed out by `take_errors`.
pub struct Lexer<'a, I>
where
    I: Input<'a>,
{
    /// Underlying character source.
    input: I,
    /// Character seen by the most recent `consume` (`None` at end of input).
    cur: Option<char>,
    /// Position of `cur`; `reconsume` rewinds the input to this position.
    cur_pos: BytePos,
    /// End of the last emitted token, used as the start of the next span.
    last_token_pos: BytePos,
    // NOTE(review): not read in the visible part of the file — presumably set
    // once end of input has been handled; confirm in `read_token_and_span`.
    finished: bool,
    /// Current state of the state machine.
    state: State,
    return_state: Option<State>,
    /// Errors recorded so far; drained by `take_errors`.
    errors: Vec<Error>,
    /// Character that, directly after `&`, makes `consume_character_reference`
    /// give up instead of reading a reference.
    additional_allowed_character: Option<char>,
    /// Tokens queued by `emit_token`.
    pending_tokens: VecDeque<TokenAndSpan>,
    /// Raw source text of the DOCTYPE being lexed; only appended to while `Some`.
    doctype_raw: Option<String>,
    current_doctype_token: Option<Doctype>,
    current_comment_token: Option<Comment>,
    current_processing_instruction: Option<ProcessingInstruction>,
    current_tag_token: Option<Tag>,
    current_cdata_token: Option<Cdata>,
    /// Start of the attribute being lexed; set by `start_new_attribute` and
    /// read by `update_attribute_span`.
    attribute_start_position: Option<BytePos>,
    /// Ties the otherwise unused lifetime `'a` to the struct.
    phantom: std::marker::PhantomData<&'a ()>,
}
145
146impl<'a, I> Lexer<'a, I>
147where
148    I: Input<'a>,
149{
150    pub fn new(input: I) -> Self {
151        let start_pos = input.last_pos();
152
153        let mut lexer = Lexer {
154            input,
155            cur: None,
156            cur_pos: start_pos,
157            last_token_pos: start_pos,
158            finished: false,
159            state: State::Data,
160            return_state: None,
161            errors: Vec::new(),
162            additional_allowed_character: None,
163            pending_tokens: VecDeque::new(),
164            doctype_raw: None,
165            current_doctype_token: None,
166            current_comment_token: None,
167            current_processing_instruction: None,
168            current_tag_token: None,
169            current_cdata_token: None,
170            attribute_start_position: None,
171            phantom: std::marker::PhantomData,
172        };
173
174        // A leading Byte Order Mark (BOM) causes the character encoding argument to be
175        // ignored and will itself be skipped.
176        if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') {
177            unsafe {
178                // Safety: cur() is Some('\u{feff}')
179                lexer.input.bump();
180            }
181        }
182
183        lexer
184    }
185}
186
187impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
188    type Item = TokenAndSpan;
189
190    fn next(&mut self) -> Option<Self::Item> {
191        let token_and_span = self.read_token_and_span();
192
193        match token_and_span {
194            Ok(token_and_span) => {
195                return Some(token_and_span);
196            }
197            Err(..) => {
198                return None;
199            }
200        }
201    }
202}
203
204impl<'a, I> ParserInput for Lexer<'a, I>
205where
206    I: Input<'a>,
207{
208    fn start_pos(&mut self) -> swc_common::BytePos {
209        self.input.cur_pos()
210    }
211
212    fn last_pos(&mut self) -> swc_common::BytePos {
213        self.input.last_pos()
214    }
215
216    fn take_errors(&mut self) -> Vec<Error> {
217        take(&mut self.errors)
218    }
219}
220
221impl<'a, I> Lexer<'a, I>
222where
223    I: Input<'a>,
224{
225    #[inline(always)]
226    fn next(&mut self) -> Option<char> {
227        self.input.cur()
228    }
229
230    // Any occurrences of surrogates are surrogate-in-input-stream parse errors. Any
231    // occurrences of noncharacters are noncharacter-in-input-stream parse errors
232    // and any occurrences of controls other than ASCII whitespace and U+0000 NULL
233    // characters are control-character-in-input-stream parse errors.
234    //
235    // Postpone validation for each character for perf reasons and do it in
236    // `anything else`
237    #[inline(always)]
238    fn validate_input_stream_character(&mut self, c: char) {
239        let code = c as u32;
240
241        if (0xd800..=0xdfff).contains(&code) {
242            self.emit_error(ErrorKind::SurrogateInInputStream);
243        } else if code != 0x00 && is_control(code) {
244            self.emit_error(ErrorKind::ControlCharacterInInputStream);
245        } else if is_noncharacter(code) {
246            self.emit_error(ErrorKind::NoncharacterInInputStream);
247        }
248    }
249
250    #[inline(always)]
251    fn consume(&mut self) {
252        self.cur = self.input.cur();
253        self.cur_pos = self.input.cur_pos();
254
255        if self.cur.is_some() {
256            unsafe {
257                // Safety: cur() is Some(c)
258                self.input.bump();
259            }
260        }
261    }
262
263    #[inline(always)]
264    fn reconsume(&mut self) {
265        unsafe {
266            // Safety: We got cur_pos from self.input
267            self.input.reset_to(self.cur_pos);
268        }
269    }
270
271    #[inline(always)]
272    fn reconsume_in_state(&mut self, state: State) {
273        self.state = state;
274        self.reconsume();
275    }
276
277    #[inline(always)]
278    fn consume_next_char(&mut self) -> Option<char> {
279        // The next input character is the first character in the input stream that has
280        // not yet been consumed or explicitly ignored by the requirements in this
281        // section. Initially, the next input character is the first character in the
282        // input. The current input character is the last character to have been
283        // consumed.
284        let c = self.next();
285
286        self.consume();
287
288        c
289    }
290
291    #[cold]
292    fn emit_error(&mut self, kind: ErrorKind) {
293        self.errors.push(Error::new(
294            Span::new(self.cur_pos, self.input.cur_pos()),
295            kind,
296        ));
297    }
298
299    #[inline(always)]
300    fn emit_token(&mut self, token: Token) {
301        let cur_pos = self.input.cur_pos();
302
303        let span = Span::new(self.last_token_pos, cur_pos);
304
305        self.last_token_pos = cur_pos;
306        self.pending_tokens.push_back(TokenAndSpan { span, token });
307    }
308
309    fn consume_character_reference(&mut self) -> Option<(char, String)> {
310        let cur_pos = self.input.cur_pos();
311        let anything_else = |lexer: &mut Lexer<'a, I>| {
312            lexer.emit_error(ErrorKind::InvalidEntityCharacter);
313            lexer.cur_pos = cur_pos;
314            unsafe {
315                // Safety: We got cur_post from self.input
316                lexer.input.reset_to(cur_pos);
317            }
318        };
319
320        // This section defines how to consume a character reference, optionally with an
321        // additional allowed character, which, if specified where the algorithm is
322        // invoked, adds a character to the list of characters that cause there to not
323        // be a character reference.
324        //
325        // This definition is used when parsing character in text and in attributes.
326        //
327        // The behavior depends on identity of next character (the one immediately after
328        // the U+0026 AMPERSAND character), as follows:
329        match self.consume_next_char() {
330            // The additional allowed character if there is one
331            // Not a character reference. No characters are consumed and nothing is returned (This
332            // is not an error, either).
333            Some(c) if self.additional_allowed_character == Some(c) => {
334                self.emit_error(ErrorKind::InvalidEntityCharacter);
335                self.cur_pos = cur_pos;
336                unsafe {
337                    // Safety: We got cur_post from self.input
338                    self.input.reset_to(cur_pos);
339                }
340            }
341            Some('l') => match self.consume_next_char() {
342                Some('t') => {
343                    match self.consume_next_char() {
344                        Some(';') => {}
345                        _ => {
346                            self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
347                        }
348                    }
349
350                    return Some(('<', String::from("&lt;")));
351                }
352                _ => {
353                    anything_else(self);
354                }
355            },
356            Some('g') => match self.consume_next_char() {
357                Some('t') => {
358                    match self.consume_next_char() {
359                        Some(';') => {}
360                        _ => {
361                            self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
362                        }
363                    }
364
365                    return Some(('>', String::from("&gt;")));
366                }
367                _ => {
368                    anything_else(self);
369                }
370            },
371            Some('q') => match self.consume_next_char() {
372                Some('u') => match self.consume_next_char() {
373                    Some('o') => match self.consume_next_char() {
374                        Some('t') => {
375                            match self.consume_next_char() {
376                                Some(';') => {}
377                                _ => {
378                                    self.emit_error(
379                                        ErrorKind::MissingSemicolonAfterCharacterReference,
380                                    );
381                                }
382                            }
383
384                            return Some(('"', String::from("&quot;")));
385                        }
386                        _ => {
387                            anything_else(self);
388                        }
389                    },
390                    _ => {
391                        anything_else(self);
392                    }
393                },
394                _ => {
395                    anything_else(self);
396                }
397            },
398            Some('a') => match self.consume_next_char() {
399                Some('p') => match self.consume_next_char() {
400                    Some('o') => match self.consume_next_char() {
401                        Some('s') => {
402                            match self.consume_next_char() {
403                                Some(';') => {}
404                                _ => {
405                                    self.emit_error(
406                                        ErrorKind::MissingSemicolonAfterCharacterReference,
407                                    );
408                                }
409                            }
410
411                            return Some(('\'', String::from("&apos;")));
412                        }
413                        _ => {
414                            anything_else(self);
415                        }
416                    },
417                    _ => {
418                        anything_else(self);
419                    }
420                },
421                Some('m') => match self.consume_next_char() {
422                    Some('p') => {
423                        match self.consume_next_char() {
424                            Some(';') => {}
425                            _ => {
426                                self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
427                            }
428                        }
429
430                        return Some(('&', String::from("&amp;")));
431                    }
432                    _ => {
433                        anything_else(self);
434                    }
435                },
436                _ => {
437                    anything_else(self);
438                }
439            },
440            Some('#') => {
441                let mut base = 10;
442                let mut characters = Vec::new();
443                let mut has_semicolon = false;
444
445                match self.consume_next_char() {
446                    Some('x' | 'X') => {
447                        base = 16;
448
449                        while let Some(c) = &self.consume_next_char() {
450                            if !c.is_ascii_hexdigit() {
451                                if *c == ';' {
452                                    has_semicolon = true;
453                                }
454
455                                break;
456                            }
457
458                            if c.is_ascii_digit() {
459                                characters.push(*c as u32 - 0x30);
460                            } else if is_upper_hex_digit(*c) {
461                                characters.push(*c as u32 - 0x37);
462                            } else if is_lower_hex_digit(*c) {
463                                characters.push(*c as u32 - 0x57);
464                            }
465                        }
466                    }
467                    Some(c) if c.is_ascii_digit() => {
468                        characters.push(c as u32 - 0x30);
469
470                        while let Some(c) = &self.consume_next_char() {
471                            if !c.is_ascii_digit() {
472                                if *c == ';' {
473                                    has_semicolon = true;
474                                }
475
476                                break;
477                            }
478
479                            characters.push(*c as u32 - 0x30);
480                        }
481                    }
482                    _ => {}
483                }
484
485                if characters.is_empty() {
486                    // TODO
487                    self.cur_pos = cur_pos;
488                    unsafe {
489                        // Safety: We got cur_post from self.input
490                        self.input.reset_to(cur_pos);
491                    }
492
493                    return None;
494                }
495
496                if !has_semicolon {
497                    self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
498                }
499
500                let cr = {
501                    let mut i: u32 = 0;
502                    let mut overflowed = false;
503
504                    for value in characters {
505                        if !overflowed {
506                            if let Some(result) = i.checked_mul(base as u32) {
507                                i = result;
508
509                                if let Some(result) = i.checked_add(value) {
510                                    i = result;
511                                } else {
512                                    i = 0x110000;
513
514                                    overflowed = true;
515                                }
516                            } else {
517                                i = 0x110000;
518
519                                overflowed = true;
520                            }
521                        }
522                    }
523
524                    i
525                };
526
527                if is_surrogate(cr) {
528                    self.emit_error(ErrorKind::SurrogateCharacterReference);
529
530                    return Some((char::REPLACEMENT_CHARACTER, String::from("empty")));
531                }
532
533                let c = match char::from_u32(cr) {
534                    Some(c) => c,
535                    _ => {
536                        unreachable!();
537                    }
538                };
539
540                return Some((c, String::from("empty")));
541            }
542            _ => {
543                anything_else(self);
544            }
545        }
546
547        None
548    }
549
550    fn create_doctype_token(&mut self, name_c: Option<char>) {
551        let mut new_name = None;
552
553        if let Some(name_c) = name_c {
554            let mut name = String::with_capacity(4);
555
556            name.push(name_c);
557            new_name = Some(name);
558        }
559
560        self.current_doctype_token = Some(Doctype {
561            name: new_name,
562            public_id: None,
563            system_id: None,
564        });
565    }
566
567    fn append_raw_to_doctype_token(&mut self, c: char) {
568        if let Some(doctype_raw) = &mut self.doctype_raw {
569            let is_cr = c == '\r';
570
571            if is_cr {
572                let mut raw = String::with_capacity(2);
573
574                raw.push(c);
575
576                if self.input.cur() == Some('\n') {
577                    unsafe {
578                        // Safety: cur() is Some('\n')
579                        self.input.bump();
580                    }
581
582                    raw.push('\n');
583                }
584
585                doctype_raw.push_str(&raw);
586            } else {
587                doctype_raw.push(c);
588            }
589        }
590    }
591
592    fn append_to_doctype_token(
593        &mut self,
594        name: Option<char>,
595        public_id: Option<char>,
596        system_id: Option<char>,
597    ) {
598        if let Some(ref mut token) = self.current_doctype_token {
599            if let Some(name) = name {
600                if let Doctype {
601                    name: Some(old_name),
602                    ..
603                } = token
604                {
605                    old_name.push(name);
606                }
607            }
608
609            if let Some(public_id) = public_id {
610                if let Doctype {
611                    public_id: Some(old_public_id),
612                    ..
613                } = token
614                {
615                    old_public_id.push(public_id);
616                }
617            }
618
619            if let Some(system_id) = system_id {
620                if let Doctype {
621                    system_id: Some(old_system_id),
622                    ..
623                } = token
624                {
625                    old_system_id.push(system_id);
626                }
627            }
628        }
629    }
630
631    fn set_doctype_token_public_id(&mut self) {
632        if let Some(Doctype { public_id, .. }) = &mut self.current_doctype_token {
633            // The Longest public id is `-//softquad software//dtd hotmetal pro
634            // 6.0::19990601::extensions to html 4.0//`
635            *public_id = Some(String::with_capacity(78));
636        }
637    }
638
639    fn set_doctype_token_system_id(&mut self) {
640        if let Some(Doctype { system_id, .. }) = &mut self.current_doctype_token {
641            // The Longest system id is `http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd`
642            *system_id = Some(String::with_capacity(58));
643        }
644    }
645
646    fn emit_doctype_token(&mut self) {
647        let current_doctype_token = self.current_doctype_token.take().unwrap();
648
649        let raw = match self.doctype_raw.take() {
650            Some(raw) => raw,
651            _ => {
652                unreachable!();
653            }
654        };
655
656        let token = Token::Doctype {
657            name: current_doctype_token.name.map(Atom::from),
658            public_id: current_doctype_token.public_id.map(Atom::from),
659            system_id: current_doctype_token.system_id.map(Atom::from),
660            raw: Some(Atom::from(raw)),
661        };
662
663        self.emit_token(token);
664    }
665
666    fn create_tag_token(&mut self, kind: TagKind) {
667        self.current_tag_token = Some(Tag {
668            kind,
669            // Maximum known html tags are `blockquote` and `figcaption`
670            tag_name: String::with_capacity(10),
671            attributes: Vec::with_capacity(255),
672        });
673    }
674
675    fn append_to_tag_token_name(&mut self, c: char) {
676        if let Some(Tag { tag_name, .. }) = &mut self.current_tag_token {
677            tag_name.push(c);
678        }
679    }
680
681    fn start_new_attribute(&mut self, c: Option<char>) {
682        if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
683            // The longest known HTML attribute is "allowpaymentrequest" for "iframe".
684            let mut name = String::with_capacity(19);
685            let mut raw_name = String::with_capacity(19);
686
687            if let Some(c) = c {
688                name.push(c);
689                raw_name.push(c);
690            };
691
692            attributes.push(Attribute {
693                span: Default::default(),
694                name,
695                raw_name: Some(raw_name),
696                value: None,
697                raw_value: None,
698            });
699
700            self.attribute_start_position = Some(self.cur_pos);
701        }
702    }
703
704    fn append_to_attribute(
705        &mut self,
706        name: Option<(char, char)>,
707        value: Option<(bool, Option<char>, Option<char>)>,
708    ) {
709        if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
710            if let Some(attribute) = attributes.last_mut() {
711                if let Some(name) = name {
712                    attribute.name.push(name.0);
713
714                    if let Some(raw_name) = &mut attribute.raw_name {
715                        raw_name.push(name.1);
716                    }
717                }
718
719                if let Some(value) = value {
720                    if let Some(c) = value.1 {
721                        if let Some(old_value) = &mut attribute.value {
722                            old_value.push(c);
723                        } else {
724                            let mut new_value = String::with_capacity(255);
725
726                            new_value.push(c);
727
728                            attribute.value = Some(new_value);
729                        }
730                    }
731
732                    if let Some(c) = value.2 {
733                        // Quote for attribute was found, so we set empty value by default
734                        if value.0 && attribute.value.is_none() {
735                            attribute.value = Some(String::with_capacity(255));
736                        }
737
738                        if let Some(raw_value) = &mut attribute.raw_value {
739                            raw_value.push(c);
740                        } else {
741                            let mut raw_new_value = String::with_capacity(255);
742
743                            raw_new_value.push(c);
744
745                            attribute.raw_value = Some(raw_new_value);
746                        }
747                    }
748                }
749            }
750        }
751    }
752
753    fn append_to_attribute_with_entity(&mut self, value: Option<(Option<char>, Option<&str>)>) {
754        if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
755            if let Some(attribute) = attributes.last_mut() {
756                if let Some(value) = value {
757                    if let Some(c) = value.0 {
758                        if let Some(old_value) = &mut attribute.value {
759                            old_value.push(c);
760                        } else {
761                            let mut new_value = String::with_capacity(255);
762
763                            new_value.push(c);
764
765                            attribute.value = Some(new_value);
766                        }
767                    }
768
769                    if let Some(c) = value.1 {
770                        if let Some(raw_value) = &mut attribute.raw_value {
771                            raw_value.push_str(c);
772                        } else {
773                            let mut raw_new_value = String::with_capacity(255);
774
775                            raw_new_value.push_str(c);
776
777                            attribute.raw_value = Some(raw_new_value);
778                        }
779                    }
780                }
781            }
782        }
783    }
784
785    fn update_attribute_span(&mut self) {
786        if let Some(attribute_start_position) = self.attribute_start_position {
787            if let Some(Tag {
788                ref mut attributes, ..
789            }) = self.current_tag_token
790            {
791                if let Some(last) = attributes.last_mut() {
792                    last.span = Span::new(attribute_start_position, self.cur_pos);
793                }
794            }
795        }
796    }
797
798    fn set_tag_to_empty_tag(&mut self) {
799        if let Some(Tag { kind, .. }) = &mut self.current_tag_token {
800            *kind = TagKind::Empty;
801        }
802    }
803
804    fn emit_tag_token(&mut self, kind: Option<TagKind>) {
805        if let Some(mut current_tag_token) = self.current_tag_token.take() {
806            if let Some(kind) = kind {
807                current_tag_token.kind = kind;
808            }
809
810            let mut already_seen: FxHashSet<Atom> = Default::default();
811
812            let attributes = current_tag_token
813                .attributes
814                .drain(..)
815                .map(|attribute| {
816                    let name = Atom::from(attribute.name);
817
818                    if already_seen.contains(&name) {
819                        self.errors
820                            .push(Error::new(attribute.span, ErrorKind::DuplicateAttribute));
821                    }
822
823                    already_seen.insert(name.clone());
824
825                    AttributeToken {
826                        span: attribute.span,
827                        name,
828                        raw_name: attribute.raw_name.map(Atom::from),
829                        value: attribute.value.map(Atom::from),
830                        raw_value: attribute.raw_value.map(Atom::from),
831                    }
832                })
833                .collect();
834
835            match current_tag_token.kind {
836                TagKind::Start => {
837                    let start_tag_token = Token::StartTag {
838                        tag_name: current_tag_token.tag_name.into(),
839                        attributes,
840                    };
841
842                    self.emit_token(start_tag_token);
843                }
844                TagKind::End => {
845                    if !current_tag_token.attributes.is_empty() {
846                        self.emit_error(ErrorKind::EndTagWithAttributes);
847                    }
848
849                    let end_tag_token = Token::EndTag {
850                        tag_name: current_tag_token.tag_name.into(),
851                        attributes,
852                    };
853
854                    self.emit_token(end_tag_token);
855                }
856                TagKind::Empty => {
857                    let empty_tag = Token::EmptyTag {
858                        tag_name: current_tag_token.tag_name.into(),
859                        attributes,
860                    };
861
862                    self.emit_token(empty_tag);
863                }
864            }
865        }
866    }
867
868    fn create_comment_token(&mut self, new_data: Option<String>, raw_start: &str) {
869        let mut data = String::with_capacity(32);
870        let mut raw = String::with_capacity(38);
871
872        raw.push_str(raw_start);
873
874        if let Some(new_data) = new_data {
875            data.push_str(&new_data);
876            raw.push_str(&new_data);
877        };
878
879        self.current_comment_token = Some(Comment { data, raw });
880    }
881
882    fn append_to_comment_token(&mut self, c: char, raw_c: char) {
883        if let Some(Comment { data, raw }) = &mut self.current_comment_token {
884            data.push(c);
885            raw.push(raw_c);
886        }
887    }
888
889    fn handle_raw_and_append_to_comment_token(&mut self, c: char) {
890        if let Some(Comment { data, raw }) = &mut self.current_comment_token {
891            let is_cr = c == '\r';
892
893            if is_cr {
894                let mut raw_c = String::with_capacity(2);
895
896                raw_c.push(c);
897
898                if self.input.cur() == Some('\n') {
899                    unsafe {
900                        // Safety: cur() is Some('\n')
901                        self.input.bump();
902                    }
903
904                    raw_c.push('\n');
905                }
906
907                data.push('\n');
908                raw.push_str(&raw_c);
909            } else {
910                data.push(c);
911                raw.push(c);
912            }
913        }
914    }
915
916    fn emit_comment_token(&mut self, raw_end: Option<&str>) {
917        let mut comment = self.current_comment_token.take().unwrap();
918
919        if let Some(raw_end) = raw_end {
920            comment.raw.push_str(raw_end);
921        }
922
923        self.emit_token(Token::Comment {
924            data: comment.data.into(),
925            raw: comment.raw.into(),
926        });
927    }
928
929    fn create_cdata_token(&mut self) {
930        let data = String::new();
931        let raw = String::with_capacity(12);
932
933        self.current_cdata_token = Some(Cdata { data, raw });
934    }
935
936    fn append_to_cdata_token(&mut self, c: Option<char>, raw_c: Option<char>) {
937        if let Some(Cdata { data, raw }) = &mut self.current_cdata_token {
938            if let Some(c) = c {
939                data.push(c);
940            }
941
942            if let Some(raw_c) = raw_c {
943                raw.push(raw_c);
944            }
945        }
946    }
947
948    fn emit_cdata_token(&mut self) {
949        let cdata = self.current_cdata_token.take().unwrap();
950
951        self.emit_token(Token::Cdata {
952            data: cdata.data.into(),
953            raw: cdata.raw.into(),
954        });
955    }
956
957    fn handle_raw_and_emit_character_token(&mut self, c: char) {
958        let is_cr = c == '\r';
959
960        if is_cr {
961            let mut raw = String::with_capacity(2);
962
963            raw.push(c);
964
965            if self.input.cur() == Some('\n') {
966                unsafe {
967                    // Safety: cur() is Some('\n')
968                    self.input.bump();
969                }
970
971                raw.push('\n');
972            }
973
974            self.emit_token(Token::Character {
975                value: '\n',
976                raw: Some(raw.into()),
977            });
978        } else {
979            self.emit_token(Token::Character {
980                value: c,
981                raw: Some(String::from(c).into()),
982            });
983        }
984    }
985
986    fn create_processing_instruction_token(&mut self) {
987        self.current_processing_instruction = Some(ProcessingInstruction {
988            target: String::with_capacity(3),
989            data: String::with_capacity(255),
990        });
991    }
992
993    fn set_processing_instruction_token(&mut self, target_c: Option<char>, data_c: Option<char>) {
994        if let Some(ProcessingInstruction { target, data, .. }) =
995            &mut self.current_processing_instruction
996        {
997            if let Some(target_c) = target_c {
998                target.push(target_c);
999            }
1000
1001            if let Some(data_c) = data_c {
1002                data.push(data_c);
1003            }
1004        }
1005    }
1006
1007    fn emit_current_processing_instruction(&mut self) {
1008        let processing_instruction = self.current_processing_instruction.take().unwrap();
1009
1010        let token = Token::ProcessingInstruction {
1011            target: processing_instruction.target.into(),
1012            data: processing_instruction.data.into(),
1013        };
1014
1015        self.emit_token(token);
1016    }
1017
1018    #[inline(always)]
1019    fn emit_character_token(&mut self, value: (char, char)) {
1020        self.emit_token(Token::Character {
1021            value: value.0,
1022            raw: Some(String::from(value.1).into()),
1023        });
1024    }
1025
1026    #[inline(always)]
1027    fn emit_character_token_with_entity(&mut self, c: char, raw: &str) {
1028        self.emit_token(Token::Character {
1029            value: c,
1030            raw: Some(raw.into()),
1031        });
1032    }
1033
1034    fn read_token_and_span(&mut self) -> LexResult<TokenAndSpan> {
1035        if self.finished {
1036            return Err(ErrorKind::Eof);
1037        } else {
1038            while self.pending_tokens.is_empty() {
1039                self.run()?;
1040            }
1041        }
1042
1043        let token_and_span = self.pending_tokens.pop_front().unwrap();
1044
1045        match token_and_span.token {
1046            Token::Eof => {
1047                self.finished = true;
1048
1049                return Err(ErrorKind::Eof);
1050            }
1051            _ => {
1052                return Ok(token_and_span);
1053            }
1054        }
1055    }
1056
1057    fn run(&mut self) -> LexResult<()> {
1058        match self.state {
1059            State::Data => {
1060                // Consume the next input character:
1061                match self.consume_next_char() {
1062                    // U+0026 AMPERSAND (&)
1063                    // Switch to character reference in data state.
1064                    Some('&') => {
1065                        self.state = State::CharacterReferenceInData;
1066                    }
1067                    // U+003C LESSER-THAN SIGN (<)
1068                    // Switch to the tag open state.
1069                    Some('<') => {
1070                        self.state = State::TagOpen;
1071                    }
1072                    // EOF
1073                    // Emit an end-of-file token.
1074                    None => {
1075                        self.emit_token(Token::Eof);
1076
1077                        return Ok(());
1078                    }
1079                    // Anything else
1080                    // Emit the current input character as character. Stay in this state.
1081                    Some(c) => {
1082                        self.validate_input_stream_character(c);
1083                        self.handle_raw_and_emit_character_token(c);
1084                    }
1085                }
1086            }
1087            State::CharacterReferenceInData => {
1088                // Switch to the data state.
1089                // Attempt to consume a character reference.
1090                //
1091                // If nothing is returned emit a U+0026 AMPERSAND character (&) token.
1092                //
1093                // Otherwise, emit character tokens that were returned.
1094                self.state = State::Data;
1095
1096                let character_reference = self.consume_character_reference();
1097
1098                if let Some((c, raw)) = character_reference {
1099                    self.emit_character_token_with_entity(c, &raw);
1100                } else {
1101                    self.emit_character_token(('&', '&'));
1102                }
1103            }
1104            State::Pi => {
1105                // Consume the next input character:
1106                match self.consume_next_char() {
1107                    // U+0009 CHARACTER TABULATION (tab)
1108                    // U+000A LINE FEED (LF)
1109                    // U+0020 SPACE
1110                    // EOF
1111                    // Parse error.
1112                    // Switch to the pi target after state.
1113                    Some(c) if is_whitespace(c) => {
1114                        self.emit_error(ErrorKind::InvalidCharacterOfProcessingInstruction);
1115                        self.create_processing_instruction_token();
1116                        self.state = State::PiTargetAfter;
1117                    }
1118                    None => {
1119                        self.emit_error(ErrorKind::EofInProcessingInstruction);
1120                        self.create_processing_instruction_token();
1121                        self.emit_current_processing_instruction();
1122                        self.reconsume_in_state(State::Data);
1123                    }
1124                    // U+003F QUESTION MARK(?)
1125                    // Emit error
1126                    // Reprocess the current input character in the pi end state (recovery mode).
1127                    Some('?') => {
1128                        self.emit_error(ErrorKind::NoTargetNameInProcessingInstruction);
1129                        self.create_processing_instruction_token();
1130                        self.state = State::PiEnd;
1131                    }
1132                    Some(c) => {
1133                        self.validate_input_stream_character(c);
1134                        self.create_processing_instruction_token();
1135                        self.set_processing_instruction_token(Some(c), None);
1136                        self.state = State::PiTarget;
1137                    }
1138                }
1139            }
1140            State::PiTarget => {
1141                // Consume the next input character:
1142                match self.consume_next_char() {
1143                    // U+0009 CHARACTER TABULATION (tab)
1144                    // U+000A LINE FEED (LF)
1145                    // U+0020 SPACE
1146                    // Switch to the pi target after state.
1147                    Some(c) if is_whitespace(c) => {
1148                        self.state = State::PiTargetAfter;
1149                    }
1150                    // EOF
1151                    // Parse error. Emit the current processing instruction token and then reprocess
1152                    // the current input character in the data state.
1153                    None => {
1154                        self.emit_error(ErrorKind::EofInProcessingInstruction);
1155                        self.emit_current_processing_instruction();
1156                        self.reconsume_in_state(State::Data);
1157                    }
1158                    // U+003F QUESTION MARK(?)
1159                    // Switch to the pi target question.
1160                    Some('?') => {
1161                        self.state = State::PiTargetQuestion;
1162                    }
1163                    // Anything else
1164                    // Append the current input character to the processing instruction target and
1165                    // stay in the current state.
1166                    Some(c) => {
1167                        self.validate_input_stream_character(c);
1168                        self.set_processing_instruction_token(Some(c), None);
1169                    }
1170                }
1171            }
1172            State::PiTargetQuestion => {
1173                // Consume the next input character:
1174                match self.consume_next_char() {
1175                    // U+003E GREATER-THAN SIGN (>)
1176                    Some('>') => {
1177                        self.reconsume_in_state(State::PiEnd);
1178                    }
1179                    _ => {
1180                        self.errors.push(Error::new(
1181                            Span::new(self.cur_pos - BytePos(1), self.input.cur_pos() - BytePos(1)),
1182                            ErrorKind::MissingWhitespaceBeforeQuestionInProcessingInstruction,
1183                        ));
1184                        self.set_processing_instruction_token(None, Some('?'));
1185                        self.reconsume_in_state(State::PiData);
1186                    }
1187                }
1188            }
1189            State::PiTargetAfter => {
1190                // Consume the next input character:
1191                match self.consume_next_char() {
1192                    // U+0009 CHARACTER TABULATION (Tab)
1193                    // U+000A LINE FEED (LF)
1194                    // U+0020 SPACE (Space)
1195                    // Stay in the current state.
1196                    Some(c) if is_whitespace(c) => {
1197                        self.skip_next_lf(c);
1198                    }
1199                    // Anything else
1200                    // Reprocess the current input character in the pi data state.
1201                    _ => {
1202                        self.reconsume_in_state(State::PiData);
1203                    }
1204                }
1205            }
1206            State::PiData => {
1207                // Consume the next input character:
1208                match self.consume_next_char() {
1209                    // U+003F QUESTION MARK(?)
1210                    // Switch to the pi end state.
1211                    Some('?') => {
1212                        self.state = State::PiEnd;
1213                    }
1214                    // EOF
1215                    // Parse error. Emit the current processing instruction token and then reprocess
1216                    // the current input character in the data state.
1217                    None => {
1218                        self.emit_error(ErrorKind::EofInProcessingInstruction);
1219                        self.emit_current_processing_instruction();
1220                        self.reconsume_in_state(State::Data);
1221                    }
1222                    // Anything else
1223                    // Append the current input character to the pi’s data and stay in the current
1224                    // state.
1225                    Some(c) => {
1226                        self.validate_input_stream_character(c);
1227                        self.set_processing_instruction_token(None, Some(c));
1228                    }
1229                }
1230            }
1231            State::PiEnd => {
1232                // Consume the next input character:
1233                match self.consume_next_char() {
1234                    // U+003E GREATER-THAN SIGN (>)
1235                    // Emit the current token and then switch to the data state.
1236                    Some('>') => {
1237                        self.emit_current_processing_instruction();
1238                        self.state = State::Data;
1239                    }
1240                    // EOF
1241                    // Parse error. Emit the current processing instruction token and then reprocess
1242                    // the current input character in the data state.
1243                    None => {
1244                        self.emit_error(ErrorKind::EofInProcessingInstruction);
1245                        self.emit_current_processing_instruction();
1246                        self.reconsume_in_state(State::Data);
1247                    }
1248                    // Anything else
1249                    // Reprocess the current input character in the pi data state.
1250                    _ => {
1251                        self.set_processing_instruction_token(None, Some('?'));
1252                        self.reconsume_in_state(State::PiData);
1253                    }
1254                }
1255            }
1256            State::MarkupDeclaration => {
1257                let cur_pos = self.input.cur_pos();
1258                let anything_else = |lexer: &mut Lexer<'a, I>| {
1259                    lexer.emit_error(ErrorKind::IncorrectlyOpenedComment);
1260                    lexer.create_comment_token(None, "<!");
1261                    lexer.state = State::BogusComment;
1262                    lexer.cur_pos = cur_pos;
1263                    // We don't validate input here because we reset position
1264                    unsafe {
1265                        // Safety: cur_pos is in the range of input
1266                        lexer.input.reset_to(cur_pos);
1267                    }
1268                };
1269
1270                // If the next few characters are:
1271                match self.consume_next_char() {
1272                    // Two U+002D HYPHEN-MINUS characters (-)
1273                    // Consume those two characters, create a comment token whose data is the empty
1274                    // string and switch to comment start state.
1275                    Some('-') => match self.consume_next_char() {
1276                        Some('-') => {
1277                            self.create_comment_token(None, "<!--");
1278                            self.state = State::CommentStart;
1279                        }
1280                        _ => {
1281                            anything_else(self);
1282                        }
1283                    },
1284                    // ASCII case-insensitive match for word "DOCTYPE"
1285                    // Consume those characters and switch to Doctype state
1286                    Some(d @ 'd' | d @ 'D') => match self.consume_next_char() {
1287                        Some(o @ 'o' | o @ 'O') => match self.consume_next_char() {
1288                            Some(c @ 'c' | c @ 'C') => match self.consume_next_char() {
1289                                Some(t @ 't' | t @ 'T') => match self.consume_next_char() {
1290                                    Some(y @ 'y' | y @ 'Y') => match self.consume_next_char() {
1291                                        Some(p @ 'p' | p @ 'P') => match self.consume_next_char() {
1292                                            Some(e @ 'e' | e @ 'E') => {
1293                                                self.state = State::Doctype;
1294
1295                                                let mut raw_keyword = String::with_capacity(9);
1296
1297                                                raw_keyword.push('<');
1298                                                raw_keyword.push('!');
1299                                                raw_keyword.push(d);
1300                                                raw_keyword.push(o);
1301                                                raw_keyword.push(c);
1302                                                raw_keyword.push(t);
1303                                                raw_keyword.push(y);
1304                                                raw_keyword.push(p);
1305                                                raw_keyword.push(e);
1306
1307                                                self.doctype_raw = Some(raw_keyword);
1308                                            }
1309                                            _ => {
1310                                                anything_else(self);
1311                                            }
1312                                        },
1313                                        _ => {
1314                                            anything_else(self);
1315                                        }
1316                                    },
1317                                    _ => {
1318                                        anything_else(self);
1319                                    }
1320                                },
1321                                _ => {
1322                                    anything_else(self);
1323                                }
1324                            },
1325                            _ => {
1326                                anything_else(self);
1327                            }
1328                        },
1329                        _ => {
1330                            anything_else(self);
1331                        }
1332                    },
1333                    // Exact match for the word "[CDATA[" (the five uppercase letters "CDATA"
1334                    // with a U+005B LEFT SQUARE BRACKET character before and after)
1335                    // Consume those characters and switch to CDATA state
1336                    Some('[') => match self.consume_next_char() {
1337                        Some(c @ 'C') => match self.consume_next_char() {
1338                            Some(d @ 'D') => match self.consume_next_char() {
1339                                Some(a1 @ 'A') => match self.consume_next_char() {
1340                                    Some(t @ 'T') => match self.consume_next_char() {
1341                                        Some(a2 @ 'A') => match self.consume_next_char() {
1342                                            Some('[') => {
1343                                                self.create_cdata_token();
1344                                                self.append_to_cdata_token(None, Some('<'));
1345                                                self.append_to_cdata_token(None, Some('!'));
1346                                                self.append_to_cdata_token(None, Some('['));
1347                                                self.append_to_cdata_token(None, Some(c));
1348                                                self.append_to_cdata_token(None, Some(d));
1349                                                self.append_to_cdata_token(None, Some(a1));
1350                                                self.append_to_cdata_token(None, Some(t));
1351                                                self.append_to_cdata_token(None, Some(a2));
1352                                                self.append_to_cdata_token(None, Some('['));
1353                                                self.state = State::Cdata;
1354                                            }
1355                                            _ => {
1356                                                anything_else(self);
1357                                            }
1358                                        },
1359                                        _ => {
1360                                            anything_else(self);
1361                                        }
1362                                    },
1363                                    _ => {
1364                                        anything_else(self);
1365                                    }
1366                                },
1367                                _ => {
1368                                    anything_else(self);
1369                                }
1370                            },
1371                            _ => {
1372                                anything_else(self);
1373                            }
1374                        },
1375                        _ => {
1376                            anything_else(self);
1377                        }
1378                    },
1379                    // Anything else
1380                    // Emit an error. Create a comment token whose data is an empty string. Switch
1381                    // to bogus comment state (don’t consume any characters)
1382                    _ => {
1383                        anything_else(self);
1384                    }
1385                }
1386            }
1387            State::CommentStart => {
1388                // Consume the next input character:
1389                match self.consume_next_char() {
1390                    // U+002D HYPHEN-MINUS (-)
1391                    // Switch to the comment start dash state.
1392                    Some('-') => {
1393                        self.state = State::CommentStartDash;
1394                    }
1395                    // U+003E GREATER-THAN SIGN (>)
1396                    // This is an abrupt-closing-of-empty-comment parse error. Switch to the
1397                    // data state. Emit the current comment token.
1398                    Some('>') => {
1399                        self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
1400                        self.state = State::Data;
1401                        self.emit_comment_token(Some(">"));
1402                    }
1403                    // Anything else
1404                    // Reconsume in the comment state.
1405                    _ => {
1406                        self.reconsume_in_state(State::Comment);
1407                    }
1408                }
1409            }
1410            State::CommentStartDash => {
1411                // Consume the next input character:
1412                match self.consume_next_char() {
1413                    // U+002D HYPHEN-MINUS (-)
1414                    // Switch to the comment end state.
1415                    Some('-') => {
1416                        self.state = State::CommentEnd;
1417                    }
1418                    // U+003E GREATER-THAN SIGN (>)
1419                    // This is an abrupt-closing-of-empty-comment parse error. Switch to the
1420                    // data state. Emit the current comment token.
1421                    Some('>') => {
1422                        self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
1423                        self.state = State::Data;
1424                        self.emit_comment_token(Some("->"));
1425                    }
1426                    // EOF
1427                    // This is an eof-in-comment parse error. Emit the current comment token.
1428                    // Emit an end-of-file token.
1429                    None => {
1430                        self.emit_error(ErrorKind::EofInComment);
1431                        self.emit_comment_token(None);
1432                        self.emit_token(Token::Eof);
1433
1434                        return Ok(());
1435                    }
1436                    // Anything else
1437                    // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
1438                    // Reconsume in the comment state.
1439                    _ => {
1440                        self.append_to_comment_token('-', '-');
1441                        self.reconsume_in_state(State::Comment);
1442                    }
1443                }
1444            }
1445            State::Comment => {
1446                // Consume the next input character:
1447                match self.consume_next_char() {
1448                    // U+003C LESS-THAN SIGN (<)
1449                    // Append the current input character to the comment token's data. Switch to
1450                    // the comment less-than sign state.
1451                    Some(c @ '<') => {
1452                        self.append_to_comment_token(c, c);
1453                        self.state = State::CommentLessThanSign;
1454                    }
1455                    // U+002D HYPHEN-MINUS (-)
1456                    // Switch to the comment end dash state.
1457                    Some('-') => {
1458                        self.state = State::CommentEndDash;
1459                    }
1460                    // EOF
1461                    // This is an eof-in-comment parse error. Emit the current comment token.
1462                    // Emit an end-of-file token.
1463                    None => {
1464                        self.emit_error(ErrorKind::EofInComment);
1465                        self.emit_comment_token(None);
1466                        self.emit_token(Token::Eof);
1467
1468                        return Ok(());
1469                    }
1470                    // Anything else
1471                    // Append the current input character to the comment token's data.
1472                    Some(c) => {
1473                        self.validate_input_stream_character(c);
1474                        self.handle_raw_and_append_to_comment_token(c);
1475                    }
1476                }
1477            }
1478            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
1479            State::CommentLessThanSign => {
1480                // Consume the next input character:
1481                match self.consume_next_char() {
1482                    // U+0021 EXCLAMATION MARK (!)
1483                    // Append the current input character to the comment token's data. Switch to
1484                    // the comment less-than sign bang state.
1485                    Some(c @ '!') => {
1486                        self.append_to_comment_token(c, c);
1487                        self.state = State::CommentLessThanSignBang;
1488                    }
1489                    // U+003C LESS-THAN SIGN (<)
1490                    // Append the current input character to the comment token's data.
1491                    Some(c @ '<') => {
1492                        self.append_to_comment_token(c, c);
1493                    }
1494                    // Anything else
1495                    // Reconsume in the comment state.
1496                    _ => {
1497                        self.reconsume_in_state(State::Comment);
1498                    }
1499                }
1500            }
1501            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
1502            State::CommentLessThanSignBang => {
1503                // Consume the next input character:
1504                match self.consume_next_char() {
1505                    // U+002D HYPHEN-MINUS (-)
1506                    // Switch to the comment less-than sign bang dash state.
1507                    Some('-') => {
1508                        self.state = State::CommentLessThanSignBangDash;
1509                    }
1510                    // Anything else
1511                    // Reconsume in the comment state.
1512                    _ => {
1513                        self.reconsume_in_state(State::Comment);
1514                    }
1515                }
1516            }
1517            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
1518            State::CommentLessThanSignBangDash => {
1519                // Consume the next input character:
1520                match self.consume_next_char() {
1521                    // U+002D HYPHEN-MINUS (-)
1522                    // Switch to the comment less-than sign bang dash dash state.
1523                    Some('-') => {
1524                        self.state = State::CommentLessThanSignBangDashDash;
1525                    }
1526                    // Anything else
1527                    // Reconsume in the comment end dash state.
1528                    _ => {
1529                        self.reconsume_in_state(State::CommentEndDash);
1530                    }
1531                }
1532            }
1533            // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
1534            State::CommentLessThanSignBangDashDash => {
1535                // Consume the next input character:
1536                match self.consume_next_char() {
1537                    // U+003E GREATER-THAN SIGN (>)
1538                    // EOF
1539                    // Reconsume in the comment end state.
1540                    Some('>') | None => {
1541                        self.reconsume_in_state(State::CommentEnd);
1542                    }
1543                    // Anything else
1544                    // This is a nested-comment parse error. Reconsume in the comment end state.
1545                    _ => {
1546                        self.emit_error(ErrorKind::NestedComment);
1547                        self.reconsume_in_state(State::CommentEnd);
1548                    }
1549                }
1550            }
1551            // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
1552            State::CommentEndDash => {
1553                // Consume the next input character:
1554                match self.consume_next_char() {
1555                    // U+002D HYPHEN-MINUS (-)
1556                    // Switch to the comment end state.
1557                    Some('-') => {
1558                        self.state = State::CommentEnd;
1559                    }
1560                    // EOF
1561                    // This is an eof-in-comment parse error. Emit the current comment token.
1562                    // Emit an end-of-file token.
1563                    None => {
1564                        self.emit_error(ErrorKind::EofInComment);
1565                        self.emit_comment_token(None);
1566                        self.emit_token(Token::Eof);
1567
1568                        return Ok(());
1569                    }
1570                    // Anything else
1571                    // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
1572                    // Reconsume in the comment state.
1573                    _ => {
1574                        self.append_to_comment_token('-', '-');
1575                        self.reconsume_in_state(State::Comment);
1576                    }
1577                }
1578            }
1579            // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
1580            State::CommentEnd => {
1581                // Consume the next input character:
1582                match self.consume_next_char() {
1583                    // U+003E GREATER-THAN SIGN (>)
1584                    // Switch to the data state. Emit the current comment token.
1585                    Some('>') => {
1586                        self.state = State::Data;
1587                        self.emit_comment_token(Some("-->"));
1588                    }
1589                    // U+0021 EXCLAMATION MARK (!)
1590                    // Switch to the comment end bang state.
1591                    Some('!') => {
1592                        self.state = State::CommentEndBang;
1593                    }
1594                    // U+002D HYPHEN-MINUS (-)
1595                    // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
1596                    Some(c @ '-') => {
1597                        self.append_to_comment_token(c, c);
1598                        self.emit_error(ErrorKind::DoubleHyphenWithInComment);
1599                    }
1600                    // EOF
1601                    // This is an eof-in-comment parse error. Emit the current comment token.
1602                    // Emit an end-of-file token.
1603                    None => {
1604                        self.emit_error(ErrorKind::EofInComment);
1605                        self.emit_comment_token(None);
1606                        self.emit_token(Token::Eof);
1607
1608                        return Ok(());
1609                    }
1610                    // Anything else
1611                    // Append two U+002D (-) characters and the current input character to the
1612                    // comment token’s data. Reconsume in the comment state.
1613                    _ => {
1614                        self.append_to_comment_token('-', '-');
1615                        self.append_to_comment_token('-', '-');
1616                        self.reconsume_in_state(State::Comment);
1617                    }
1618                }
1619            }
1620            // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
1621            State::CommentEndBang => {
1622                // Consume the next input character:
1623                match self.consume_next_char() {
1624                    // U+002D HYPHEN-MINUS (-)
1625                    // Append a U+002D HYPHEN-MINUS character (-) and U+0021 EXCLAMATION MARK
1626                    // character(!) to the comment token’s data. Switch to the comment end dash
1627                    // state.
1628                    Some('-') => {
1629                        self.append_to_comment_token('-', '-');
1630                        self.append_to_comment_token('!', '!');
1631                        self.state = State::CommentEndDash;
1632                    }
1633                    // U+003E GREATER-THAN SIGN (>)
1634                    // Parse error. Switch to the data state. Emit the comment token.
1635                    Some('>') => {
1636                        self.emit_error(ErrorKind::IncorrectlyClosedComment);
1637                        self.state = State::Data;
1638                        self.emit_comment_token(Some(">"));
1639                    }
1640                    // EOF
1641                    // Parse error. Emit the comment token. Emit an end-of-file token.
1642                    None => {
1643                        self.emit_error(ErrorKind::EofInComment);
1644                        self.emit_comment_token(None);
1645                        self.emit_token(Token::Eof);
1646
1647                        return Ok(());
1648                    }
1649                    // Anything else
1650                    // Append two U+002D (-) characters and a U+0021 EXCLAMATION MARK
1651                    // character (!) to the comment token’s data. Reconsume in the comment
1652                    // state.
1653                    _ => {
1654                        self.append_to_comment_token('-', '-');
1655                        self.append_to_comment_token('-', '-');
1656                        self.append_to_comment_token('!', '!');
1657                        self.reconsume_in_state(State::Comment);
1658                    }
1659                }
1660            }
1661            State::Cdata => {
1662                // Consume the next input character:
1663                match self.consume_next_char() {
1664                    // U+005D RIGHT SQUARE BRACKET (])
1665                    // Switch to the CDATA bracket state.
1666                    Some(']') => {
1667                        self.state = State::CdataBracket;
1668                    }
1669                    // EOF
1670                    // Parse error. Reprocess the current input character in the data state.
1671                    None => {
1672                        self.emit_error(ErrorKind::EofInCdata);
1673                        self.reconsume_in_state(State::Data);
1674                    }
1675                    // Anything else
1676                    // Append the current input character to the cdata data. Stay in the current
1677                    // state.
1678                    Some(c) => {
1679                        self.validate_input_stream_character(c);
1680                        self.append_to_cdata_token(Some(c), Some(c));
1681                    }
1682                }
1683            }
1684            State::CdataBracket => {
1685                // Consume the next input character:
1686                match self.consume_next_char() {
1687                    // U+005D RIGHT SQUARE BRACKET (])
1688                    // Switch to the CDATA end state.
1689                    Some(']') => {
1690                        self.state = State::CdataEnd;
1691                    }
1692                    // EOF
1693                    // Parse error. Reconsume the current input character in the data state.
1694                    None => {
1695                        self.emit_error(ErrorKind::EofInCdata);
1696                        self.reconsume_in_state(State::Data);
1697                    }
1698                    // Anything else
1699                    // Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
1700                    // CDATA section state.
1701                    Some(c) => {
1702                        self.append_to_cdata_token(Some(']'), Some(']'));
1703                        self.append_to_cdata_token(Some(c), Some(c));
1704                        self.state = State::Cdata;
1705                    }
1706                }
1707            }
1708            State::CdataEnd => {
1709                // Consume the next input character:
1710                match self.consume_next_char() {
1711                    // U+003E GREATER-THAN SIGN (>)
1712                    // Switch to the data state.
1713                    Some('>') => {
1714                        self.append_to_cdata_token(None, Some(']'));
1715                        self.append_to_cdata_token(None, Some(']'));
1716                        self.append_to_cdata_token(None, Some('>'));
1717                        self.emit_cdata_token();
1718                        self.state = State::Data;
1719                    }
1720                    // U+005D RIGHT SQUARE BRACKET (])
1721                    // Emit the current input character as character token. Stay in the current
1722                    // state.
1723                    Some(c @ ']') => {
1724                        self.append_to_cdata_token(Some(c), Some(c));
1725                    }
1726                    // EOF
1727                    // Parse error. Reconsume the current input character in the data state.
1728                    None => {
1729                        self.emit_error(ErrorKind::EofInCdata);
1730                        self.reconsume_in_state(State::Data);
1731                    }
1732                    // Anything else
1733                    // Emit two U+005D RIGHT SQUARE BRACKET (]) characters as character tokens and
1734                    // also emit the current input character as character token. Switch to the CDATA
1735                    // state.
1736                    Some(c) => {
1737                        self.append_to_cdata_token(Some(']'), Some(']'));
1738                        self.append_to_cdata_token(Some(']'), Some(']'));
1739                        self.append_to_cdata_token(Some(c), Some(c));
1740                        self.state = State::Cdata;
1741                    }
1742                }
1743            }
1744            State::TagOpen => {
1745                // Consume the next input character:
1746                match self.consume_next_char() {
1747                    // U+002F SOLIDUS (/)
1748                    // Switch to the end tag open state.
1749                    Some('/') => {
1750                        self.state = State::EndTagOpen;
1751                    }
1752                    // U+0021 EXCLAMATION MARK (!)
1753                    // Switch to the markup declaration open state.
1754                    Some('!') => {
1755                        self.state = State::MarkupDeclaration;
1756                    }
1757                    // U+003F QUESTION MARK(?)
1758                    // Switch to the pi state.
1759                    Some('?') => {
1760                        self.state = State::Pi;
1761                    }
1762                    // Name start character
1763                    // Create a new tag token and set its name to the input character, then switch
1764                    // to the tag name state.
1765                    Some(c) if is_name_start_char(c) => {
1766                        self.create_tag_token(TagKind::Start);
1767                        self.reconsume_in_state(State::TagName);
1768                    }
1769                    // EOF
1770                    // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1771                    // character token and an end-of-file token.
1772                    None => {
1773                        self.emit_error(ErrorKind::EofBeforeTagName);
1774                        self.emit_character_token(('<', '<'));
1775                        self.emit_token(Token::Eof);
1776
1777                        return Ok(());
1778                    }
1779                    // Anything else
1780                    // This is an invalid-first-character-of-tag-name parse error. Emit a U+003C
1781                    // LESS-THAN SIGN character token. Reconsume in the data state.
1782                    _ => {
1783                        self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1784                        self.emit_character_token(('<', '<'));
1785                        self.reconsume_in_state(State::Data);
1786                    }
1787                }
1788            }
1789            State::EndTagOpen => {
1790                // Consume the next input character:
1791                match self.consume_next_char() {
1792                    // Name character
1793                    // Create a new end tag token, set its tag name to the empty string.
1794                    // Reconsume in the tag name state.
1795                    Some(c) if is_name_char(c) => {
1796                        self.create_tag_token(TagKind::End);
1797                        self.reconsume_in_state(State::TagName);
1798                    }
1799                    // U+003E GREATER-THAN SIGN (>)
1800                    // This is a missing-end-tag-name parse error. Switch to the data state.
1801                    Some('>') => {
1802                        self.emit_error(ErrorKind::MissingEndTagName);
1803                        self.state = State::Data;
1804                    }
1805                    // EOF
1806                    // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1807                    // character token, a U+002F SOLIDUS character token and an end-of-file
1808                    // token.
1809                    None => {
1810                        self.emit_error(ErrorKind::EofBeforeTagName);
1811                        self.emit_character_token(('<', '<'));
1812                        self.emit_character_token(('/', '/'));
1813                        self.emit_token(Token::Eof);
1814
1815                        return Ok(());
1816                    }
1817                    // Anything else
1818                    // This is an invalid-first-character-of-tag-name parse error. Create a
1819                    // comment token whose data is the empty string. Reconsume in the bogus
1820                    // comment state.
1821                    _ => {
1822                        self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1823                        self.emit_character_token(('<', '<'));
1824                        self.emit_character_token(('/', '/'));
1825                        self.reconsume_in_state(State::BogusComment);
1826                    }
1827                }
1828            }
1829            State::TagName => {
1830                // Consume the next input character:
1831                match self.consume_next_char() {
1832                    // U+0009 CHARACTER TABULATION (Tab)
1833                    // U+000A LINE FEED (LF)
1834                    // U+0020 SPACE (Space)
1835                    // Switch to the before attribute name state.
1836                    Some(c) if is_whitespace(c) => {
1837                        self.skip_next_lf(c);
1838                        self.state = State::TagAttributeNameBefore;
1839                    }
1840                    // U+002F SOLIDUS (/)
1841                    // Set current tag to empty tag. Switch to the empty tag state.
1842                    Some('/') => {
1843                        self.set_tag_to_empty_tag();
1844                        self.state = State::EmptyTag;
1845                    }
1846                    // U+003E GREATER-THAN SIGN (>)
1847                    // Switch to the data state. Emit the current tag token.
1848                    Some('>') => {
1849                        self.state = State::Data;
1850                        self.emit_tag_token(None);
1851                    }
1852                    // EOF
1853                    // This is an eof-in-tag parse error. Emit an end-of-file token.
1854                    None => {
1855                        self.emit_error(ErrorKind::EofInTag);
1856                        self.emit_tag_token(None);
1857
1858                        return Ok(());
1859                    }
1860                    // Name character
1861                    // Append the current input character to the tag name and stay in the current
1862                    // state.
1863                    Some(c) if is_name_char(c) => {
1864                        self.validate_input_stream_character(c);
1865                        self.append_to_tag_token_name(c);
1866                    }
1867                    // Anything else
1868                    // Parse error. Append the current input character to the tag name and stay in
1869                    // the current state.
1870                    Some(c) => {
1871                        self.emit_error(ErrorKind::InvalidCharacterInTag);
1872                        self.validate_input_stream_character(c);
1873                        self.append_to_tag_token_name(c);
1874                    }
1875                }
1876            }
1877            State::EmptyTag => {
1878                // Consume the next input character:
1879                match self.consume_next_char() {
1880                    // U+003E GREATER-THAN SIGN (>)
1881                    // Emit the current tag token as empty tag token and then switch to the data
1882                    // state.
1883                    Some('>') => {
1884                        self.emit_tag_token(Some(TagKind::Empty));
1885                        self.state = State::Data;
1886                    }
1887                    // Anything else
1888                    // Parse error. Reprocess the current input character in the tag attribute name
1889                    // before state.
1890                    _ => {
1891                        self.emit_error(ErrorKind::UnexpectedSolidusInTag);
1892                        self.reconsume_in_state(State::TagAttributeNameBefore);
1893                    }
1894                }
1895            }
1896            State::TagAttributeNameBefore => {
1897                // Consume the next input character:
1898                match self.consume_next_char() {
1899                    // U+0009 CHARACTER TABULATION (tab)
1900                    // U+000A LINE FEED (LF)
1901                    // U+0020 SPACE
1902                    // Ignore the character.
1903                    Some(c) if is_whitespace(c) => {
1904                        self.skip_next_lf(c);
1905                    }
1906                    // U+003E GREATER-THAN SIGN(>)
1907                    // Emit the current token and then switch to the data state.
1908                    Some('>') => {
1909                        self.emit_tag_token(None);
1910                        self.state = State::Data;
1911                    }
1912                    // U+002F SOLIDUS (/)
1913                    // Set current tag to empty tag. Switch to the empty tag state.
1914                    Some('/') => {
1915                        self.set_tag_to_empty_tag();
1916                        self.state = State::EmptyTag;
1917                    }
1918                    // U+003A COLON (:)
1919                    // Parse error. Stay in the current state.
1920                    Some(':') => {
1921                        self.emit_error(ErrorKind::UnexpectedColonBeforeAttributeName);
1922                    }
1923                    // EOF
1924                    // Parse error. Emit the current token and then reprocess the current input
1925                    // character in the data state.
1926                    None => {
1927                        self.emit_error(ErrorKind::EofBeforeTagName);
1928                        self.emit_tag_token(None);
1929                        self.reconsume_in_state(State::Data);
1930                    }
1931                    // Anything else
1932                    // Start a new attribute in the current tag token. Set that attribute’s name to
1933                    // the current input character and its value to the empty string and then switch
1934                    // to the tag attribute name state.
1935                    _ => {
1936                        self.start_new_attribute(None);
1937                        self.reconsume_in_state(State::TagAttributeName);
1938                    }
1939                }
1940            }
1941            State::TagAttributeName => {
1942                // Consume the next input character:
1943                match self.consume_next_char() {
1944                    // U+003D EQUALS SIGN (=)
1945                    // Switch to the before attribute value state.
1946                    Some('=') => {
1947                        self.state = State::TagAttributeValueBefore;
1948                    }
1949                    // U+003E GREATER-THAN SIGN (>)
1950                    // Emit the current token as start tag token. Switch to the data state.
1951                    Some('>') => {
1952                        self.emit_error(ErrorKind::MissingEqualAfterAttributeName);
1953                        self.emit_tag_token(None);
1954                        self.state = State::Data;
1955                    }
1956                    // U+0009 CHARACTER TABULATION (Tab)
1957                    // U+000A LINE FEED (LF)
1958                    // U+0020 SPACE (Space)
1959                    // Switch to the tag attribute name after state.
1960                    Some(c) if is_whitespace(c) => {
1961                        self.update_attribute_span();
1962                        self.skip_next_lf(c);
1963                        self.reconsume_in_state(State::TagAttributeNameAfter);
1964                    }
1965                    // U+002F SOLIDUS (/)
1966                    // Set current tag to empty tag. Switch to the empty tag state.
1967                    Some('/') => {
1968                        self.emit_error(ErrorKind::MissingEqualAfterAttributeName);
1969                        self.set_tag_to_empty_tag();
1970                        self.state = State::EmptyTag;
1971                    }
1972                    // EOF
1973                    // Parse error. Emit the current token as start tag token and then reprocess the
1974                    // current input character in the data state.
1975                    None => {
1976                        self.emit_error(ErrorKind::EofInTag);
1977                        self.emit_tag_token(Some(TagKind::Start));
1978                        self.reconsume_in_state(State::Data);
1979                    }
1980                    // Anything else
1981                    // Append the current input character to the current attribute's name.
1982                    Some(c) => {
1983                        self.validate_input_stream_character(c);
1984                        self.append_to_attribute(Some((c, c)), None);
1985                    }
1986                }
1987
1988                // When the user agent leaves the attribute name state (and
1989                // before emitting the tag token, if appropriate), the
1990                // complete attribute's name must be compared to the other
1991                // attributes on the same token; if there is already an
1992                // attribute on the token with the exact same name, then
1993                // this is a duplicate-attribute parse error and the new
1994                // attribute must be removed from the token.
1995                //
1996                // We postpone it when we will emit current tag token
1997            }
1998            State::TagAttributeNameAfter => {
1999                // Consume the next input character:
2000                match self.consume_next_char() {
2001                    // U+0009 CHARACTER TABULATION (tab)
2002                    // U+000A LINE FEED (LF)
2003                    // U+0020 SPACE
2004                    // Ignore the character.
2005                    Some(c) if is_whitespace(c) => {
2006                        self.skip_next_lf(c);
2007                    }
2008                    // U+003D EQUALS SIGN(=)
2009                    // Switch to the tag attribute value before state.
2010                    Some('=') => {
2011                        self.state = State::TagAttributeValueBefore;
2012                    }
2013                    // U+003E GREATER-THAN SIGN(>)
2014                    // Emit the current token and then switch to the data state.
2015                    Some('>') => {
2016                        self.emit_tag_token(None);
2017                        self.state = State::Data;
2018                    }
2019                    // U+002F SOLIDUS (/)
2020                    // Set current tag to empty tag. Switch to the empty tag state.
2021                    Some('/') => {
2022                        self.set_tag_to_empty_tag();
2023                        self.state = State::EmptyTag;
2024                    }
2025                    // EOF
2026                    // Parse error. Emit the current token and then reprocess the current input
2027                    // character in the data state.
2028                    None => {
2029                        self.emit_error(ErrorKind::EofInTag);
2030                        self.emit_tag_token(None);
2031                        self.reconsume_in_state(State::Data);
2032                    }
2033                    // Anything else
2034                    // Start a new attribute in the current tag token. Set that attribute’s name to
2035                    // the current input character and its value to the empty string and then switch
2036                    // to the tag attribute name state.
2037                    Some(c) => {
2038                        self.emit_error(ErrorKind::MissingEqualAfterAttributeName);
2039                        self.validate_input_stream_character(c);
2040                        self.start_new_attribute(Some(c));
2041                        self.state = State::TagAttributeName;
2042                    }
2043                }
2044            }
2045            State::TagAttributeValueBefore => {
2046                // Consume the next input character:
2047                match self.consume_next_char() {
2048                    // U+0009 CHARACTER TABULATION (tab)
2049                    // U+000A LINE FEED (LF)
2050                    // U+0020 SPACE
2051                    // Ignore the character.
2052                    Some(c) if is_whitespace(c) => {
2053                        self.skip_next_lf(c);
2054                    }
2055                    // U+0022 QUOTATION MARK (")
2056                    // Switch to the attribute value (double-quoted) state.
2057                    Some(c @ '"') => {
2058                        self.append_to_attribute(None, Some((true, None, Some(c))));
2059                        self.state = State::TagAttributeValueDoubleQuoted;
2060                    }
2061                    // U+0027 APOSTROPHE (')
2062                    // Switch to the attribute value (single-quoted) state.
2063                    Some(c @ '\'') => {
2064                        self.append_to_attribute(None, Some((true, None, Some(c))));
2065                        self.state = State::TagAttributeValueSingleQuoted;
2066                    }
2067                    // U+003E GREATER-THAN SIGN(>)
2068                    // Emit the current token and then switch to the data state.
2069                    Some('>') => {
2070                        self.emit_tag_token(None);
2071                        self.state = State::Data;
2072                    }
2073                    // EOF
2074                    // Parse error. Emit the current token and then reprocess the current input
2075                    // character in the data state.
2076                    None => {
2077                        self.emit_error(ErrorKind::EofInTag);
2078                        self.emit_tag_token(None);
2079                        self.reconsume_in_state(State::Data);
2080                    }
2081                    // Anything else
2082                    // Append the current input character to the current attribute’s value and then
2083                    // switch to the tag attribute value unquoted state.
2084                    Some(c) => {
2085                        self.emit_error(ErrorKind::MissingQuoteBeforeAttributeValue);
2086                        self.validate_input_stream_character(c);
2087                        self.append_to_attribute(None, Some((true, Some(c), Some(c))));
2088                        self.state = State::TagAttributeValueUnquoted;
2089                    }
2090                }
2091            }
2092            State::TagAttributeValueDoubleQuoted => {
2093                // Consume the next input character:
2094                match self.consume_next_char() {
2095                    // U+0022 QUOTATION MARK (")
2096                    // Switch to the tag attribute value after state.
2097                    // We set value to support empty attributes (i.e. `attr=""`)
2098                    Some(c @ '"') => {
2099                        self.append_to_attribute(None, Some((false, None, Some(c))));
2100                        self.state = State::TagAttributeValueAfter;
2101                    }
2102                    // U+0026 AMPERSAND (&)
2103                    // Switch to character reference in attribute value state, with the additional
2104                    // allowed character being U+0022 QUOTATION MARK(").
2105                    Some('&') => {
2106                        self.return_state = Some(self.state.clone());
2107                        self.state = State::CharacterReferenceInAttributeValue;
2108                        self.additional_allowed_character = Some('"');
2109                    }
2110                    // (<)
2111                    Some(c @ '<') => {
2112                        self.emit_error(ErrorKind::UnescapedCharacterInAttributeValue('<'));
2113                        self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2114                    }
2115                    // EOF
2116                    // Parse error. Emit the current token and then reprocess the current input
2117                    // character in the data state.
2118                    None => {
2119                        self.emit_error(ErrorKind::EofInTag);
2120                        self.emit_tag_token(None);
2121                        self.reconsume_in_state(State::Data);
2122                    }
2123                    // Anything else
2124                    // Append the input character to the current attribute’s value. Stay in the
2125                    // current state.
2126                    Some(c) => {
2127                        self.validate_input_stream_character(c);
2128                        self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2129                    }
2130                }
2131            }
2132            State::TagAttributeValueSingleQuoted => {
2133                // Consume the next input character:
2134                match self.consume_next_char() {
2135                    // U+0027 APOSTROPHE (')
2136                    // Switch to the tag attribute value after state.
2137                    // We set value to support empty attributes (i.e. `attr=''`)
2138                    Some(c @ '\'') => {
2139                        self.append_to_attribute(None, Some((false, None, Some(c))));
2140                        self.state = State::TagAttributeValueAfter;
2141                    }
2142                    // U+0026 AMPERSAND (&)
2143                    // Switch to character reference in attribute value state, with the additional
2144                    // allowed character being APOSTROPHE (').
2145                    Some('&') => {
2146                        self.return_state = Some(self.state.clone());
2147                        self.state = State::CharacterReferenceInAttributeValue;
2148                        self.additional_allowed_character = Some('\'');
2149                    }
2150                    // (<)
2151                    Some(c @ '<') => {
2152                        self.emit_error(ErrorKind::UnescapedCharacterInAttributeValue('<'));
2153                        self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2154                    }
2155                    // EOF
2156                    // Parse error. Emit the current token and then reprocess the current input
2157                    // character in the data state.
2158                    None => {
2159                        self.emit_error(ErrorKind::EofInTag);
2160                        self.emit_tag_token(None);
2161                        self.reconsume_in_state(State::Data);
2162                    }
2163                    // Anything else
2164                    // Append the current input character to the current attribute's value.
2165                    Some(c) => {
2166                        self.validate_input_stream_character(c);
2167                        self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2168                    }
2169                }
2170            }
2171            State::TagAttributeValueUnquoted => {
2172                // Consume the next input character:
2173                match self.consume_next_char() {
2174                    // U+0009 CHARACTER TABULATION (Tab)
2175                    // U+000A LINE FEED (LF)
2176                    // U+0020 SPACE (Space)
2177                    // Switch to the tag attribute value after state.
2178                    Some(c) if is_whitespace(c) => {
2179                        self.update_attribute_span();
2180                        self.skip_next_lf(c);
2181                        self.state = State::TagAttributeValueAfter;
2182                    }
2183                    // U+0026 AMPERSAND (&)
2184                    // Set the return state to the attribute value (unquoted) state. Switch to
2185                    // the character reference state.
2186                    Some('&') => {
2187                        self.return_state = Some(self.state.clone());
2188                        self.state = State::CharacterReferenceInAttributeValue;
2189                        self.additional_allowed_character = Some('>');
2190                    }
2191                    // (<)
2192                    Some(c @ '<') => {
2193                        self.emit_error(ErrorKind::UnescapedCharacterInAttributeValue('<'));
2194                        self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2195                    }
2196                    // U+003E GREATER-THAN SIGN (>)
2197                    // Emit the current token as start tag token and then switch to the data state.
2198                    Some('>') => {
2199                        self.update_attribute_span();
2200                        self.emit_tag_token(Some(TagKind::Start));
2201                        self.state = State::Data;
2202                    }
2203                    // EOF
2204                    // Parse error. Emit the current token as start tag token and then reprocess the
2205                    // current input character in the data state.
2206                    None => {
2207                        self.emit_error(ErrorKind::EofInTag);
2208                        self.update_attribute_span();
2209                        self.emit_tag_token(Some(TagKind::Start));
2210                        self.reconsume_in_state(State::Data);
2211                    }
2212                    // Anything else
2213                    // Append the input character to the current attribute’s value. Stay in the
2214                    // current state.
2215                    Some(c) => {
2216                        self.validate_input_stream_character(c);
2217                        self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2218                    }
2219                }
2220            }
2221            State::TagAttributeValueAfter => match self.consume_next_char() {
2222                Some(c) if is_whitespace(c) => {
2223                    self.reconsume_in_state(State::TagAttributeNameBefore);
2224                }
2225                Some('>') | Some('/') => {
2226                    self.reconsume_in_state(State::TagAttributeNameBefore);
2227                }
2228                None => {
2229                    self.emit_error(ErrorKind::EofInTag);
2230                    self.update_attribute_span();
2231                    self.emit_tag_token(Some(TagKind::Start));
2232                    self.reconsume_in_state(State::Data);
2233                }
2234                _ => {
2235                    self.emit_error(ErrorKind::MissingSpaceBetweenAttributes);
2236                    self.reconsume_in_state(State::TagAttributeNameBefore);
2237                }
2238            },
2239            State::CharacterReferenceInAttributeValue => {
2240                // Attempt to consume a character reference.
2241                //
2242                // If nothing is returned, append a U+0026 AMPERSAND (&) character to current
2243                // attribute’s value.
2244                //
2245                // Otherwise append returned character tokens to current attribute’s value.
2246                //
2247                // Finally, switch back to attribute value state that switched to this state.
2248
2249                let character_reference = self.consume_character_reference();
2250
2251                if let Some((c, raw)) = character_reference {
2252                    self.append_to_attribute_with_entity(Some((Some(c), Some(&raw))));
2253                } else {
2254                    self.append_to_attribute(None, Some((false, Some('&'), Some('&'))));
2255                }
2256
2257                if let Some(return_state) = &self.return_state {
2258                    self.state = return_state.clone();
2259                }
2260            }
2261            State::BogusComment => {
2262                // Consume every character up to the first U+003E GREATER-THAN SIGN (>) or EOF,
2263                // whichever comes first. Emit a comment token whose data is the concatenation
2264                // of all those consumed characters. Then consume the next input character and
2265                // switch to the data state reprocessing the EOF character if that was the
2266                // character consumed.
2267                match self.consume_next_char() {
2268                    // U+003E GREATER-THAN SIGN (>)
2269                    // Switch to the data state. Emit the current comment token.
2270                    Some('>') => {
2271                        self.emit_comment_token(Some(">"));
2272                        self.state = State::Data;
2273                    }
2274                    // EOF
2275                    // Emit the comment. Emit an end-of-file token.
2276                    None => {
2277                        self.emit_comment_token(None);
2278                        self.state = State::Data;
2279                        self.reconsume();
2280                    }
2281                    // Anything else
2282                    // Append the current input character to the comment token's data.
2283                    Some(c) => {
2284                        self.validate_input_stream_character(c);
2285                        self.handle_raw_and_append_to_comment_token(c);
2286                    }
2287                }
2288            }
2289            State::Doctype => {
2290                // Consume the next input character:
2291                match self.consume_next_char() {
2292                    // U+0009 CHARACTER TABULATION (tab)
2293                    // U+000A LINE FEED (LF)
2294                    // U+000C FORM FEED (FF)
2295                    // U+0020 SPACE
2296                    // Switch to the before DOCTYPE name state.
2297                    Some(c) if is_whitespace(c) => {
2298                        self.append_raw_to_doctype_token(c);
2299                        self.state = State::BeforeDoctypeName;
2300                    }
2301                    // EOF
2302                    // Parse error. Switch to data state. Create new Doctype token. Emit Doctype
2303                    // token. Reconsume the EOF character.
2304                    None => {
2305                        self.emit_error(ErrorKind::EofInDoctype);
2306                        self.state = State::Data;
2307                        self.create_doctype_token(None);
2308                        self.emit_doctype_token();
2309                        self.reconsume();
2310                    }
2311                    // Anything else
2312                    // This is a missing-whitespace-before-doctype-name parse error. Reconsume
2313                    // in the before DOCTYPE name state.
2314                    _ => {
2315                        self.emit_error(ErrorKind::MissingWhitespaceBeforeDoctypeName);
2316                        self.reconsume_in_state(State::BeforeDoctypeName);
2317                    }
2318                }
2319            }
2320            State::BeforeDoctypeName => {
2321                // Consume the next input character:
2322                match self.consume_next_char() {
2323                    // U+0009 CHARACTER TABULATION (tab)
2324                    // U+000A LINE FEED (LF)
2325                    // U+000C FORM FEED (FF)
2326                    // U+0020 SPACE
2327                    // Ignore the character.
2328                    Some(c) if is_whitespace(c) => {
2329                        self.append_raw_to_doctype_token(c);
2330                    }
2331                    // Uppercase ASCII letter
2332                    // Create a new DOCTYPE token. Set the token name to lowercase version of the
2333                    // current input character. Switch to the DOCTYPE name state.
2334                    Some(c) if is_ascii_upper_alpha(c) => {
2335                        self.append_raw_to_doctype_token(c);
2336                        self.create_doctype_token(Some(c.to_ascii_lowercase()));
2337                        self.state = State::DoctypeName;
2338                    }
2339                    // U+003E GREATER-THAN SIGN (>)
2340                    // This is a missing-doctype-name parse error. Create a new DOCTYPE token.
2341                    // Set its force-quirks flag to on. Switch to the data state. Emit the
2342                    // current token.
2343                    Some(c @ '>') => {
2344                        self.append_raw_to_doctype_token(c);
2345                        self.emit_error(ErrorKind::MissingDoctypeName);
2346                        self.create_doctype_token(None);
2347                        self.emit_doctype_token();
2348                        self.state = State::Data;
2349                    }
2350                    // EOF
2351                    // Parse error. Switch to data state. Create new Doctype token. Emit Doctype
2352                    // token. Reconsume the EOF character.
2353                    None => {
2354                        self.emit_error(ErrorKind::EofInDoctype);
2355                        self.state = State::Data;
2356                        self.create_doctype_token(None);
2357                        self.emit_doctype_token();
2358                        self.reconsume();
2359                    }
2360                    // Anything else
2361                    // Create new DOCTYPE token. Set the token’s name to current input character.
2362                    // Switch to DOCTYPE name state.
2363                    Some(c) => {
2364                        self.validate_input_stream_character(c);
2365                        self.append_raw_to_doctype_token(c);
2366                        self.create_doctype_token(Some(c));
2367                        self.state = State::DoctypeName;
2368                    }
2369                }
2370            }
2371            State::DoctypeName => {
2372                // Consume the next input character:
2373                match self.consume_next_char() {
2374                    // U+0009 CHARACTER TABULATION (tab)
2375                    // U+000A LINE FEED (LF)
2376                    // U+000C FORM FEED (FF)
2377                    // U+0020 SPACE
2378                    // Switch to the after DOCTYPE name state.
2379                    Some(c) if is_whitespace(c) => {
2380                        self.append_raw_to_doctype_token(c);
2381                        self.state = State::AfterDoctypeName;
2382                    }
2383                    // ASCII upper alpha
2384                    // Append the lowercase version of the current input character (add 0x0020
2385                    // to the character's code point) to the current DOCTYPE token's name.
2386                    Some(c) if is_ascii_upper_alpha(c) => {
2387                        self.append_raw_to_doctype_token(c);
2388                        self.append_to_doctype_token(Some(c.to_ascii_lowercase()), None, None);
2389                    }
2390                    // U+003E GREATER-THAN SIGN (>)
2391                    // Emit token. Switch to data state.
2392                    Some('>') => {
2393                        self.emit_doctype_token();
2394                        self.state = State::Data;
2395                    }
2396                    // EOF
2397                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2398                    // character.
2399                    None => {
2400                        self.emit_error(ErrorKind::EofInDoctype);
2401                        self.state = State::Data;
2402                        self.emit_doctype_token();
2403                        self.reconsume();
2404                    }
2405                    // Anything else
2406                    // Append the current input character to the current DOCTYPE token's name.
2407                    Some(c) => {
2408                        self.validate_input_stream_character(c);
2409                        self.append_raw_to_doctype_token(c);
2410                        self.append_to_doctype_token(Some(c), None, None);
2411                    }
2412                }
2413            }
2414            State::AfterDoctypeName => {
2415                let cur_pos = self.input.cur_pos();
2416
2417                // Consume the next input character:
2418                match self.consume_next_char() {
2419                    // U+0009 CHARACTER TABULATION (tab)
2420                    // U+000A LINE FEED (LF)
2421                    // U+000C FORM FEED (FF)
2422                    // U+0020 SPACE
2423                    // Ignore the character.
2424                    Some(c) if is_whitespace(c) => {
2425                        self.append_raw_to_doctype_token(c);
2426                    }
2427                    // U+003E GREATER-THAN SIGN (>)
2428                    // Switch to the data state. Emit the current DOCTYPE token.
2429                    Some(c @ '>') => {
2430                        self.append_raw_to_doctype_token(c);
2431                        self.state = State::Data;
2432                        self.emit_doctype_token();
2433                    }
2434                    // U+005B LEFT SQUARE BRACKET ([)
2435                    // Switch to the doctype internal subset state.
2436                    Some(c @ '[') => {
2437                        self.append_raw_to_doctype_token(c);
2438                        self.state = State::DoctypeTypeInternalSubSet;
2439                    }
2440                    // EOF
2441                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2442                    // character.
2443                    None => {
2444                        self.emit_error(ErrorKind::EofInDoctype);
2445                        self.state = State::Data;
2446                        self.emit_doctype_token();
2447                        self.reconsume();
2448                    }
2449                    // Anything else
2450                    // If the six characters starting from the current input character are an
2451                    // ASCII case-insensitive match for the word "PUBLIC", then consume those
2452                    // characters and switch to the after DOCTYPE public keyword state.
2453                    //
2454                    // Otherwise, if the six characters starting from the current input
2455                    // character are an ASCII case-insensitive match for the word "SYSTEM", then
2456                    // consume those characters and switch to the after DOCTYPE system keyword
2457                    // state.
2458                    //
2459                    // Otherwise, this is an invalid-character-sequence-after-doctype-name parse
2460                    // error. Set the current DOCTYPE token's force-quirks flag to on. Reconsume
2461                    // in the bogus DOCTYPE state.
2462                    Some(c) => {
2463                        let mut first_six_chars = String::with_capacity(6);
2464
2465                        first_six_chars.push(c);
2466
2467                        for _ in 0..5 {
2468                            match self.consume_next_char() {
2469                                Some(c) => {
2470                                    first_six_chars.push(c);
2471                                }
2472                                _ => {
2473                                    break;
2474                                }
2475                            }
2476                        }
2477
2478                        match &*first_six_chars.to_lowercase() {
2479                            "public" => {
2480                                self.state = State::AfterDoctypePublicKeyword;
2481
2482                                if let Some(doctype_raw) = &mut self.doctype_raw {
2483                                    doctype_raw.push_str(&first_six_chars);
2484                                }
2485                            }
2486                            "system" => {
2487                                self.state = State::AfterDoctypeSystemKeyword;
2488
2489                                if let Some(doctype_raw) = &mut self.doctype_raw {
2490                                    doctype_raw.push_str(&first_six_chars);
2491                                }
2492                            }
2493                            _ => {
2494                                self.cur_pos = cur_pos;
2495                                unsafe {
2496                                    // Safety: We got cur_pos from self.input.cur_pos()
2497                                    self.input.reset_to(cur_pos);
2498                                }
2499                                self.emit_error(
2500                                    ErrorKind::InvalidCharacterSequenceAfterDoctypeName,
2501                                );
2502                                self.reconsume_in_state(State::BogusDoctype);
2503                            }
2504                        }
2505                    }
2506                }
2507            }
2508            State::AfterDoctypePublicKeyword => {
2509                // Consume the next input character:
2510                match self.consume_next_char() {
2511                    // U+0009 CHARACTER TABULATION (Tab)
2512                    // U+000A LINE FEED (LF)
2513                    // U+000C FORM FEED (FF)
2514                    // U+0020 SPACE (Space)
2515                    // Switch to the before DOCTYPE public identifier state.
2516                    Some(c) if is_whitespace(c) => {
2517                        self.append_raw_to_doctype_token(c);
2518                        self.state = State::BeforeDoctypePublicIdentifier;
2519                    }
2520                    // U+0022 QUOTATION MARK (")
2521                    // This is a missing-whitespace-after-doctype-public-keyword parse error.
2522                    // Set the current DOCTYPE token's public identifier to the empty string
2523                    // (not missing), then switch to the DOCTYPE public identifier
2524                    // (double-quoted) state.
2525                    Some(c @ '"') => {
2526                        self.append_raw_to_doctype_token(c);
2527                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
2528                        self.set_doctype_token_public_id();
2529                        self.state = State::DoctypePublicIdentifierDoubleQuoted;
2530                    }
2531                    // U+0027 APOSTROPHE (')
2532                    // This is a missing-whitespace-after-doctype-public-keyword parse error.
2533                    // Set the current DOCTYPE token's public identifier to the empty string
2534                    // (not missing), then switch to the DOCTYPE public identifier
2535                    // (single-quoted) state.
2536                    Some(c @ '\'') => {
2537                        self.append_raw_to_doctype_token(c);
2538                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
2539                        self.set_doctype_token_public_id();
2540                        self.state = State::DoctypePublicIdentifierSingleQuoted;
2541                    }
2542                    // U+003E GREATER-THAN SIGN (>)
2543                    // This is a missing-doctype-public-identifier parse error. Set the current
2544                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
2545                    // the current DOCTYPE token.
2546                    Some(c @ '>') => {
2547                        self.append_raw_to_doctype_token(c);
2548                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
2549                        self.set_doctype_token_public_id();
2550                        self.state = State::DoctypePublicIdentifierSingleQuoted;
2551                    }
2552                    // EOF
2553                    // Parse error. Switch to the data state. Emit that DOCTYPE token. Reconsume the
2554                    // EOF character.
2555                    None => {
2556                        self.emit_error(ErrorKind::EofInDoctype);
2557                        self.state = State::Data;
2558                        self.emit_doctype_token();
2559                        self.reconsume()
2560                    }
2561                    // Anything else
2562                    // Parse error. Switch to the bogus DOCTYPE state. Emit that DOCTYPE token.
2563                    // Reconsume the EOF character.
2564                    _ => {
2565                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier);
2566                        self.reconsume_in_state(State::BogusDoctype);
2567                        self.emit_doctype_token();
2568                        self.reconsume()
2569                    }
2570                }
2571            }
2572            State::AfterDoctypeSystemKeyword => {
2573                // Consume the next input character:
2574                match self.consume_next_char() {
2575                    // U+0009 CHARACTER TABULATION (tab)
2576                    // U+000A LINE FEED (LF)
2577                    // U+000C FORM FEED (FF)
2578                    // U+0020 SPACE
2579                    // Switch to the before DOCTYPE system identifier state.
2580                    Some(c) if is_whitespace(c) => {
2581                        self.append_raw_to_doctype_token(c);
2582                        self.state = State::BeforeDoctypeSystemIdentifier;
2583                    }
2584                    // U+0022 QUOTATION MARK (")
2585                    // This is a missing-whitespace-after-doctype-system-keyword parse error.
2586                    // Set the current DOCTYPE token's system identifier to the empty string
2587                    // (not missing), then switch to the DOCTYPE system identifier
2588                    // (double-quoted) state.
2589                    Some(c @ '"') => {
2590                        self.append_raw_to_doctype_token(c);
2591                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
2592                        self.set_doctype_token_system_id();
2593                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2594                    }
2595                    // U+0027 APOSTROPHE (')
2596                    // This is a missing-whitespace-after-doctype-system-keyword parse error.
2597                    // Set the current DOCTYPE token's system identifier to the empty string
2598                    // (not missing), then switch to the DOCTYPE system identifier
2599                    // (single-quoted) state.
2600                    Some(c @ '\'') => {
2601                        self.append_raw_to_doctype_token(c);
2602                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
2603                        self.set_doctype_token_system_id();
2604                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
2605                    }
2606                    // U+003E GREATER-THAN SIGN(>)
2607                    // Parse error. Set the DOCTYPE token’s public identifier current DOCTYPE token
2608                    // to the empty string (not missing), then switch to the DOCTYPE system
2609                    // identifier (single-quoted) state.
2610                    Some(c @ '>') => {
2611                        self.append_raw_to_doctype_token(c);
2612                        self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
2613                        self.set_doctype_token_system_id();
2614                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
2615                    }
2616                    // EOF
2617                    // Parse error. Switch to the data state. Emit that DOCTYPE token. Reconsume the
2618                    // EOF character.
2619                    None => {
2620                        self.emit_error(ErrorKind::EofInDoctype);
2621                        self.state = State::Data;
2622                        self.emit_doctype_token();
2623                        self.reconsume()
2624                    }
2625                    // Anything else
2626                    // Parse error. Switch to the bogus DOCTYPE state.
2627                    Some(c) => {
2628                        self.validate_input_stream_character(c);
2629                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2630                        self.state = State::BogusComment
2631                    }
2632                }
2633            }
2634            State::BeforeDoctypeSystemIdentifier => {
2635                // Consume the next input character:
2636                match self.consume_next_char() {
2637                    // U+0009 CHARACTER TABULATION (tab)
2638                    // U+000A LINE FEED (LF)
2639                    // U+000C FORM FEED (FF)
2640                    // U+0020 SPACE
2641                    // Ignore the character.
2642                    Some(c) if is_whitespace(c) => {
2643                        self.append_raw_to_doctype_token(c);
2644                    }
2645                    // U+0022 QUOTATION MARK (")
2646                    // Set the current DOCTYPE token's system identifier to the empty string
2647                    // (not missing), then switch to the DOCTYPE system identifier
2648                    // (double-quoted) state.
2649                    Some(c @ '"') => {
2650                        self.append_raw_to_doctype_token(c);
2651                        self.set_doctype_token_system_id();
2652                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2653                    }
2654                    // U+0027 APOSTROPHE (')
2655                    // Set the current DOCTYPE token's system identifier to the empty string
2656                    // (not missing), then switch to the DOCTYPE system identifier
2657                    // (single-quoted) state.
2658                    Some(c @ '\'') => {
2659                        self.append_raw_to_doctype_token(c);
2660                        self.set_doctype_token_system_id();
2661                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
2662                    }
2663                    // U+003E GREATER-THAN SIGN (>)
2664                    // This is a missing-doctype-system-identifier parse error. Set the current
2665                    // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
2666                    // the current DOCTYPE token.
2667                    Some(c @ '>') => {
2668                        self.append_raw_to_doctype_token(c);
2669                        self.emit_error(ErrorKind::EofInDoctype);
2670                        self.state = State::Data;
2671                        self.emit_doctype_token();
2672                    }
2673                    // EOF
2674                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2675                    // character.
2676                    None => {
2677                        self.emit_error(ErrorKind::EofInDoctype);
2678                        self.state = State::Data;
2679                        self.emit_doctype_token();
2680                        self.reconsume();
2681                    }
2682                    // Anything else
2683                    // Parse error. Switch to the bogus DOCTYPE state.
2684                    Some(c) => {
2685                        self.validate_input_stream_character(c);
2686                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2687                        self.state = State::BogusDoctype;
2688                    }
2689                }
2690            }
2691            State::BeforeDoctypePublicIdentifier => {
2692                // Consume the next input character:
2693                match self.consume_next_char() {
2694                    // U+0009 CHARACTER TABULATION (tab)
2695                    // U+000A LINE FEED (LF)
2696                    // U+000C FORM FEED (FF)
2697                    // U+0020 SPACE
2698                    // Ignore the character.
2699                    Some(c) if is_whitespace(c) => {
2700                        self.append_raw_to_doctype_token(c);
2701                    }
2702                    // U+0022 QUOTATION MARK (")
2703                    // Set the current DOCTYPE token's public identifier to the empty string
2704                    // (not missing), then switch to the DOCTYPE public identifier
2705                    // (double-quoted) state.
2706                    Some(c @ '"') => {
2707                        self.append_raw_to_doctype_token(c);
2708                        self.set_doctype_token_public_id();
2709                        self.state = State::DoctypePublicIdentifierDoubleQuoted;
2710                    }
2711                    // U+0027 APOSTROPHE (')
2712                    // Set the current DOCTYPE token's public identifier to the empty string
2713                    // (not missing), then switch to the DOCTYPE public identifier
2714                    // (single-quoted) state.
2715                    Some(c @ '\'') => {
2716                        self.append_raw_to_doctype_token(c);
2717                        self.set_doctype_token_public_id();
2718                        self.state = State::DoctypePublicIdentifierSingleQuoted;
2719                    }
2720                    // U+003E GREATER-THAN SIGN(>)
2721                    // Parse error. Switch to data state. Emit current DOCTYPE token.
2722                    Some(c @ '>') => {
2723                        self.append_raw_to_doctype_token(c);
2724                        self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
2725                        self.state = State::Data;
2726                        self.emit_doctype_token();
2727                    }
2728                    // EOF
2729                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
2730                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
2731                    // end-of-file token.
2732                    None => {
2733                        self.emit_error(ErrorKind::EofInDoctype);
2734                        self.state = State::Data;
2735                        self.emit_doctype_token();
2736                        self.reconsume();
2737                    }
2738                    // Anything else
2739                    // Parse error. Switch to the bogus DOCTYPE state.
2740                    Some(c) => {
2741                        self.validate_input_stream_character(c);
2742                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2743                        self.state = State::BogusDoctype;
2744                    }
2745                }
2746            }
2747            State::DoctypePublicIdentifierSingleQuoted => {
2748                // Consume the next input character:
2749                match self.consume_next_char() {
2750                    // U+0027 APOSTROPHE (')
2751                    // Switch to the after DOCTYPE public identifier state.
2752                    Some(c @ '\'') => {
2753                        self.append_raw_to_doctype_token(c);
2754                        self.state = State::AfterDoctypePublicIdentifier;
2755                    }
2756                    // U+003E GREATER-THAN SIGN(>)
2757                    // Parse error. Switch to data state. Emit current DOCTYPE token.
2758                    Some(c @ '>') => {
2759                        self.append_raw_to_doctype_token(c);
2760                        self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
2761                        self.state = State::Data;
2762                        self.emit_doctype_token();
2763                    }
2764                    // EOF
2765                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2766                    // character.
2767                    None => {
2768                        self.emit_error(ErrorKind::EofInDoctype);
2769                        self.state = State::Data;
2770                        self.emit_doctype_token();
2771                        self.reconsume();
2772                    }
2773                    // Anything else
2774                    // Append the current input character to the current DOCTYPE token’s public
2775                    // identifier.
2776                    Some(c) => {
2777                        self.validate_input_stream_character(c);
2778                        self.append_raw_to_doctype_token(c);
2779                        self.append_to_doctype_token(None, Some(c), None);
2780                    }
2781                }
2782            }
2783            State::DoctypePublicIdentifierDoubleQuoted => {
2784                // Consume the next input character:
2785                match self.consume_next_char() {
2786                    // U+0022 QUOTATION MARK (")
2787                    // Switch to the after DOCTYPE public identifier state.
2788                    Some(c @ '"') => {
2789                        self.append_raw_to_doctype_token(c);
2790                        self.state = State::AfterDoctypePublicIdentifier;
2791                    }
2792                    // U+003E GREATER-THAN SIGN(>)
2793                    // Parse error. Switch to data state. Emit current DOCTYPE token.
2794                    Some(c @ '>') => {
2795                        self.append_raw_to_doctype_token(c);
2796                        self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
2797                        self.state = State::Data;
2798                        self.emit_doctype_token();
2799                    }
2800                    // EOF
2801                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2802                    // character.
2803                    None => {
2804                        self.emit_error(ErrorKind::EofInDoctype);
2805                        self.state = State::Data;
2806                        self.emit_doctype_token();
2807                        self.reconsume();
2808                    }
2809                    // Anything else
2810                    // Append the current input character to the current DOCTYPE token’s public
2811                    // identifier.
2812                    Some(c) => {
2813                        self.validate_input_stream_character(c);
2814                        self.append_raw_to_doctype_token(c);
2815                        self.append_to_doctype_token(None, Some(c), None);
2816                    }
2817                }
2818            }
2819            State::AfterDoctypePublicIdentifier => {
2820                // Consume the next input character:
2821                match self.consume_next_char() {
2822                    // U+0009 CHARACTER TABULATION (tab)
2823                    // U+000A LINE FEED (LF)
2824                    // U+000C FORM FEED (FF)
2825                    // U+0020 SPACE
2826                    // Switch to the between DOCTYPE public and system identifiers state.
2827                    Some(c) if is_whitespace(c) => {
2828                        self.append_raw_to_doctype_token(c);
2829                        self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
2830                    }
2831                    // U+0027 APOSTROPHE (')
2832                    // Parse error. Set the DOCTYPE token’s system identifier to the empty string
2833                    // (not missing) then switch to the DOCTYPE system identifier (single-quoted)
2834                    // state.
2835                    Some(c @ '\'') => {
2836                        self.append_raw_to_doctype_token(c);
2837                        self.emit_error(
2838                            ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
2839                        );
2840                        self.set_doctype_token_system_id();
2841                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
2842                    }
2843                    // U+0022 QUOTATION MARK (")
2844                    // Parse error. Set the DOCTYPE token’s system identifier to the empty string
2845                    // (not missing) then switch to the DOCTYPE system identifier (double-quoted)
2846                    // state.
2847                    Some(c @ '"') => {
2848                        self.append_raw_to_doctype_token(c);
2849                        self.emit_error(
2850                            ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
2851                        );
2852                        self.set_doctype_token_system_id();
2853                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2854                    }
2855                    // U+003E GREATER-THAN SIGN (>)
2856                    // Switch to the data state. Emit the current DOCTYPE token.
2857                    Some(c @ '>') => {
2858                        self.append_raw_to_doctype_token(c);
2859                        self.state = State::Data;
2860                        self.emit_doctype_token();
2861                    }
2862                    // EOF
2863                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2864                    // character.
2865                    None => {
2866                        self.emit_error(ErrorKind::EofInDoctype);
2867                        self.state = State::Data;
2868                        self.emit_doctype_token();
2869                        self.reconsume();
2870                    }
2871                    // Anything else
2872                    // Parse error. Switch to bogus DOCTYPE state.
2873                    Some(c) => {
2874                        self.validate_input_stream_character(c);
2875                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2876                        self.state = State::BogusComment;
2877                    }
2878                }
2879            }
2880            State::BetweenDoctypePublicAndSystemIdentifiers => {
2881                // Consume the next input character:
2882                match self.consume_next_char() {
2883                    // U+0009 CHARACTER TABULATION (tab)
2884                    // U+000A LINE FEED (LF)
2885                    // U+000C FORM FEED (FF)
2886                    // U+0020 SPACE
2887                    // Ignore the character.
2888                    Some(c) if is_whitespace(c) => {
2889                        self.append_raw_to_doctype_token(c);
2890                    }
2891                    // U+003E GREATER-THAN SIGN (>)
2892                    // Switch to the data state. Emit the current DOCTYPE token.
2893                    Some(c @ '>') => {
2894                        self.append_raw_to_doctype_token(c);
2895                        self.state = State::Data;
2896                        self.emit_doctype_token();
2897                    }
2898                    // U+0027 APOSTROPHE(')
2899                    // Set the DOCTYPE token’s system identifier to the empty string (not missing)
2900                    // then switch to the DOCTYPE system identifier (single-quoted) state.
2901                    Some(c @ '\'') => {
2902                        self.append_raw_to_doctype_token(c);
2903                        self.set_doctype_token_system_id();
2904                        self.state = State::DoctypeSystemIdentifierSingleQuoted;
2905                    }
2906                    // U+0022 QUOTATION MARK(")
2907                    // Set the DOCTYPE token’s system identifier to the empty string (not missing)
2908                    // then switch to the DOCTYPE system identifier (double-quoted) state.
2909                    Some(c @ '"') => {
2910                        self.append_raw_to_doctype_token(c);
2911                        self.set_doctype_token_system_id();
2912                        self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2913                    }
2914                    // EOF
2915                    // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
2916                    // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
2917                    // end-of-file token.
2918                    None => {
2919                        self.emit_error(ErrorKind::EofInDoctype);
2920                        self.state = State::Data;
2921                        self.emit_doctype_token();
2922                        self.reconsume();
2923                    }
2924                    // Anything else
2925                    // Parse error. Switch to Bogus DOCTYPE state.
2926                    Some(c) => {
2927                        self.validate_input_stream_character(c);
2928                        self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2929                        self.state = State::BogusDoctype;
2930                    }
2931                }
2932            }
2933            State::DoctypeSystemIdentifierSingleQuoted => {
2934                // Consume the next input character:
2935                match self.consume_next_char() {
2936                    // U+0027 APOSTROPHE (')
2937                    // Switch to the after DOCTYPE system identifier state.
2938                    Some(c @ '\'') => {
2939                        self.append_raw_to_doctype_token(c);
2940                        self.state = State::AfterDoctypeSystemIdentifier;
2941                    }
2942                    // U+003E GREATER-THAN SIGN (>)
2943                    // Parse error. Switch to data state. Emit current DOCTYPE token.
2944                    Some(c @ '>') => {
2945                        self.append_raw_to_doctype_token(c);
2946                        self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
2947                        self.state = State::Data;
2948                        self.emit_doctype_token();
2949                    }
2950                    // EOF
2951                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2952                    // character.
2953                    None => {
2954                        self.emit_error(ErrorKind::EofInDoctype);
2955                        self.state = State::Data;
2956                        self.emit_doctype_token();
2957                        self.reconsume();
2958                    }
2959                    // Anything else
2960                    // Append the current input character to the current DOCTYPE token's system
2961                    // identifier.
2962                    Some(c) => {
2963                        self.validate_input_stream_character(c);
2964                        self.append_raw_to_doctype_token(c);
2965                        self.append_to_doctype_token(None, None, Some(c));
2966                    }
2967                }
2968            }
2969            State::DoctypeSystemIdentifierDoubleQuoted => {
2970                // Consume the next input character:
2971                match self.consume_next_char() {
2972                    // U+0027 APOSTROPHE (')
2973                    // Switch to the after DOCTYPE system identifier state.
2974                    Some(c @ '"') => {
2975                        self.append_raw_to_doctype_token(c);
2976                        self.state = State::AfterDoctypeSystemIdentifier;
2977                    }
2978                    // U+003E GREATER-THAN SIGN (>)
2979                    // Parse error. Switch to data state. Emit current DOCTYPE token.
2980                    Some(c @ '>') => {
2981                        self.append_raw_to_doctype_token(c);
2982                        self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
2983                        self.state = State::Data;
2984                        self.emit_doctype_token();
2985                    }
2986                    // EOF
2987                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2988                    // character.
2989                    None => {
2990                        self.emit_error(ErrorKind::EofInDoctype);
2991                        self.state = State::Data;
2992                        self.emit_doctype_token();
2993                        self.reconsume();
2994                    }
2995                    // Anything else
2996                    // Append the current input character to the current DOCTYPE token's system
2997                    // identifier.
2998                    Some(c) => {
2999                        self.validate_input_stream_character(c);
3000                        self.append_raw_to_doctype_token(c);
3001                        self.append_to_doctype_token(None, None, Some(c));
3002                    }
3003                }
3004            }
3005            State::AfterDoctypeSystemIdentifier => {
3006                // Consume the next input character:
3007                match self.consume_next_char() {
3008                    // U+0009 CHARACTER TABULATION (tab)
3009                    // U+000A LINE FEED (LF)
3010                    // U+000C FORM FEED (FF)
3011                    // U+0020 SPACE
3012                    // Ignore the character.
3013                    Some(c) if is_whitespace(c) => {
3014                        self.append_raw_to_doctype_token(c);
3015                    }
3016                    // U+003E GREATER-THAN SIGN (>)
3017                    // Switch to the data state. Emit the current DOCTYPE token.
3018                    Some(c @ '>') => {
3019                        self.append_raw_to_doctype_token(c);
3020                        self.state = State::Data;
3021                        self.emit_doctype_token();
3022                    }
3023                    // U+005B LEFT SQUARE BRACKET ([)
3024                    // Switch to the doctype internal subset state.
3025                    Some(c @ '[') => {
3026                        self.append_raw_to_doctype_token(c);
3027                        self.state = State::DoctypeTypeInternalSubSet;
3028                    }
3029                    // EOF
3030                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
3031                    // character.
3032                    None => {
3033                        self.emit_error(ErrorKind::EofInDoctype);
3034                        self.state = State::Data;
3035                        self.emit_doctype_token();
3036                        self.reconsume();
3037                    }
3038                    // Anything else
3039                    // Parse error. Switch to Bogus DOCTYPE state.
3040                    Some(c) => {
3041                        self.validate_input_stream_character(c);
3042                        self.emit_error(ErrorKind::UnexpectedCharacterAfterDoctypeSystemIdentifier);
3043                        self.state = State::BogusDoctype;
3044                    }
3045                }
3046            }
3047            State::DoctypeTypeInternalSubSet => {
3048                // Consume the next input character:
3049                match self.consume_next_char() {
3050                    // U+005D RIGHT SQUARE BRACKET (])
3051                    // Switch to the CDATA bracket state.
3052                    Some(c @ ']') => {
3053                        self.append_raw_to_doctype_token(c);
3054                        self.state = State::AfterDoctypeName;
3055                    }
3056                    // EOF
3057                    // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
3058                    // character.
3059                    None => {
3060                        self.emit_error(ErrorKind::EofInDoctype);
3061                        self.state = State::Data;
3062                        self.emit_doctype_token();
3063                        self.reconsume();
3064                    }
3065                    // Anything else
3066                    // Append the current input character to the current DOCTYPE token's system
3067                    // identifier.
3068                    Some(c) => {
3069                        // TODO improve parse legacy declarations
3070                        self.validate_input_stream_character(c);
3071                        self.append_raw_to_doctype_token(c);
3072                    }
3073                }
3074            }
3075            State::BogusDoctype => {
3076                // Consume the next input character:
3077                match self.consume_next_char() {
3078                    // U+003E GREATER-THAN SIGN(>)
3079                    // Switch to data state. Emit DOCTYPE token.
3080                    Some(c @ '>') => {
3081                        self.append_raw_to_doctype_token(c);
3082                        self.state = State::Data;
3083                        self.emit_doctype_token();
3084                    }
3085                    // EOF
3086                    // Switch to the data state. Emit DOCTYPE token. Reconsume the EOF character.
3087                    None => {
3088                        self.state = State::Data;
3089                        self.emit_doctype_token();
3090                        self.reconsume();
3091                    }
3092                    // Anything else
3093                    // Ignore the character.
3094                    Some(c) => {
3095                        self.validate_input_stream_character(c);
3096                        self.append_raw_to_doctype_token(c);
3097                    }
3098                }
3099            }
3100        }
3101
3102        Ok(())
3103    }
3104
3105    #[inline(always)]
3106    fn skip_next_lf(&mut self, c: char) {
3107        if c == '\r' && self.input.cur() == Some('\n') {
3108            unsafe {
3109                // Safety: cur() is Some('\n')
3110                self.input.bump();
3111            }
3112        }
3113    }
3114}
3115
/// Returns `true` when `c` is XML white space.
///
/// Implements the character set of the `S` production:
///
/// ```text
/// S ::= (#x20 | #x9 | #xD | #xA)+
/// ```
///
/// Form feed (U+000C) is not part of that set.
#[inline(always)]
fn is_whitespace(c: char) -> bool {
    c == ' ' || c == '\t' || c == '\r' || c == '\n'
}
3123
/// Returns `true` when the code point `c` lies in `0x00..=0x1f` or
/// `0x7f..=0x9f` and is not one of the exempted values `0x09`, `0x0a`,
/// `0x0c`, `0x0d` or `0x20`.
#[inline(always)]
fn is_control(c: u32) -> bool {
    let in_control_range = c <= 0x1f || (0x7f..=0x9f).contains(&c);
    // 0x20 already falls outside both ranges; it is kept in the exemption
    // list to mirror the full set of exempted code points.
    let exempted = matches!(c, 0x09 | 0x0a | 0x0c | 0x0d | 0x20);

    in_control_range && !exempted
}
3128
/// Returns `true` when the code point `c` is in the surrogate range
/// `0xd800..=0xdfff`.
#[inline(always)]
fn is_surrogate(c: u32) -> bool {
    (0xd800..=0xdfff).contains(&c)
}
3133
/// Returns `true` when the code point `c` is a noncharacter.
///
/// A noncharacter is a code point in the range U+FDD0 to U+FDEF, inclusive,
/// or one of the last two code points of any plane from plane 0 to plane 16:
/// U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, ..., U+10FFFE, U+10FFFF.
#[inline(always)]
fn is_noncharacter(c: u32) -> bool {
    if (0xfdd0..=0xfdef).contains(&c) {
        return true;
    }

    // The low 16 bits are 0xFFFE or 0xFFFF exactly when masking away the
    // lowest bit leaves 0xFFFE. The upper bound keeps values past the last
    // plane (U+10FFFF) from matching.
    c <= 0x10ffff && (c & 0xfffe) == 0xfffe
}
3182
/// Returns `true` when `c` is an ASCII uppercase letter (`A` through `Z`).
#[inline(always)]
fn is_ascii_upper_alpha(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
3187
/// Returns `true` when `c` is an ASCII digit or an uppercase hexadecimal
/// letter (`A` through `F`).
#[inline(always)]
fn is_upper_hex_digit(c: char) -> bool {
    // An ASCII hex digit that is not lowercase is either `0-9` or `A-F`.
    c.is_ascii_hexdigit() && !c.is_ascii_lowercase()
}
3192
/// Returns `true` when `c` is an ASCII digit or a lowercase hexadecimal
/// letter (`a` through `f`).
#[inline(always)]
fn is_lower_hex_digit(c: char) -> bool {
    // An ASCII hex digit that is not uppercase is either `0-9` or `a-f`.
    c.is_ascii_hexdigit() && !c.is_ascii_uppercase()
}
3197
/// Returns `true` when `c` may begin an XML name.
///
/// Implements the `NameStartChar` production:
///
/// ```text
/// NameStartChar ::=
///     ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] |
///     [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] |
///     [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] |
///     [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
/// ```
///
/// U+00D7 (multiplication sign) and U+00F7 (division sign) sit between the
/// Latin-1 letter ranges and are deliberately excluded, so `#xD8-#xF6` and
/// `#xF8-#x2FF` are kept as two separate ranges.
#[inline(always)]
fn is_name_start_char(c: char) -> bool {
    matches!(
        c,
        ':' | 'A'..='Z'
            | '_'
            | 'a'..='z'
            | '\u{c0}'..='\u{d6}'
            | '\u{d8}'..='\u{f6}'
            | '\u{f8}'..='\u{2ff}'
            | '\u{370}'..='\u{37d}'
            | '\u{37f}'..='\u{1fff}'
            | '\u{200c}'..='\u{200d}'
            | '\u{2070}'..='\u{218f}'
            | '\u{2c00}'..='\u{2fef}'
            | '\u{3001}'..='\u{d7ff}'
            | '\u{f900}'..='\u{fdcf}'
            | '\u{fdf0}'..='\u{fffd}'
            | '\u{10000}'..='\u{effff}'
    )
}
3213
3214// NameChar	::=
3215// NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] |
3216// [#x203F-#x2040]
3217#[inline(always)]
3218fn is_name_char(c: char) -> bool {
3219    match c {
3220        '-' | '.' | '0'..='9' => true,
3221        _ if matches!(c as u32, 0xb7 | 0x0300..=0x036f | 0x203f..=0x2040) => true,
3222        _ if is_name_start_char(c) => true,
3223        _ => false,
3224    }
3225}
swc_xml_parser/lexer/mod.rs

swc_xml_parser/lexer/
mod.rs