swc_xml_parser/lexer/mod.rs
1use std::{collections::VecDeque, mem::take};
2
3use rustc_hash::FxHashSet;
4use swc_atoms::Atom;
5use swc_common::{input::Input, BytePos, Span};
6use swc_xml_ast::{AttributeToken, Token, TokenAndSpan};
7
8use crate::{
9 error::{Error, ErrorKind},
10 parser::input::ParserInput,
11};
12
/// States of the tokenizer state machine.
///
/// The lexer keeps its current state in `Lexer::state` and dispatches on it
/// each time `Lexer::run` is called.
#[derive(Debug, Clone)]
pub enum State {
    // Character data and character references inside it.
    Data,
    CharacterReferenceInData,
    // Processing instruction states.
    Pi,
    PiTarget,
    PiTargetQuestion,
    PiTargetAfter,
    PiData,
    PiEnd,
    // Entered after `<!`; decides between comment, doctype and CDATA.
    MarkupDeclaration,
    // Comment states.
    CommentStart,
    CommentStartDash,
    Comment,
    CommentLessThanSign,
    CommentLessThanSignBang,
    CommentLessThanSignBangDash,
    CommentLessThanSignBangDashDash,
    CommentEndDash,
    CommentEnd,
    CommentEndBang,
    // CDATA section states.
    Cdata,
    CdataBracket,
    CdataEnd,
    // Tag and attribute states.
    TagOpen,
    EndTagOpen,
    TagName,
    EmptyTag,
    TagAttributeNameBefore,
    TagAttributeName,
    TagAttributeNameAfter,
    TagAttributeValueBefore,
    TagAttributeValueDoubleQuoted,
    TagAttributeValueSingleQuoted,
    TagAttributeValueUnquoted,
    TagAttributeValueAfter,
    CharacterReferenceInAttributeValue,
    BogusComment,
    // Doctype states.
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypePublicKeyword,
    AfterDoctypeSystemKeyword,
    BeforeDoctypeSystemIdentifier,
    BeforeDoctypePublicIdentifier,
    DoctypePublicIdentifierSingleQuoted,
    DoctypePublicIdentifierDoubleQuoted,
    AfterDoctypePublicIdentifier,
    BetweenDoctypePublicAndSystemIdentifiers,
    DoctypeSystemIdentifierSingleQuoted,
    DoctypeSystemIdentifierDoubleQuoted,
    AfterDoctypeSystemIdentifier,
    DoctypeTypeInternalSubSet,
    BogusDoctype,
}
69
70// TODO implement `raw` for all tokens
71
/// Doctype token under construction; turned into `Token::Doctype` by
/// `emit_doctype_token`.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Doctype {
    // Doctype name; `None` when the token was created without a first character.
    name: Option<String>,
    // Public identifier; `None` until `set_doctype_token_public_id` is called.
    public_id: Option<String>,
    // System identifier; `None` until `set_doctype_token_system_id` is called.
    system_id: Option<String>,
}
78
/// Kind of tag being built; selects which `Token` variant `emit_tag_token`
/// produces (`StartTag`, `EndTag` or `EmptyTag`).
#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
enum TagKind {
    Start,
    End,
    Empty,
}
85
/// Tag token under construction.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Tag {
    kind: TagKind,
    tag_name: String,
    // Attributes in source order; the last entry is the one currently being
    // extended by the `append_to_attribute*` helpers.
    attributes: Vec<Attribute>,
}
92
/// Attribute under construction, kept both in processed and in raw form.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Attribute {
    // Source span; filled in by `update_attribute_span`.
    span: Span,
    name: String,
    raw_name: Option<String>,
    // `None` until a value character (or an opening quote) has been seen.
    value: Option<String>,
    raw_value: Option<String>,
}
101
/// Comment token under construction.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Comment {
    // Comment text.
    data: String,
    // Raw text, starting with the opening marker passed to
    // `create_comment_token` (for example `<!--`).
    raw: String,
}
107
/// Processing instruction token under construction.
#[derive(PartialEq, Eq, Clone, Debug)]
struct ProcessingInstruction {
    target: String,
    data: String,
}
113
/// CDATA section token under construction.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Cdata {
    data: String,
    raw: String,
}
119
/// Result type used by the lexer; the error side is a bare `ErrorKind`.
pub(crate) type LexResult<T> = Result<T, ErrorKind>;
121
/// Tokenizer that turns an `Input` character stream into `TokenAndSpan`s.
pub struct Lexer<'a, I>
where
    I: Input<'a>,
{
    // Character source being tokenized.
    input: I,
    // Character most recently taken by `consume` (`None` at end of input).
    cur: Option<char>,
    // Position of `cur`; `reconsume` rewinds the input to this position.
    cur_pos: BytePos,
    // End of the most recently emitted token, i.e. the start of the next span.
    last_token_pos: BytePos,
    // Set once the `Eof` token has been reached in `read_token_and_span`.
    finished: bool,
    // Current tokenizer state, dispatched on in `run`.
    state: State,
    // NOTE(review): presumably the state to resume after a character
    // reference; not used in the part of the file reviewed here, so confirm.
    return_state: Option<State>,
    // Parse errors collected so far; drained by `take_errors`.
    errors: Vec<Error>,
    // Character that, when it directly follows `&`, makes
    // `consume_character_reference` give up on the reference.
    additional_allowed_character: Option<char>,
    // Tokens queued by `emit_token` and not yet handed to the caller.
    pending_tokens: VecDeque<TokenAndSpan>,
    // Raw source text of the doctype currently being read.
    doctype_raw: Option<String>,
    // Tokens under construction, one slot per token kind.
    current_doctype_token: Option<Doctype>,
    current_comment_token: Option<Comment>,
    current_processing_instruction: Option<ProcessingInstruction>,
    current_tag_token: Option<Tag>,
    current_cdata_token: Option<Cdata>,
    // Start of the attribute being built; set in `start_new_attribute`.
    attribute_start_position: Option<BytePos>,
    // Lets the struct carry the `'a` lifetime required by `I: Input<'a>`.
    phantom: std::marker::PhantomData<&'a ()>,
}
145
146impl<'a, I> Lexer<'a, I>
147where
148 I: Input<'a>,
149{
150 pub fn new(input: I) -> Self {
151 let start_pos = input.last_pos();
152
153 let mut lexer = Lexer {
154 input,
155 cur: None,
156 cur_pos: start_pos,
157 last_token_pos: start_pos,
158 finished: false,
159 state: State::Data,
160 return_state: None,
161 errors: Vec::new(),
162 additional_allowed_character: None,
163 pending_tokens: VecDeque::new(),
164 doctype_raw: None,
165 current_doctype_token: None,
166 current_comment_token: None,
167 current_processing_instruction: None,
168 current_tag_token: None,
169 current_cdata_token: None,
170 attribute_start_position: None,
171 phantom: std::marker::PhantomData,
172 };
173
174 // A leading Byte Order Mark (BOM) causes the character encoding argument to be
175 // ignored and will itself be skipped.
176 if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') {
177 unsafe {
178 // Safety: cur() is Some('\u{feff}')
179 lexer.input.bump();
180 }
181 }
182
183 lexer
184 }
185}
186
187impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
188 type Item = TokenAndSpan;
189
190 fn next(&mut self) -> Option<Self::Item> {
191 let token_and_span = self.read_token_and_span();
192
193 match token_and_span {
194 Ok(token_and_span) => {
195 return Some(token_and_span);
196 }
197 Err(..) => {
198 return None;
199 }
200 }
201 }
202}
203
204impl<'a, I> ParserInput for Lexer<'a, I>
205where
206 I: Input<'a>,
207{
208 fn start_pos(&mut self) -> swc_common::BytePos {
209 self.input.cur_pos()
210 }
211
212 fn last_pos(&mut self) -> swc_common::BytePos {
213 self.input.last_pos()
214 }
215
216 fn take_errors(&mut self) -> Vec<Error> {
217 take(&mut self.errors)
218 }
219}
220
221impl<'a, I> Lexer<'a, I>
222where
223 I: Input<'a>,
224{
225 #[inline(always)]
226 fn next(&mut self) -> Option<char> {
227 self.input.cur()
228 }
229
230 // Any occurrences of surrogates are surrogate-in-input-stream parse errors. Any
231 // occurrences of noncharacters are noncharacter-in-input-stream parse errors
232 // and any occurrences of controls other than ASCII whitespace and U+0000 NULL
233 // characters are control-character-in-input-stream parse errors.
234 //
235 // Postpone validation for each character for perf reasons and do it in
236 // `anything else`
237 #[inline(always)]
238 fn validate_input_stream_character(&mut self, c: char) {
239 let code = c as u32;
240
241 if (0xd800..=0xdfff).contains(&code) {
242 self.emit_error(ErrorKind::SurrogateInInputStream);
243 } else if code != 0x00 && is_control(code) {
244 self.emit_error(ErrorKind::ControlCharacterInInputStream);
245 } else if is_noncharacter(code) {
246 self.emit_error(ErrorKind::NoncharacterInInputStream);
247 }
248 }
249
250 #[inline(always)]
251 fn consume(&mut self) {
252 self.cur = self.input.cur();
253 self.cur_pos = self.input.cur_pos();
254
255 if self.cur.is_some() {
256 unsafe {
257 // Safety: cur() is Some(c)
258 self.input.bump();
259 }
260 }
261 }
262
263 #[inline(always)]
264 fn reconsume(&mut self) {
265 unsafe {
266 // Safety: We got cur_pos from self.input
267 self.input.reset_to(self.cur_pos);
268 }
269 }
270
271 #[inline(always)]
272 fn reconsume_in_state(&mut self, state: State) {
273 self.state = state;
274 self.reconsume();
275 }
276
277 #[inline(always)]
278 fn consume_next_char(&mut self) -> Option<char> {
279 // The next input character is the first character in the input stream that has
280 // not yet been consumed or explicitly ignored by the requirements in this
281 // section. Initially, the next input character is the first character in the
282 // input. The current input character is the last character to have been
283 // consumed.
284 let c = self.next();
285
286 self.consume();
287
288 c
289 }
290
291 #[cold]
292 fn emit_error(&mut self, kind: ErrorKind) {
293 self.errors.push(Error::new(
294 Span::new(self.cur_pos, self.input.cur_pos()),
295 kind,
296 ));
297 }
298
299 #[inline(always)]
300 fn emit_token(&mut self, token: Token) {
301 let cur_pos = self.input.cur_pos();
302
303 let span = Span::new(self.last_token_pos, cur_pos);
304
305 self.last_token_pos = cur_pos;
306 self.pending_tokens.push_back(TokenAndSpan { span, token });
307 }
308
309 fn consume_character_reference(&mut self) -> Option<(char, String)> {
310 let cur_pos = self.input.cur_pos();
311 let anything_else = |lexer: &mut Lexer<'a, I>| {
312 lexer.emit_error(ErrorKind::InvalidEntityCharacter);
313 lexer.cur_pos = cur_pos;
314 unsafe {
315 // Safety: We got cur_post from self.input
316 lexer.input.reset_to(cur_pos);
317 }
318 };
319
320 // This section defines how to consume a character reference, optionally with an
321 // additional allowed character, which, if specified where the algorithm is
322 // invoked, adds a character to the list of characters that cause there to not
323 // be a character reference.
324 //
325 // This definition is used when parsing character in text and in attributes.
326 //
327 // The behavior depends on identity of next character (the one immediately after
328 // the U+0026 AMPERSAND character), as follows:
329 match self.consume_next_char() {
330 // The additional allowed character if there is one
331 // Not a character reference. No characters are consumed and nothing is returned (This
332 // is not an error, either).
333 Some(c) if self.additional_allowed_character == Some(c) => {
334 self.emit_error(ErrorKind::InvalidEntityCharacter);
335 self.cur_pos = cur_pos;
336 unsafe {
337 // Safety: We got cur_post from self.input
338 self.input.reset_to(cur_pos);
339 }
340 }
341 Some('l') => match self.consume_next_char() {
342 Some('t') => {
343 match self.consume_next_char() {
344 Some(';') => {}
345 _ => {
346 self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
347 }
348 }
349
350 return Some(('<', String::from("<")));
351 }
352 _ => {
353 anything_else(self);
354 }
355 },
356 Some('g') => match self.consume_next_char() {
357 Some('t') => {
358 match self.consume_next_char() {
359 Some(';') => {}
360 _ => {
361 self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
362 }
363 }
364
365 return Some(('>', String::from(">")));
366 }
367 _ => {
368 anything_else(self);
369 }
370 },
371 Some('q') => match self.consume_next_char() {
372 Some('u') => match self.consume_next_char() {
373 Some('o') => match self.consume_next_char() {
374 Some('t') => {
375 match self.consume_next_char() {
376 Some(';') => {}
377 _ => {
378 self.emit_error(
379 ErrorKind::MissingSemicolonAfterCharacterReference,
380 );
381 }
382 }
383
384 return Some(('"', String::from(""")));
385 }
386 _ => {
387 anything_else(self);
388 }
389 },
390 _ => {
391 anything_else(self);
392 }
393 },
394 _ => {
395 anything_else(self);
396 }
397 },
398 Some('a') => match self.consume_next_char() {
399 Some('p') => match self.consume_next_char() {
400 Some('o') => match self.consume_next_char() {
401 Some('s') => {
402 match self.consume_next_char() {
403 Some(';') => {}
404 _ => {
405 self.emit_error(
406 ErrorKind::MissingSemicolonAfterCharacterReference,
407 );
408 }
409 }
410
411 return Some(('\'', String::from("'")));
412 }
413 _ => {
414 anything_else(self);
415 }
416 },
417 _ => {
418 anything_else(self);
419 }
420 },
421 Some('m') => match self.consume_next_char() {
422 Some('p') => {
423 match self.consume_next_char() {
424 Some(';') => {}
425 _ => {
426 self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
427 }
428 }
429
430 return Some(('&', String::from("&")));
431 }
432 _ => {
433 anything_else(self);
434 }
435 },
436 _ => {
437 anything_else(self);
438 }
439 },
440 Some('#') => {
441 let mut base = 10;
442 let mut characters = Vec::new();
443 let mut has_semicolon = false;
444
445 match self.consume_next_char() {
446 Some('x' | 'X') => {
447 base = 16;
448
449 while let Some(c) = &self.consume_next_char() {
450 if !c.is_ascii_hexdigit() {
451 if *c == ';' {
452 has_semicolon = true;
453 }
454
455 break;
456 }
457
458 if c.is_ascii_digit() {
459 characters.push(*c as u32 - 0x30);
460 } else if is_upper_hex_digit(*c) {
461 characters.push(*c as u32 - 0x37);
462 } else if is_lower_hex_digit(*c) {
463 characters.push(*c as u32 - 0x57);
464 }
465 }
466 }
467 Some(c) if c.is_ascii_digit() => {
468 characters.push(c as u32 - 0x30);
469
470 while let Some(c) = &self.consume_next_char() {
471 if !c.is_ascii_digit() {
472 if *c == ';' {
473 has_semicolon = true;
474 }
475
476 break;
477 }
478
479 characters.push(*c as u32 - 0x30);
480 }
481 }
482 _ => {}
483 }
484
485 if characters.is_empty() {
486 // TODO
487 self.cur_pos = cur_pos;
488 unsafe {
489 // Safety: We got cur_post from self.input
490 self.input.reset_to(cur_pos);
491 }
492
493 return None;
494 }
495
496 if !has_semicolon {
497 self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
498 }
499
500 let cr = {
501 let mut i: u32 = 0;
502 let mut overflowed = false;
503
504 for value in characters {
505 if !overflowed {
506 if let Some(result) = i.checked_mul(base as u32) {
507 i = result;
508
509 if let Some(result) = i.checked_add(value) {
510 i = result;
511 } else {
512 i = 0x110000;
513
514 overflowed = true;
515 }
516 } else {
517 i = 0x110000;
518
519 overflowed = true;
520 }
521 }
522 }
523
524 i
525 };
526
527 if is_surrogate(cr) {
528 self.emit_error(ErrorKind::SurrogateCharacterReference);
529
530 return Some((char::REPLACEMENT_CHARACTER, String::from("empty")));
531 }
532
533 let c = match char::from_u32(cr) {
534 Some(c) => c,
535 _ => {
536 unreachable!();
537 }
538 };
539
540 return Some((c, String::from("empty")));
541 }
542 _ => {
543 anything_else(self);
544 }
545 }
546
547 None
548 }
549
550 fn create_doctype_token(&mut self, name_c: Option<char>) {
551 let mut new_name = None;
552
553 if let Some(name_c) = name_c {
554 let mut name = String::with_capacity(4);
555
556 name.push(name_c);
557 new_name = Some(name);
558 }
559
560 self.current_doctype_token = Some(Doctype {
561 name: new_name,
562 public_id: None,
563 system_id: None,
564 });
565 }
566
567 fn append_raw_to_doctype_token(&mut self, c: char) {
568 if let Some(doctype_raw) = &mut self.doctype_raw {
569 let is_cr = c == '\r';
570
571 if is_cr {
572 let mut raw = String::with_capacity(2);
573
574 raw.push(c);
575
576 if self.input.cur() == Some('\n') {
577 unsafe {
578 // Safety: cur() is Some('\n')
579 self.input.bump();
580 }
581
582 raw.push('\n');
583 }
584
585 doctype_raw.push_str(&raw);
586 } else {
587 doctype_raw.push(c);
588 }
589 }
590 }
591
592 fn append_to_doctype_token(
593 &mut self,
594 name: Option<char>,
595 public_id: Option<char>,
596 system_id: Option<char>,
597 ) {
598 if let Some(ref mut token) = self.current_doctype_token {
599 if let Some(name) = name {
600 if let Doctype {
601 name: Some(old_name),
602 ..
603 } = token
604 {
605 old_name.push(name);
606 }
607 }
608
609 if let Some(public_id) = public_id {
610 if let Doctype {
611 public_id: Some(old_public_id),
612 ..
613 } = token
614 {
615 old_public_id.push(public_id);
616 }
617 }
618
619 if let Some(system_id) = system_id {
620 if let Doctype {
621 system_id: Some(old_system_id),
622 ..
623 } = token
624 {
625 old_system_id.push(system_id);
626 }
627 }
628 }
629 }
630
631 fn set_doctype_token_public_id(&mut self) {
632 if let Some(Doctype { public_id, .. }) = &mut self.current_doctype_token {
633 // The Longest public id is `-//softquad software//dtd hotmetal pro
634 // 6.0::19990601::extensions to html 4.0//`
635 *public_id = Some(String::with_capacity(78));
636 }
637 }
638
639 fn set_doctype_token_system_id(&mut self) {
640 if let Some(Doctype { system_id, .. }) = &mut self.current_doctype_token {
641 // The Longest system id is `http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd`
642 *system_id = Some(String::with_capacity(58));
643 }
644 }
645
646 fn emit_doctype_token(&mut self) {
647 let current_doctype_token = self.current_doctype_token.take().unwrap();
648
649 let raw = match self.doctype_raw.take() {
650 Some(raw) => raw,
651 _ => {
652 unreachable!();
653 }
654 };
655
656 let token = Token::Doctype {
657 name: current_doctype_token.name.map(Atom::from),
658 public_id: current_doctype_token.public_id.map(Atom::from),
659 system_id: current_doctype_token.system_id.map(Atom::from),
660 raw: Some(Atom::from(raw)),
661 };
662
663 self.emit_token(token);
664 }
665
666 fn create_tag_token(&mut self, kind: TagKind) {
667 self.current_tag_token = Some(Tag {
668 kind,
669 // Maximum known html tags are `blockquote` and `figcaption`
670 tag_name: String::with_capacity(10),
671 attributes: Vec::with_capacity(255),
672 });
673 }
674
675 fn append_to_tag_token_name(&mut self, c: char) {
676 if let Some(Tag { tag_name, .. }) = &mut self.current_tag_token {
677 tag_name.push(c);
678 }
679 }
680
681 fn start_new_attribute(&mut self, c: Option<char>) {
682 if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
683 // The longest known HTML attribute is "allowpaymentrequest" for "iframe".
684 let mut name = String::with_capacity(19);
685 let mut raw_name = String::with_capacity(19);
686
687 if let Some(c) = c {
688 name.push(c);
689 raw_name.push(c);
690 };
691
692 attributes.push(Attribute {
693 span: Default::default(),
694 name,
695 raw_name: Some(raw_name),
696 value: None,
697 raw_value: None,
698 });
699
700 self.attribute_start_position = Some(self.cur_pos);
701 }
702 }
703
704 fn append_to_attribute(
705 &mut self,
706 name: Option<(char, char)>,
707 value: Option<(bool, Option<char>, Option<char>)>,
708 ) {
709 if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
710 if let Some(attribute) = attributes.last_mut() {
711 if let Some(name) = name {
712 attribute.name.push(name.0);
713
714 if let Some(raw_name) = &mut attribute.raw_name {
715 raw_name.push(name.1);
716 }
717 }
718
719 if let Some(value) = value {
720 if let Some(c) = value.1 {
721 if let Some(old_value) = &mut attribute.value {
722 old_value.push(c);
723 } else {
724 let mut new_value = String::with_capacity(255);
725
726 new_value.push(c);
727
728 attribute.value = Some(new_value);
729 }
730 }
731
732 if let Some(c) = value.2 {
733 // Quote for attribute was found, so we set empty value by default
734 if value.0 && attribute.value.is_none() {
735 attribute.value = Some(String::with_capacity(255));
736 }
737
738 if let Some(raw_value) = &mut attribute.raw_value {
739 raw_value.push(c);
740 } else {
741 let mut raw_new_value = String::with_capacity(255);
742
743 raw_new_value.push(c);
744
745 attribute.raw_value = Some(raw_new_value);
746 }
747 }
748 }
749 }
750 }
751 }
752
753 fn append_to_attribute_with_entity(&mut self, value: Option<(Option<char>, Option<&str>)>) {
754 if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
755 if let Some(attribute) = attributes.last_mut() {
756 if let Some(value) = value {
757 if let Some(c) = value.0 {
758 if let Some(old_value) = &mut attribute.value {
759 old_value.push(c);
760 } else {
761 let mut new_value = String::with_capacity(255);
762
763 new_value.push(c);
764
765 attribute.value = Some(new_value);
766 }
767 }
768
769 if let Some(c) = value.1 {
770 if let Some(raw_value) = &mut attribute.raw_value {
771 raw_value.push_str(c);
772 } else {
773 let mut raw_new_value = String::with_capacity(255);
774
775 raw_new_value.push_str(c);
776
777 attribute.raw_value = Some(raw_new_value);
778 }
779 }
780 }
781 }
782 }
783 }
784
785 fn update_attribute_span(&mut self) {
786 if let Some(attribute_start_position) = self.attribute_start_position {
787 if let Some(Tag {
788 ref mut attributes, ..
789 }) = self.current_tag_token
790 {
791 if let Some(last) = attributes.last_mut() {
792 last.span = Span::new(attribute_start_position, self.cur_pos);
793 }
794 }
795 }
796 }
797
798 fn set_tag_to_empty_tag(&mut self) {
799 if let Some(Tag { kind, .. }) = &mut self.current_tag_token {
800 *kind = TagKind::Empty;
801 }
802 }
803
804 fn emit_tag_token(&mut self, kind: Option<TagKind>) {
805 if let Some(mut current_tag_token) = self.current_tag_token.take() {
806 if let Some(kind) = kind {
807 current_tag_token.kind = kind;
808 }
809
810 let mut already_seen: FxHashSet<Atom> = Default::default();
811
812 let attributes = current_tag_token
813 .attributes
814 .drain(..)
815 .map(|attribute| {
816 let name = Atom::from(attribute.name);
817
818 if already_seen.contains(&name) {
819 self.errors
820 .push(Error::new(attribute.span, ErrorKind::DuplicateAttribute));
821 }
822
823 already_seen.insert(name.clone());
824
825 AttributeToken {
826 span: attribute.span,
827 name,
828 raw_name: attribute.raw_name.map(Atom::from),
829 value: attribute.value.map(Atom::from),
830 raw_value: attribute.raw_value.map(Atom::from),
831 }
832 })
833 .collect();
834
835 match current_tag_token.kind {
836 TagKind::Start => {
837 let start_tag_token = Token::StartTag {
838 tag_name: current_tag_token.tag_name.into(),
839 attributes,
840 };
841
842 self.emit_token(start_tag_token);
843 }
844 TagKind::End => {
845 if !current_tag_token.attributes.is_empty() {
846 self.emit_error(ErrorKind::EndTagWithAttributes);
847 }
848
849 let end_tag_token = Token::EndTag {
850 tag_name: current_tag_token.tag_name.into(),
851 attributes,
852 };
853
854 self.emit_token(end_tag_token);
855 }
856 TagKind::Empty => {
857 let empty_tag = Token::EmptyTag {
858 tag_name: current_tag_token.tag_name.into(),
859 attributes,
860 };
861
862 self.emit_token(empty_tag);
863 }
864 }
865 }
866 }
867
868 fn create_comment_token(&mut self, new_data: Option<String>, raw_start: &str) {
869 let mut data = String::with_capacity(32);
870 let mut raw = String::with_capacity(38);
871
872 raw.push_str(raw_start);
873
874 if let Some(new_data) = new_data {
875 data.push_str(&new_data);
876 raw.push_str(&new_data);
877 };
878
879 self.current_comment_token = Some(Comment { data, raw });
880 }
881
882 fn append_to_comment_token(&mut self, c: char, raw_c: char) {
883 if let Some(Comment { data, raw }) = &mut self.current_comment_token {
884 data.push(c);
885 raw.push(raw_c);
886 }
887 }
888
889 fn handle_raw_and_append_to_comment_token(&mut self, c: char) {
890 if let Some(Comment { data, raw }) = &mut self.current_comment_token {
891 let is_cr = c == '\r';
892
893 if is_cr {
894 let mut raw_c = String::with_capacity(2);
895
896 raw_c.push(c);
897
898 if self.input.cur() == Some('\n') {
899 unsafe {
900 // Safety: cur() is Some('\n')
901 self.input.bump();
902 }
903
904 raw_c.push('\n');
905 }
906
907 data.push('\n');
908 raw.push_str(&raw_c);
909 } else {
910 data.push(c);
911 raw.push(c);
912 }
913 }
914 }
915
916 fn emit_comment_token(&mut self, raw_end: Option<&str>) {
917 let mut comment = self.current_comment_token.take().unwrap();
918
919 if let Some(raw_end) = raw_end {
920 comment.raw.push_str(raw_end);
921 }
922
923 self.emit_token(Token::Comment {
924 data: comment.data.into(),
925 raw: comment.raw.into(),
926 });
927 }
928
929 fn create_cdata_token(&mut self) {
930 let data = String::new();
931 let raw = String::with_capacity(12);
932
933 self.current_cdata_token = Some(Cdata { data, raw });
934 }
935
936 fn append_to_cdata_token(&mut self, c: Option<char>, raw_c: Option<char>) {
937 if let Some(Cdata { data, raw }) = &mut self.current_cdata_token {
938 if let Some(c) = c {
939 data.push(c);
940 }
941
942 if let Some(raw_c) = raw_c {
943 raw.push(raw_c);
944 }
945 }
946 }
947
948 fn emit_cdata_token(&mut self) {
949 let cdata = self.current_cdata_token.take().unwrap();
950
951 self.emit_token(Token::Cdata {
952 data: cdata.data.into(),
953 raw: cdata.raw.into(),
954 });
955 }
956
957 fn handle_raw_and_emit_character_token(&mut self, c: char) {
958 let is_cr = c == '\r';
959
960 if is_cr {
961 let mut raw = String::with_capacity(2);
962
963 raw.push(c);
964
965 if self.input.cur() == Some('\n') {
966 unsafe {
967 // Safety: cur() is Some('\n')
968 self.input.bump();
969 }
970
971 raw.push('\n');
972 }
973
974 self.emit_token(Token::Character {
975 value: '\n',
976 raw: Some(raw.into()),
977 });
978 } else {
979 self.emit_token(Token::Character {
980 value: c,
981 raw: Some(String::from(c).into()),
982 });
983 }
984 }
985
986 fn create_processing_instruction_token(&mut self) {
987 self.current_processing_instruction = Some(ProcessingInstruction {
988 target: String::with_capacity(3),
989 data: String::with_capacity(255),
990 });
991 }
992
993 fn set_processing_instruction_token(&mut self, target_c: Option<char>, data_c: Option<char>) {
994 if let Some(ProcessingInstruction { target, data, .. }) =
995 &mut self.current_processing_instruction
996 {
997 if let Some(target_c) = target_c {
998 target.push(target_c);
999 }
1000
1001 if let Some(data_c) = data_c {
1002 data.push(data_c);
1003 }
1004 }
1005 }
1006
1007 fn emit_current_processing_instruction(&mut self) {
1008 let processing_instruction = self.current_processing_instruction.take().unwrap();
1009
1010 let token = Token::ProcessingInstruction {
1011 target: processing_instruction.target.into(),
1012 data: processing_instruction.data.into(),
1013 };
1014
1015 self.emit_token(token);
1016 }
1017
1018 #[inline(always)]
1019 fn emit_character_token(&mut self, value: (char, char)) {
1020 self.emit_token(Token::Character {
1021 value: value.0,
1022 raw: Some(String::from(value.1).into()),
1023 });
1024 }
1025
1026 #[inline(always)]
1027 fn emit_character_token_with_entity(&mut self, c: char, raw: &str) {
1028 self.emit_token(Token::Character {
1029 value: c,
1030 raw: Some(raw.into()),
1031 });
1032 }
1033
1034 fn read_token_and_span(&mut self) -> LexResult<TokenAndSpan> {
1035 if self.finished {
1036 return Err(ErrorKind::Eof);
1037 } else {
1038 while self.pending_tokens.is_empty() {
1039 self.run()?;
1040 }
1041 }
1042
1043 let token_and_span = self.pending_tokens.pop_front().unwrap();
1044
1045 match token_and_span.token {
1046 Token::Eof => {
1047 self.finished = true;
1048
1049 return Err(ErrorKind::Eof);
1050 }
1051 _ => {
1052 return Ok(token_and_span);
1053 }
1054 }
1055 }
1056
1057 fn run(&mut self) -> LexResult<()> {
1058 match self.state {
1059 State::Data => {
1060 // Consume the next input character:
1061 match self.consume_next_char() {
1062 // U+0026 AMPERSAND (&)
1063 // Switch to character reference in data state.
1064 Some('&') => {
1065 self.state = State::CharacterReferenceInData;
1066 }
1067 // U+003C LESSER-THAN SIGN (<)
1068 // Switch to the tag open state.
1069 Some('<') => {
1070 self.state = State::TagOpen;
1071 }
1072 // EOF
1073 // Emit an end-of-file token.
1074 None => {
1075 self.emit_token(Token::Eof);
1076
1077 return Ok(());
1078 }
1079 // Anything else
1080 // Emit the current input character as character. Stay in this state.
1081 Some(c) => {
1082 self.validate_input_stream_character(c);
1083 self.handle_raw_and_emit_character_token(c);
1084 }
1085 }
1086 }
1087 State::CharacterReferenceInData => {
1088 // Switch to the data state.
1089 // Attempt to consume a character reference.
1090 //
1091 // If nothing is returned emit a U+0026 AMPERSAND character (&) token.
1092 //
1093 // Otherwise, emit character tokens that were returned.
1094 self.state = State::Data;
1095
1096 let character_reference = self.consume_character_reference();
1097
1098 if let Some((c, raw)) = character_reference {
1099 self.emit_character_token_with_entity(c, &raw);
1100 } else {
1101 self.emit_character_token(('&', '&'));
1102 }
1103 }
1104 State::Pi => {
1105 // Consume the next input character:
1106 match self.consume_next_char() {
1107 // U+0009 CHARACTER TABULATION (tab)
1108 // U+000A LINE FEED (LF)
1109 // U+0020 SPACE
1110 // EOF
1111 // Parse error.
1112 // Switch to the pi target after state.
1113 Some(c) if is_whitespace(c) => {
1114 self.emit_error(ErrorKind::InvalidCharacterOfProcessingInstruction);
1115 self.create_processing_instruction_token();
1116 self.state = State::PiTargetAfter;
1117 }
1118 None => {
1119 self.emit_error(ErrorKind::EofInProcessingInstruction);
1120 self.create_processing_instruction_token();
1121 self.emit_current_processing_instruction();
1122 self.reconsume_in_state(State::Data);
1123 }
1124 // U+003F QUESTION MARK(?)
1125 // Emit error
1126 // Reprocess the current input character in the pi end state (recovery mode).
1127 Some('?') => {
1128 self.emit_error(ErrorKind::NoTargetNameInProcessingInstruction);
1129 self.create_processing_instruction_token();
1130 self.state = State::PiEnd;
1131 }
1132 Some(c) => {
1133 self.validate_input_stream_character(c);
1134 self.create_processing_instruction_token();
1135 self.set_processing_instruction_token(Some(c), None);
1136 self.state = State::PiTarget;
1137 }
1138 }
1139 }
1140 State::PiTarget => {
1141 // Consume the next input character:
1142 match self.consume_next_char() {
1143 // U+0009 CHARACTER TABULATION (tab)
1144 // U+000A LINE FEED (LF)
1145 // U+0020 SPACE
1146 // Switch to the pi target state.
1147 Some(c) if is_whitespace(c) => {
1148 self.state = State::PiTargetAfter;
1149 }
1150 // EOF
1151 // Parse error. Emit the current processing instruction token and then reprocess
1152 // the current input character in the data state.
1153 None => {
1154 self.emit_error(ErrorKind::EofInProcessingInstruction);
1155 self.emit_current_processing_instruction();
1156 self.reconsume_in_state(State::Data);
1157 }
1158 // U+003F QUESTION MARK(?)
1159 // Switch to the pi target question.
1160 Some('?') => {
1161 self.state = State::PiTargetQuestion;
1162 }
1163 // Anything else
1164 // Append the current input character to the processing instruction target and
1165 // stay in the current state.
1166 Some(c) => {
1167 self.validate_input_stream_character(c);
1168 self.set_processing_instruction_token(Some(c), None);
1169 }
1170 }
1171 }
1172 State::PiTargetQuestion => {
1173 // Consume the next input character:
1174 match self.consume_next_char() {
1175 // U+003E GREATER-THAN SIGN (>)
1176 Some('>') => {
1177 self.reconsume_in_state(State::PiEnd);
1178 }
1179 _ => {
1180 self.errors.push(Error::new(
1181 Span::new(self.cur_pos - BytePos(1), self.input.cur_pos() - BytePos(1)),
1182 ErrorKind::MissingWhitespaceBeforeQuestionInProcessingInstruction,
1183 ));
1184 self.set_processing_instruction_token(None, Some('?'));
1185 self.reconsume_in_state(State::PiData);
1186 }
1187 }
1188 }
1189 State::PiTargetAfter => {
1190 // Consume the next input character:
1191 match self.consume_next_char() {
1192 // U+0009 CHARACTER TABULATION (Tab)
1193 // U+000A LINE FEED (LF)
1194 // U+0020 SPACE (Space)
1195 // Stay in the current state.
1196 Some(c) if is_whitespace(c) => {
1197 self.skip_next_lf(c);
1198 }
1199 // Anything else
1200 // Reprocess the current input character in the pi data state.
1201 _ => {
1202 self.reconsume_in_state(State::PiData);
1203 }
1204 }
1205 }
1206 State::PiData => {
1207 // Consume the next input character:
1208 match self.consume_next_char() {
1209 // U+003F QUESTION MARK(?)
1210 // Switch to the pi after state.
1211 Some('?') => {
1212 self.state = State::PiEnd;
1213 }
1214 // EOF
1215 // Parse error. Emit the current processing instruction token and then reprocess
1216 // the current input character in the data state.
1217 None => {
1218 self.emit_error(ErrorKind::EofInProcessingInstruction);
1219 self.emit_current_processing_instruction();
1220 self.reconsume_in_state(State::Data);
1221 }
1222 // Anything else
1223 // Append the current input character to the pi’s data and stay in the current
1224 // state.
1225 Some(c) => {
1226 self.validate_input_stream_character(c);
1227 self.set_processing_instruction_token(None, Some(c));
1228 }
1229 }
1230 }
1231 State::PiEnd => {
1232 // Consume the next input character:
1233 match self.consume_next_char() {
1234 // U+003E GREATER-THAN SIGN (>)
1235 // Emit the current token and then switch to the data state.
1236 Some('>') => {
1237 self.emit_current_processing_instruction();
1238 self.state = State::Data;
1239 }
1240 // EOF
1241 // Parse error. Emit the current processing instruction token and then reprocess
1242 // the current input character in the data state.
1243 None => {
1244 self.emit_error(ErrorKind::EofInProcessingInstruction);
1245 self.emit_current_processing_instruction();
1246 self.reconsume_in_state(State::Data);
1247 }
1248 // Anything else
1249 // Reprocess the current input character in the pi data state.
1250 _ => {
1251 self.set_processing_instruction_token(None, Some('?'));
1252 self.reconsume_in_state(State::PiData);
1253 }
1254 }
1255 }
1256 State::MarkupDeclaration => {
1257 let cur_pos = self.input.cur_pos();
1258 let anything_else = |lexer: &mut Lexer<'a, I>| {
1259 lexer.emit_error(ErrorKind::IncorrectlyOpenedComment);
1260 lexer.create_comment_token(None, "<!");
1261 lexer.state = State::BogusComment;
1262 lexer.cur_pos = cur_pos;
1263 // We don't validate input here because we reset position
1264 unsafe {
1265 // Safety: cur_pos is in the range of input
1266 lexer.input.reset_to(cur_pos);
1267 }
1268 };
1269
1270 // If the next few characters are:
1271 match self.consume_next_char() {
1272 // Two U+002D HYPEN-MINUS characters (-)
1273 // Consume those two characters, create a comment token whose data is the empty
1274 // string and switch to comment start state.
1275 Some('-') => match self.consume_next_char() {
1276 Some('-') => {
1277 self.create_comment_token(None, "<!--");
1278 self.state = State::CommentStart;
1279 }
1280 _ => {
1281 anything_else(self);
1282 }
1283 },
1284 // ASCII case-insensitive match for word "DOCTYPE"
1285 // Consume those characters and switch to Doctype state
1286 Some(d @ 'd' | d @ 'D') => match self.consume_next_char() {
1287 Some(o @ 'o' | o @ 'O') => match self.consume_next_char() {
1288 Some(c @ 'c' | c @ 'C') => match self.consume_next_char() {
1289 Some(t @ 't' | t @ 'T') => match self.consume_next_char() {
1290 Some(y @ 'y' | y @ 'Y') => match self.consume_next_char() {
1291 Some(p @ 'p' | p @ 'P') => match self.consume_next_char() {
1292 Some(e @ 'e' | e @ 'E') => {
1293 self.state = State::Doctype;
1294
1295 let mut raw_keyword = String::with_capacity(9);
1296
1297 raw_keyword.push('<');
1298 raw_keyword.push('!');
1299 raw_keyword.push(d);
1300 raw_keyword.push(o);
1301 raw_keyword.push(c);
1302 raw_keyword.push(t);
1303 raw_keyword.push(y);
1304 raw_keyword.push(p);
1305 raw_keyword.push(e);
1306
1307 self.doctype_raw = Some(raw_keyword);
1308 }
1309 _ => {
1310 anything_else(self);
1311 }
1312 },
1313 _ => {
1314 anything_else(self);
1315 }
1316 },
1317 _ => {
1318 anything_else(self);
1319 }
1320 },
1321 _ => {
1322 anything_else(self);
1323 }
1324 },
1325 _ => {
1326 anything_else(self);
1327 }
1328 },
1329 _ => {
1330 anything_else(self);
1331 }
1332 },
1333 // Exact match for word "[CDATA[" with a (the five uppercase letters "CDATA"
1334 // with a U+005B LEFT SQUARE BRACKET character before and after)
1335 // Consume those characters and switch to CDATA state
1336 Some('[') => match self.consume_next_char() {
1337 Some(c @ 'C') => match self.consume_next_char() {
1338 Some(d @ 'D') => match self.consume_next_char() {
1339 Some(a1 @ 'A') => match self.consume_next_char() {
1340 Some(t @ 'T') => match self.consume_next_char() {
1341 Some(a2 @ 'A') => match self.consume_next_char() {
1342 Some('[') => {
1343 self.create_cdata_token();
1344 self.append_to_cdata_token(None, Some('<'));
1345 self.append_to_cdata_token(None, Some('!'));
1346 self.append_to_cdata_token(None, Some('['));
1347 self.append_to_cdata_token(None, Some(c));
1348 self.append_to_cdata_token(None, Some(d));
1349 self.append_to_cdata_token(None, Some(a1));
1350 self.append_to_cdata_token(None, Some(t));
1351 self.append_to_cdata_token(None, Some(a2));
1352 self.append_to_cdata_token(None, Some('['));
1353 self.state = State::Cdata;
1354 }
1355 _ => {
1356 anything_else(self);
1357 }
1358 },
1359 _ => {
1360 anything_else(self);
1361 }
1362 },
1363 _ => {
1364 anything_else(self);
1365 }
1366 },
1367 _ => {
1368 anything_else(self);
1369 }
1370 },
1371 _ => {
1372 anything_else(self);
1373 }
1374 },
1375 _ => {
1376 anything_else(self);
1377 }
1378 },
1379 // Anything else
1380 // Emit an error. Create a comment token whose data is an empty string. Switch
1381 // to bogus comment state (don’t consume any characters)
1382 _ => {
1383 anything_else(self);
1384 }
1385 }
1386 }
1387 State::CommentStart => {
1388 // Consume the next input character:
1389 match self.consume_next_char() {
1390 // U+002D HYPHEN-MINUS (-)
1391 // Switch to the comment start dash state.
1392 Some('-') => {
1393 self.state = State::CommentStartDash;
1394 }
1395 // U+003E GREATER-THAN SIGN (>)
1396 // This is an abrupt-closing-of-empty-comment parse error. Switch to the
1397 // data state. Emit the current comment token.
1398 Some('>') => {
1399 self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
1400 self.state = State::Data;
1401 self.emit_comment_token(Some(">"));
1402 }
1403 // Anything else
1404 // Reconsume in the comment state.
1405 _ => {
1406 self.reconsume_in_state(State::Comment);
1407 }
1408 }
1409 }
1410 State::CommentStartDash => {
1411 // Consume the next input character:
1412 match self.consume_next_char() {
1413 // U+002D HYPHEN-MINUS (-)
1414 // Switch to the comment end state.
1415 Some('-') => {
1416 self.state = State::CommentEnd;
1417 }
1418 // U+003E GREATER-THAN SIGN (>)
1419 // This is an abrupt-closing-of-empty-comment parse error. Switch to the
1420 // data state. Emit the current comment token.
1421 Some('>') => {
1422 self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
1423 self.state = State::Data;
1424 self.emit_comment_token(Some("->"));
1425 }
1426 // EOF
1427 // This is an eof-in-comment parse error. Emit the current comment token.
1428 // Emit an end-of-file token.
1429 None => {
1430 self.emit_error(ErrorKind::EofInComment);
1431 self.emit_comment_token(None);
1432 self.emit_token(Token::Eof);
1433
1434 return Ok(());
1435 }
1436 // Anything else
1437 // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
1438 // Reconsume in the comment state.
1439 _ => {
1440 self.append_to_comment_token('-', '-');
1441 self.reconsume_in_state(State::Comment);
1442 }
1443 }
1444 }
1445 State::Comment => {
1446 // Consume the next input character:
1447 match self.consume_next_char() {
1448 // U+003C LESS-THAN SIGN (<)
1449 // Append the current input character to the comment token's data. Switch to
1450 // the comment less-than sign state.
1451 Some(c @ '<') => {
1452 self.append_to_comment_token(c, c);
1453 self.state = State::CommentLessThanSign;
1454 }
1455 // U+002D HYPHEN-MINUS (-)
1456 // Switch to the comment end dash state.
1457 Some('-') => {
1458 self.state = State::CommentEndDash;
1459 }
1460 // EOF
1461 // This is an eof-in-comment parse error. Emit the current comment token.
1462 // Emit an end-of-file token.
1463 None => {
1464 self.emit_error(ErrorKind::EofInComment);
1465 self.emit_comment_token(None);
1466 self.emit_token(Token::Eof);
1467
1468 return Ok(());
1469 }
1470 // Anything else
1471 // Append the current input character to the comment token's data.
1472 Some(c) => {
1473 self.validate_input_stream_character(c);
1474 self.handle_raw_and_append_to_comment_token(c);
1475 }
1476 }
1477 }
1478 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
1479 State::CommentLessThanSign => {
1480 // Consume the next input character:
1481 match self.consume_next_char() {
1482 // U+0021 EXCLAMATION MARK (!)
1483 // Append the current input character to the comment token's data. Switch to
1484 // the comment less-than sign bang state.
1485 Some(c @ '!') => {
1486 self.append_to_comment_token(c, c);
1487 self.state = State::CommentLessThanSignBang;
1488 }
1489 // U+003C LESS-THAN SIGN (<)
1490 // Append the current input character to the comment token's data.
1491 Some(c @ '<') => {
1492 self.append_to_comment_token(c, c);
1493 }
1494 // Anything else
1495 // Reconsume in the comment state.
1496 _ => {
1497 self.reconsume_in_state(State::Comment);
1498 }
1499 }
1500 }
1501 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
1502 State::CommentLessThanSignBang => {
1503 // Consume the next input character:
1504 match self.consume_next_char() {
1505 // U+002D HYPHEN-MINUS (-)
1506 // Switch to the comment less-than sign bang dash state.
1507 Some('-') => {
1508 self.state = State::CommentLessThanSignBangDash;
1509 }
1510 // Anything else
1511 // Reconsume in the comment state.
1512 _ => {
1513 self.reconsume_in_state(State::Comment);
1514 }
1515 }
1516 }
1517 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
1518 State::CommentLessThanSignBangDash => {
1519 // Consume the next input character:
1520 match self.consume_next_char() {
1521 // U+002D HYPHEN-MINUS (-)
1522 // Switch to the comment less-than sign bang dash dash state.
1523 Some('-') => {
1524 self.state = State::CommentLessThanSignBangDashDash;
1525 }
1526 // Anything else
1527 // Reconsume in the comment end dash state.
1528 _ => {
1529 self.reconsume_in_state(State::CommentEndDash);
1530 }
1531 }
1532 }
1533 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
1534 State::CommentLessThanSignBangDashDash => {
1535 // Consume the next input character:
1536 match self.consume_next_char() {
1537 // U+003E GREATER-THAN SIGN (>)
1538 // EOF
1539 // Reconsume in the comment end state.
1540 Some('>') | None => {
1541 self.reconsume_in_state(State::CommentEnd);
1542 }
1543 // Anything else
1544 // This is a nested-comment parse error. Reconsume in the comment end state.
1545 _ => {
1546 self.emit_error(ErrorKind::NestedComment);
1547 self.reconsume_in_state(State::CommentEnd);
1548 }
1549 }
1550 }
1551 // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
1552 State::CommentEndDash => {
1553 // Consume the next input character:
1554 match self.consume_next_char() {
1555 // U+002D HYPHEN-MINUS (-)
1556 // Switch to the comment end state.
1557 Some('-') => {
1558 self.state = State::CommentEnd;
1559 }
1560 // EOF
1561 // This is an eof-in-comment parse error. Emit the current comment token.
1562 // Emit an end-of-file token.
1563 None => {
1564 self.emit_error(ErrorKind::EofInComment);
1565 self.emit_comment_token(None);
1566 self.emit_token(Token::Eof);
1567
1568 return Ok(());
1569 }
1570 // Anything else
1571 // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
1572 // Reconsume in the comment state.
1573 _ => {
1574 self.append_to_comment_token('-', '-');
1575 self.reconsume_in_state(State::Comment);
1576 }
1577 }
1578 }
1579 // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
1580 State::CommentEnd => {
1581 // Consume the next input character:
1582 match self.consume_next_char() {
1583 // U+003E GREATER-THAN SIGN (>)
1584 // Switch to the data state. Emit the current comment token.
1585 Some('>') => {
1586 self.state = State::Data;
1587 self.emit_comment_token(Some("-->"));
1588 }
1589 // U+0021 EXCLAMATION MARK (!)
1590 // Switch to the comment end bang state.
1591 Some('!') => {
1592 self.state = State::CommentEndBang;
1593 }
1594 // U+002D HYPHEN-MINUS (-)
1595 // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
1596 Some(c @ '-') => {
1597 self.append_to_comment_token(c, c);
1598 self.emit_error(ErrorKind::DoubleHyphenWithInComment);
1599 }
1600 // EOF
1601 // This is an eof-in-comment parse error. Emit the current comment token.
1602 // Emit an end-of-file token.
1603 None => {
1604 self.emit_error(ErrorKind::EofInComment);
1605 self.emit_comment_token(None);
1606 self.emit_token(Token::Eof);
1607
1608 return Ok(());
1609 }
1610 // Anything else
1611 // Append two U+002D (-) characters and the current input character to the
1612 // comment token’s data. Reconsume in the comment state.
1613 _ => {
1614 self.append_to_comment_token('-', '-');
1615 self.append_to_comment_token('-', '-');
1616 self.reconsume_in_state(State::Comment);
1617 }
1618 }
1619 }
1620 // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
1621 State::CommentEndBang => {
1622 // Consume the next input character:
1623 match self.consume_next_char() {
1624 // U+002D HYPHEN-MINUS (-)
1625 // Append a U+002D HYPHEN-MINUS character (-) and U+0021 EXCLAMATION MARK
1626 // character(!) to the comment token’s data. Switch to the comment end dash
1627 // state.
1628 Some('-') => {
1629 self.append_to_comment_token('-', '-');
1630 self.append_to_comment_token('!', '!');
1631 self.state = State::CommentEndDash;
1632 }
1633 // U+003E GREATER-THAN SIGN (>)
1634 // Parse error. Switch to the data state.Emit the comment token.
1635 Some('>') => {
1636 self.emit_error(ErrorKind::IncorrectlyClosedComment);
1637 self.state = State::Data;
1638 self.emit_comment_token(Some(">"));
1639 }
1640 // EOF
1641 // Parse error. Emit the comment token. Emit an end-of-file token.
1642 None => {
1643 self.emit_error(ErrorKind::EofInComment);
1644 self.emit_comment_token(None);
1645 self.emit_token(Token::Eof);
1646
1647 return Ok(());
1648 }
1649 // Anything else
1650 // Anything else
1651 // Append two U+002D (-) characters and U+0021 EXCLAMATION MARK character(!) to
1652 // the comment token’s data. Reconsume in the comment state.
1653 _ => {
1654 self.append_to_comment_token('-', '-');
1655 self.append_to_comment_token('-', '-');
1656 self.append_to_comment_token('!', '!');
1657 self.reconsume_in_state(State::Comment);
1658 }
1659 }
1660 }
1661 State::Cdata => {
1662 // Consume the next input character:
1663 match self.consume_next_char() {
1664 // U+005D RIGHT SQUARE BRACKET (])
1665 // Switch to the CDATA bracket state.
1666 Some(']') => {
1667 self.state = State::CdataBracket;
1668 }
1669 // EOF
1670 // Parse error. Reprocess the current input character in the data state.
1671 None => {
1672 self.emit_error(ErrorKind::EofInCdata);
1673 self.reconsume_in_state(State::Data);
1674 }
1675 // Anything else
1676 // Append the current input character to the cdata dta. Stay in the current
1677 // state.
1678 Some(c) => {
1679 self.validate_input_stream_character(c);
1680 self.append_to_cdata_token(Some(c), Some(c));
1681 }
1682 }
1683 }
1684 State::CdataBracket => {
1685 // Consume the next input character:
1686 match self.consume_next_char() {
1687 // U+005D RIGHT SQUARE BRACKET (])
1688 // Switch to the CDATA end state.
1689 Some(']') => {
1690 self.state = State::CdataEnd;
1691 }
1692 // EOF
1693 // Parse error. Reconsume the current input character in the data state.
1694 None => {
1695 self.emit_error(ErrorKind::EofInCdata);
1696 self.reconsume_in_state(State::Data);
1697 }
1698 // Anything else
1699 // Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
1700 // CDATA section state.
1701 Some(c) => {
1702 self.append_to_cdata_token(Some(']'), Some(']'));
1703 self.append_to_cdata_token(Some(c), Some(c));
1704 self.state = State::Cdata;
1705 }
1706 }
1707 }
1708 State::CdataEnd => {
1709 // Consume the next input character:
1710 match self.consume_next_char() {
1711 // U+003E GREATER-THAN SIGN (>)
1712 // Switch to the data state.
1713 Some('>') => {
1714 self.append_to_cdata_token(None, Some(']'));
1715 self.append_to_cdata_token(None, Some(']'));
1716 self.append_to_cdata_token(None, Some('>'));
1717 self.emit_cdata_token();
1718 self.state = State::Data;
1719 }
1720 // U+005D RIGHT SQUARE BRACKET (])
1721 // Emit the current input character as character token. Stay in the current
1722 // state.
1723 Some(c @ ']') => {
1724 self.append_to_cdata_token(Some(c), Some(c));
1725 }
1726 // EOF
1727 // Parse error. Reconsume the current input character in the data state.
1728 None => {
1729 self.emit_error(ErrorKind::EofInCdata);
1730 self.reconsume_in_state(State::Data);
1731 }
1732 // Anything else
1733 // Emit two U+005D RIGHT SQUARE BRACKET (]) characters as character tokens and
1734 // also emit the current input character as character token. Switch to the CDATA
1735 // state.
1736 Some(c) => {
1737 self.append_to_cdata_token(Some(']'), Some(']'));
1738 self.append_to_cdata_token(Some(']'), Some(']'));
1739 self.append_to_cdata_token(Some(c), Some(c));
1740 self.state = State::Cdata;
1741 }
1742 }
1743 }
1744 State::TagOpen => {
1745 // Consume the next input character:
1746 match self.consume_next_char() {
1747 // U+002F SOLIDUS (/)
1748 // Switch to the end tag open state.
1749 Some('/') => {
1750 self.state = State::EndTagOpen;
1751 }
1752 // U+0021 EXCLAMATION MARK (!)
1753 // Switch to the markup declaration open state.
1754 Some('!') => {
1755 self.state = State::MarkupDeclaration;
1756 }
1757 // U+003F QUESTION MARK(?)
1758 // Switch to the pi state.
1759 Some('?') => {
1760 self.state = State::Pi;
1761 }
1762 // Name start character
1763 // Create a new tag token and set its name to the input character, then switch
1764 // to the tag name state.
1765 Some(c) if is_name_start_char(c) => {
1766 self.create_tag_token(TagKind::Start);
1767 self.reconsume_in_state(State::TagName);
1768 }
1769 // EOF
1770 // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1771 // character token and an end-of-file token.
1772 None => {
1773 self.emit_error(ErrorKind::EofBeforeTagName);
1774 self.emit_character_token(('<', '<'));
1775 self.emit_token(Token::Eof);
1776
1777 return Ok(());
1778 }
1779 // Anything else
1780 // This is an invalid-first-character-of-tag-name parse error. Emit a U+003C
1781 // LESS-THAN SIGN character token. Reconsume in the data state.
1782 _ => {
1783 self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1784 self.emit_character_token(('<', '<'));
1785 self.reconsume_in_state(State::Data);
1786 }
1787 }
1788 }
1789 State::EndTagOpen => {
1790 // Consume the next input character:
1791 match self.consume_next_char() {
1792 // ASCII alpha
1793 // Create a new end tag token, set its tag name to the empty string.
1794 // Reconsume in the tag name state.
1795 Some(c) if is_name_char(c) => {
1796 self.create_tag_token(TagKind::End);
1797 self.reconsume_in_state(State::TagName);
1798 }
1799 // U+003E GREATER-THAN SIGN (>)
1800 // This is a missing-end-tag-name parse error. Switch to the data state.
1801 Some('>') => {
1802 self.emit_error(ErrorKind::MissingEndTagName);
1803 self.state = State::Data;
1804 }
1805 // EOF
1806 // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1807 // character token, a U+002F SOLIDUS character token and an end-of-file
1808 // token.
1809 None => {
1810 self.emit_error(ErrorKind::EofBeforeTagName);
1811 self.emit_character_token(('<', '<'));
1812 self.emit_character_token(('/', '/'));
1813 self.emit_token(Token::Eof);
1814
1815 return Ok(());
1816 }
1817 // Anything else
1818 // This is an invalid-first-character-of-tag-name parse error. Create a
1819 // comment token whose data is the empty string. Reconsume in the bogus
1820 // comment state.
1821 _ => {
1822 self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1823 self.emit_character_token(('<', '<'));
1824 self.emit_character_token(('/', '/'));
1825 self.reconsume_in_state(State::BogusComment);
1826 }
1827 }
1828 }
1829 State::TagName => {
1830 // Consume the next input character:
1831 match self.consume_next_char() {
1832 // U+0009 CHARACTER TABULATION (Tab)
1833 // U+000A LINE FEED (LF)
1834 // U+0020 SPACE (Space)
1835 // Switch to the before attribute name state.
1836 Some(c) if is_whitespace(c) => {
1837 self.skip_next_lf(c);
1838 self.state = State::TagAttributeNameBefore;
1839 }
1840 // U+002F SOLIDUS (/)
1841 // Set current tag to empty tag. Switch to the empty tag state.
1842 Some('/') => {
1843 self.set_tag_to_empty_tag();
1844 self.state = State::EmptyTag;
1845 }
1846 // U+003E GREATER-THAN SIGN (>)
1847 // Switch to the data state. Emit the current tag token.
1848 Some('>') => {
1849 self.state = State::Data;
1850 self.emit_tag_token(None);
1851 }
1852 // EOF
1853 // This is an eof-in-tag parse error. Emit an end-of-file token.
1854 None => {
1855 self.emit_error(ErrorKind::EofInTag);
1856 self.emit_tag_token(None);
1857
1858 return Ok(());
1859 }
1860 // Name character
1861 // Append the current input character to the tag name and stay in the current
1862 // state.
1863 Some(c) if is_name_char(c) => {
1864 self.validate_input_stream_character(c);
1865 self.append_to_tag_token_name(c);
1866 }
1867 // Anything else
1868 // Parse error. Append the current input character to the tag name and stay in
1869 // the current state.
1870 Some(c) => {
1871 self.emit_error(ErrorKind::InvalidCharacterInTag);
1872 self.validate_input_stream_character(c);
1873 self.append_to_tag_token_name(c);
1874 }
1875 }
1876 }
1877 State::EmptyTag => {
1878 // Consume the next input character:
1879 match self.consume_next_char() {
1880 // U+003E GREATER-THAN SIGN (>)
1881 // Emit the current tag token as empty tag token and then switch to the data
1882 // state.
1883 Some('>') => {
1884 self.emit_tag_token(Some(TagKind::Empty));
1885 self.state = State::Data;
1886 }
1887 // Anything else
1888 // Parse error. Reprocess the current input character in the tag attribute name
1889 // before state.
1890 _ => {
1891 self.emit_error(ErrorKind::UnexpectedSolidusInTag);
1892 self.reconsume_in_state(State::TagAttributeNameBefore);
1893 }
1894 }
1895 }
1896 State::TagAttributeNameBefore => {
1897 // Consume the next input character:
1898 match self.consume_next_char() {
1899 // U+0009 CHARACTER TABULATION (tab)
1900 // U+000A LINE FEED (LF)
1901 // U+0020 SPACE
1902 // Ignore the character.
1903 Some(c) if is_whitespace(c) => {
1904 self.skip_next_lf(c);
1905 }
1906 // U+003E GREATER-THAN SIGN(>)
1907 // Emit the current token and then switch to the data state.
1908 Some('>') => {
1909 self.emit_tag_token(None);
1910 self.state = State::Data;
1911 }
1912 // U+002F SOLIDUS (/)
1913 // Set current tag to empty tag. Switch to the empty tag state.
1914 Some('/') => {
1915 self.set_tag_to_empty_tag();
1916 self.state = State::EmptyTag;
1917 }
1918 // U+003A COLON (:)
1919 // Parse error. Stay in the current state.
1920 Some(':') => {
1921 self.emit_error(ErrorKind::UnexpectedColonBeforeAttributeName);
1922 }
1923 // EOF
1924 // Parse error. Emit the current token and then reprocess the current input
1925 // character in the data state.
1926 None => {
1927 self.emit_error(ErrorKind::EofBeforeTagName);
1928 self.emit_tag_token(None);
1929 self.reconsume_in_state(State::Data);
1930 }
1931 // Anything else
1932 // Start a new attribute in the current tag token. Set that attribute’s name to
1933 // the current input character and its value to the empty string and then switch
1934 // to the tag attribute name state.
1935 _ => {
1936 self.start_new_attribute(None);
1937 self.reconsume_in_state(State::TagAttributeName);
1938 }
1939 }
1940 }
1941 State::TagAttributeName => {
1942 // Consume the next input character:
1943 match self.consume_next_char() {
1944 // U+003D EQUALS SIGN (=)
1945 // Switch to the before attribute value state.
1946 Some('=') => {
1947 self.state = State::TagAttributeValueBefore;
1948 }
1949 // U+003E GREATER-THEN SIGN (>)
1950 // Emit the current token as start tag token. Switch to the data state.
1951 Some('>') => {
1952 self.emit_error(ErrorKind::MissingEqualAfterAttributeName);
1953 self.emit_tag_token(None);
1954 self.state = State::Data;
1955 }
1956 // U+0009 CHARACTER TABULATION (Tab)
1957 // U+000A LINE FEED (LF)
1958 // U+0020 SPACE (Space)
1959 // Switch to the tag attribute name after state.
1960 Some(c) if is_whitespace(c) => {
1961 self.update_attribute_span();
1962 self.skip_next_lf(c);
1963 self.reconsume_in_state(State::TagAttributeNameAfter);
1964 }
1965 // U+002F SOLIDUS (/)
1966 // Set current tag to empty tag. Switch to the empty tag state.
1967 Some('/') => {
1968 self.emit_error(ErrorKind::MissingEqualAfterAttributeName);
1969 self.set_tag_to_empty_tag();
1970 self.state = State::EmptyTag;
1971 }
1972 // EOF
1973 // Parse error. Emit the current token as start tag token and then reprocess the
1974 // current input character in the data state.
1975 None => {
1976 self.emit_error(ErrorKind::EofInTag);
1977 self.emit_tag_token(Some(TagKind::Start));
1978 self.reconsume_in_state(State::Data);
1979 }
1980 // Anything else
1981 // Append the current input character to the current attribute's name.
1982 Some(c) => {
1983 self.validate_input_stream_character(c);
1984 self.append_to_attribute(Some((c, c)), None);
1985 }
1986 }
1987
1988 // When the user agent leaves the attribute name state (and
1989 // before emitting the tag token, if appropriate), the
1990 // complete attribute's name must be compared to the other
1991 // attributes on the same token; if there is already an
1992 // attribute on the token with the exact same name, then
1993 // this is a duplicate-attribute parse error and the new
1994 // attribute must be removed from the token.
1995 //
1996 // We postpone it when we will emit current tag token
1997 }
1998 State::TagAttributeNameAfter => {
1999 // Consume the next input character:
2000 match self.consume_next_char() {
2001 // U+0009 CHARACTER TABULATION (tab)
2002 // U+000A LINE FEED (LF)
2003 // U+0020 SPACE
2004 // Ignore the character.
2005 Some(c) if is_whitespace(c) => {
2006 self.skip_next_lf(c);
2007 }
2008 // U+003D EQUALS SIGN(=)
2009 // Switch to the tag attribute value before state.
2010 Some('=') => {
2011 self.state = State::TagAttributeValueBefore;
2012 }
2013 // U+003E GREATER-THEN SIGN(>)
2014 // Emit the current token and then switch to the data state.
2015 Some('>') => {
2016 self.emit_tag_token(None);
2017 self.state = State::Data;
2018 }
2019 // U+002F SOLIDUS (/)
2020 // Set current tag to empty tag. Switch to the empty tag state.
2021 Some('/') => {
2022 self.set_tag_to_empty_tag();
2023 self.state = State::EmptyTag;
2024 }
2025 // EOF
2026 // Parse error. Emit the current token and then reprocess the current input
2027 // character in the data state.
2028 None => {
2029 self.emit_error(ErrorKind::EofInTag);
2030 self.emit_tag_token(None);
2031 self.reconsume_in_state(State::Data);
2032 }
2033 // Anything else
2034 // Start a new attribute in the current tag token. Set that attribute’s name to
2035 // the current input character and its value to the empty string and then switch
2036 // to the tag attribute name state.
2037 Some(c) => {
2038 self.emit_error(ErrorKind::MissingEqualAfterAttributeName);
2039 self.validate_input_stream_character(c);
2040 self.start_new_attribute(Some(c));
2041 self.state = State::TagAttributeName;
2042 }
2043 }
2044 }
2045 State::TagAttributeValueBefore => {
2046 // Consume the next input character:
2047 match self.consume_next_char() {
2048 // U+0009 CHARACTER TABULATION (tab)
2049 // U+000A LINE FEED (LF)
2050 // U+0020 SPACE
2051 // Ignore the character.
2052 Some(c) if is_whitespace(c) => {
2053 self.skip_next_lf(c);
2054 }
2055 // U+0022 QUOTATION MARK (")
2056 // Switch to the attribute value (double-quoted) state.
2057 Some(c @ '"') => {
2058 self.append_to_attribute(None, Some((true, None, Some(c))));
2059 self.state = State::TagAttributeValueDoubleQuoted;
2060 }
2061 // U+0027 APOSTROPHE (')
2062 // Switch to the attribute value (single-quoted) state.
2063 Some(c @ '\'') => {
2064 self.append_to_attribute(None, Some((true, None, Some(c))));
2065 self.state = State::TagAttributeValueSingleQuoted;
2066 }
2067 // U+003E GREATER-THAN SIGN(>)
2068 // Emit the current token and then switch to the data state.
2069 Some('>') => {
2070 self.emit_tag_token(None);
2071 self.state = State::Data;
2072 }
2073 // EOF
2074 // Parse error. Emit the current token and then reprocess the current input
2075 // character in the data state.
2076 None => {
2077 self.emit_error(ErrorKind::EofInTag);
2078 self.emit_tag_token(None);
2079 self.reconsume_in_state(State::Data);
2080 }
2081 // Anything else
2082 // Append the current input character to the current attribute’s value and then
2083 // switch to the tag attribute value unquoted state.
2084 Some(c) => {
2085 self.emit_error(ErrorKind::MissingQuoteBeforeAttributeValue);
2086 self.validate_input_stream_character(c);
2087 self.append_to_attribute(None, Some((true, Some(c), Some(c))));
2088 self.state = State::TagAttributeValueUnquoted;
2089 }
2090 }
2091 }
2092 State::TagAttributeValueDoubleQuoted => {
2093 // Consume the next input character:
2094 match self.consume_next_char() {
2095 // U+0022 QUOTATION MARK (")
2096 // Switch to the tag attribute name before state.
2097 // We set value to support empty attributes (i.e. `attr=""`)
2098 Some(c @ '"') => {
2099 self.append_to_attribute(None, Some((false, None, Some(c))));
2100 self.state = State::TagAttributeValueAfter;
2101 }
2102 // U+0026 AMPERSAND (&)
2103 // Switch to character reference in attribute value state, with the additional
2104 // allowed character being U+0022 QUOTATION MARK(").
2105 Some('&') => {
2106 self.return_state = Some(self.state.clone());
2107 self.state = State::CharacterReferenceInAttributeValue;
2108 self.additional_allowed_character = Some('"');
2109 }
2110 // (<)
2111 Some(c @ '<') => {
2112 self.emit_error(ErrorKind::UnescapedCharacterInAttributeValue('<'));
2113 self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2114 }
2115 // EOF
2116 // Parse error. Emit the current token and then reprocess the current input
2117 // character in the data state.
2118 None => {
2119 self.emit_error(ErrorKind::EofInTag);
2120 self.emit_tag_token(None);
2121 self.reconsume_in_state(State::Data);
2122 }
2123 // Anything else
2124 // Append the input character to the current attribute’s value. Stay in the
2125 // current state.
2126 Some(c) => {
2127 self.validate_input_stream_character(c);
2128 self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2129 }
2130 }
2131 }
2132 State::TagAttributeValueSingleQuoted => {
2133 // Consume the next input character:
2134 match self.consume_next_char() {
2135 // U+0022 APOSTROPHE (')
2136 // Switch to the tag attribute name before state.
2137 // We set value to support empty attributes (i.e. `attr=''`)
2138 Some(c @ '\'') => {
2139 self.append_to_attribute(None, Some((false, None, Some(c))));
2140 self.state = State::TagAttributeValueAfter;
2141 }
2142 // U+0026 AMPERSAND (&)
2143 // Switch to character reference in attribute value state, with the additional
2144 // allowed character being APOSTROPHE (').
2145 Some('&') => {
2146 self.return_state = Some(self.state.clone());
2147 self.state = State::CharacterReferenceInAttributeValue;
2148 self.additional_allowed_character = Some('\'');
2149 }
2150 // (<)
2151 Some(c @ '<') => {
2152 self.emit_error(ErrorKind::UnescapedCharacterInAttributeValue('<'));
2153 self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2154 }
2155 // EOF
2156 // Parse error. Emit the current token and then reprocess the current input
2157 // character in the data state.
2158 None => {
2159 self.emit_error(ErrorKind::EofInTag);
2160 self.emit_tag_token(None);
2161 self.reconsume_in_state(State::Data);
2162 }
2163 // Anything else
2164 // Append the current input character to the current attribute's value.
2165 Some(c) => {
2166 self.validate_input_stream_character(c);
2167 self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2168 }
2169 }
2170 }
2171 State::TagAttributeValueUnquoted => {
2172 // Consume the next input character:
2173 match self.consume_next_char() {
2174 // U+0009 CHARACTER TABULATION (Tab)
2175 // U+000A LINE FEED (LF)
2176 // U+0020 SPACE (Space)
2177 // Switch to the before attribute name state.
2178 Some(c) if is_whitespace(c) => {
2179 self.update_attribute_span();
2180 self.skip_next_lf(c);
2181 self.state = State::TagAttributeValueAfter;
2182 }
2183 // U+0026 AMPERSAND (&)
2184 // Set the return state to the attribute value (unquoted) state. Switch to
2185 // the character reference state.
2186 Some('&') => {
2187 self.return_state = Some(self.state.clone());
2188 self.state = State::CharacterReferenceInAttributeValue;
2189 self.additional_allowed_character = Some('>');
2190 }
2191 // (<)
2192 Some(c @ '<') => {
2193 self.emit_error(ErrorKind::UnescapedCharacterInAttributeValue('<'));
2194 self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2195 }
2196 // U+003E GREATER-THAN SIGN (>)
2197 // Emit the current token as start tag token and then switch to the data state.
2198 Some('>') => {
2199 self.update_attribute_span();
2200 self.emit_tag_token(Some(TagKind::Start));
2201 self.state = State::Data;
2202 }
2203 // EOF
2204 // Parse error. Emit the current token as start tag token and then reprocess the
2205 // current input character in the data state.
2206 None => {
2207 self.emit_error(ErrorKind::EofInTag);
2208 self.update_attribute_span();
2209 self.emit_tag_token(Some(TagKind::Start));
2210 self.reconsume_in_state(State::Data);
2211 }
2212 // Anything else
2213 // Append the input character to the current attribute’s value. Stay in the
2214 // current state.
2215 Some(c) => {
2216 self.validate_input_stream_character(c);
2217 self.append_to_attribute(None, Some((false, Some(c), Some(c))));
2218 }
2219 }
2220 }
2221 State::TagAttributeValueAfter => match self.consume_next_char() {
2222 Some(c) if is_whitespace(c) => {
2223 self.reconsume_in_state(State::TagAttributeNameBefore);
2224 }
2225 Some('>') | Some('/') => {
2226 self.reconsume_in_state(State::TagAttributeNameBefore);
2227 }
2228 None => {
2229 self.emit_error(ErrorKind::EofInTag);
2230 self.update_attribute_span();
2231 self.emit_tag_token(Some(TagKind::Start));
2232 self.reconsume_in_state(State::Data);
2233 }
2234 _ => {
2235 self.emit_error(ErrorKind::MissingSpaceBetweenAttributes);
2236 self.reconsume_in_state(State::TagAttributeNameBefore);
2237 }
2238 },
2239 State::CharacterReferenceInAttributeValue => {
2240 // Attempt to consume a character reference.
2241 //
2242 // If nothing is returned, append a U+0026 AMPERSAND (&) character to current
2243 // attribute’s value.
2244 //
2245 // Otherwise append returned character tokens to current attribute’s value.
2246 //
2247 // Finally, switch back to attribute value state that switched to this state.
2248
2249 let character_reference = self.consume_character_reference();
2250
2251 if let Some((c, raw)) = character_reference {
2252 self.append_to_attribute_with_entity(Some((Some(c), Some(&raw))));
2253 } else {
2254 self.append_to_attribute(None, Some((false, Some('&'), Some('&'))));
2255 }
2256
2257 if let Some(return_state) = &self.return_state {
2258 self.state = return_state.clone();
2259 }
2260 }
2261 State::BogusComment => {
2262 // Consume every character up to the first U+003E GREATER-THAN SIGN (>) or EOF,
2263 // whichever comes first. Emit a comment token whose data is the concatenation
2264 // of all those consumed characters. Then consume the next input character and
2265 // switch to the data state reprocessing the EOF character if that was the
2266 // character consumed.
2267 match self.consume_next_char() {
2268 // U+003E GREATER-THAN SIGN (>)
2269 // Switch to the data state. Emit the current comment token.
2270 Some('>') => {
2271 self.emit_comment_token(Some(">"));
2272 self.state = State::Data;
2273 }
2274 // EOF
2275 // Emit the comment. Emit an end-of-file token.
2276 None => {
2277 self.emit_comment_token(None);
2278 self.state = State::Data;
2279 self.reconsume();
2280 }
2281 // Anything else
2282 // Append the current input character to the comment token's data.
2283 Some(c) => {
2284 self.validate_input_stream_character(c);
2285 self.handle_raw_and_append_to_comment_token(c);
2286 }
2287 }
2288 }
2289 State::Doctype => {
2290 // Consume the next input character:
2291 match self.consume_next_char() {
2292 // U+0009 CHARACTER TABULATION (tab)
2293 // U+000A LINE FEED (LF)
2294 // U+000C FORM FEED (FF)
2295 // U+0020 SPACE
2296 // Switch to the before DOCTYPE name state.
2297 Some(c) if is_whitespace(c) => {
2298 self.append_raw_to_doctype_token(c);
2299 self.state = State::BeforeDoctypeName;
2300 }
2301 // EOF
2302 // Parse error. Switch to data state. Create new Doctype token. Emit Doctype
2303 // token. Reconsume the EOF character.
2304 None => {
2305 self.emit_error(ErrorKind::EofInDoctype);
2306 self.state = State::Data;
2307 self.create_doctype_token(None);
2308 self.emit_doctype_token();
2309 self.reconsume();
2310 }
2311 // Anything else
2312 // This is a missing-whitespace-before-doctype-name parse error. Reconsume
2313 // in the before DOCTYPE name state.
2314 _ => {
2315 self.emit_error(ErrorKind::MissingWhitespaceBeforeDoctypeName);
2316 self.reconsume_in_state(State::BeforeDoctypeName);
2317 }
2318 }
2319 }
2320 State::BeforeDoctypeName => {
2321 // Consume the next input character:
2322 match self.consume_next_char() {
2323 // U+0009 CHARACTER TABULATION (tab)
2324 // U+000A LINE FEED (LF)
2325 // U+000C FORM FEED (FF)
2326 // U+0020 SPACE
2327 // Ignore the character.
2328 Some(c) if is_whitespace(c) => {
2329 self.append_raw_to_doctype_token(c);
2330 }
2331 // Uppercase ASCII letter
2332 // Create a new DOCTYPE token. Set the token name to lowercase version of the
2333 // current input character. Switch to the DOCTYPE name state.
2334 Some(c) if is_ascii_upper_alpha(c) => {
2335 self.append_raw_to_doctype_token(c);
2336 self.create_doctype_token(Some(c.to_ascii_lowercase()));
2337 self.state = State::DoctypeName;
2338 }
2339 // U+003E GREATER-THAN SIGN (>)
2340 // This is a missing-doctype-name parse error. Create a new DOCTYPE token.
2341 // Set its force-quirks flag to on. Switch to the data state. Emit the
2342 // current token.
2343 Some(c @ '>') => {
2344 self.append_raw_to_doctype_token(c);
2345 self.emit_error(ErrorKind::MissingDoctypeName);
2346 self.create_doctype_token(None);
2347 self.emit_doctype_token();
2348 self.state = State::Data;
2349 }
2350 // EOF
2351 // Parse error. Switch to data state. Create new Doctype token. Emit Doctype
2352 // token. Reconsume the EOF character.
2353 None => {
2354 self.emit_error(ErrorKind::EofInDoctype);
2355 self.state = State::Data;
2356 self.create_doctype_token(None);
2357 self.emit_doctype_token();
2358 self.reconsume();
2359 }
2360 // Anything else
2361 // Create new DOCTYPE token. Set the token’s name to current input character.
2362 // Switch to DOCTYPE name state.
2363 Some(c) => {
2364 self.validate_input_stream_character(c);
2365 self.append_raw_to_doctype_token(c);
2366 self.create_doctype_token(Some(c));
2367 self.state = State::DoctypeName;
2368 }
2369 }
2370 }
2371 State::DoctypeName => {
2372 // Consume the next input character:
2373 match self.consume_next_char() {
2374 // U+0009 CHARACTER TABULATION (tab)
2375 // U+000A LINE FEED (LF)
2376 // U+000C FORM FEED (FF)
2377 // U+0020 SPACE
2378 // Switch to the after DOCTYPE name state.
2379 Some(c) if is_whitespace(c) => {
2380 self.append_raw_to_doctype_token(c);
2381 self.state = State::AfterDoctypeName;
2382 }
2383 // ASCII upper alpha
2384 // Append the lowercase version of the current input character (add 0x0020
2385 // to the character's code point) to the current DOCTYPE token's name.
2386 Some(c) if is_ascii_upper_alpha(c) => {
2387 self.append_raw_to_doctype_token(c);
2388 self.append_to_doctype_token(Some(c.to_ascii_lowercase()), None, None);
2389 }
2390 // U+003E GREATER-THAN SIGN (>)
2391 // Emit token. Switch to data state.
2392 Some('>') => {
2393 self.emit_doctype_token();
2394 self.state = State::Data;
2395 }
2396 // EOF
2397 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2398 // character.
2399 None => {
2400 self.emit_error(ErrorKind::EofInDoctype);
2401 self.state = State::Data;
2402 self.emit_doctype_token();
2403 self.reconsume();
2404 }
2405 // Anything else
2406 // Append the current input character to the current DOCTYPE token's name.
2407 Some(c) => {
2408 self.validate_input_stream_character(c);
2409 self.append_raw_to_doctype_token(c);
2410 self.append_to_doctype_token(Some(c), None, None);
2411 }
2412 }
2413 }
2414 State::AfterDoctypeName => {
2415 let cur_pos = self.input.cur_pos();
2416
2417 // Consume the next input character:
2418 match self.consume_next_char() {
2419 // U+0009 CHARACTER TABULATION (tab)
2420 // U+000A LINE FEED (LF)
2421 // U+000C FORM FEED (FF)
2422 // U+0020 SPACE
2423 // Ignore the character.
2424 Some(c) if is_whitespace(c) => {
2425 self.append_raw_to_doctype_token(c);
2426 }
2427 // U+003E GREATER-THAN SIGN (>)
2428 // Switch to the data state. Emit the current DOCTYPE token.
2429 Some(c @ '>') => {
2430 self.append_raw_to_doctype_token(c);
2431 self.state = State::Data;
2432 self.emit_doctype_token();
2433 }
2434 // U+005B LEFT SQUARE BRACKET ([)
2435 // Switch to the doctype internal subset state.
2436 Some(c @ '[') => {
2437 self.append_raw_to_doctype_token(c);
2438 self.state = State::DoctypeTypeInternalSubSet;
2439 }
2440 // EOF
2441 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2442 // character.
2443 None => {
2444 self.emit_error(ErrorKind::EofInDoctype);
2445 self.state = State::Data;
2446 self.emit_doctype_token();
2447 self.reconsume();
2448 }
2449 // Anything else
2450 // If the six characters starting from the current input character are an
2451 // ASCII case-insensitive match for the word "PUBLIC", then consume those
2452 // characters and switch to the after DOCTYPE public keyword state.
2453 //
2454 // Otherwise, if the six characters starting from the current input
2455 // character are an ASCII case-insensitive match for the word "SYSTEM", then
2456 // consume those characters and switch to the after DOCTYPE system keyword
2457 // state.
2458 //
2459 // Otherwise, this is an invalid-character-sequence-after-doctype-name parse
2460 // error. Set the current DOCTYPE token's force-quirks flag to on. Reconsume
2461 // in the bogus DOCTYPE state.
2462 Some(c) => {
2463 let mut first_six_chars = String::with_capacity(6);
2464
2465 first_six_chars.push(c);
2466
2467 for _ in 0..5 {
2468 match self.consume_next_char() {
2469 Some(c) => {
2470 first_six_chars.push(c);
2471 }
2472 _ => {
2473 break;
2474 }
2475 }
2476 }
2477
2478 match &*first_six_chars.to_lowercase() {
2479 "public" => {
2480 self.state = State::AfterDoctypePublicKeyword;
2481
2482 if let Some(doctype_raw) = &mut self.doctype_raw {
2483 doctype_raw.push_str(&first_six_chars);
2484 }
2485 }
2486 "system" => {
2487 self.state = State::AfterDoctypeSystemKeyword;
2488
2489 if let Some(doctype_raw) = &mut self.doctype_raw {
2490 doctype_raw.push_str(&first_six_chars);
2491 }
2492 }
2493 _ => {
2494 self.cur_pos = cur_pos;
2495 unsafe {
2496 // Safety: We got cur_pos from self.input.cur_pos()
2497 self.input.reset_to(cur_pos);
2498 }
2499 self.emit_error(
2500 ErrorKind::InvalidCharacterSequenceAfterDoctypeName,
2501 );
2502 self.reconsume_in_state(State::BogusDoctype);
2503 }
2504 }
2505 }
2506 }
2507 }
2508 State::AfterDoctypePublicKeyword => {
2509 // Consume the next input character:
2510 match self.consume_next_char() {
2511 // U+0009 CHARACTER TABULATION (Tab)
2512 // U+000A LINE FEED (LF)
2513 // U+000C FORM FEED (FF)
2514 // U+0020 SPACE (Space)
2515 // Switch to the before DOCTYPE public identifier state.
2516 Some(c) if is_whitespace(c) => {
2517 self.append_raw_to_doctype_token(c);
2518 self.state = State::BeforeDoctypePublicIdentifier;
2519 }
2520 // U+0022 QUOTATION MARK (")
2521 // This is a missing-whitespace-after-doctype-public-keyword parse error.
2522 // Set the current DOCTYPE token's public identifier to the empty string
2523 // (not missing), then switch to the DOCTYPE public identifier
2524 // (double-quoted) state.
2525 Some(c @ '"') => {
2526 self.append_raw_to_doctype_token(c);
2527 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
2528 self.set_doctype_token_public_id();
2529 self.state = State::DoctypePublicIdentifierDoubleQuoted;
2530 }
2531 // U+0027 APOSTROPHE (')
2532 // This is a missing-whitespace-after-doctype-public-keyword parse error.
2533 // Set the current DOCTYPE token's public identifier to the empty string
2534 // (not missing), then switch to the DOCTYPE public identifier
2535 // (single-quoted) state.
2536 Some(c @ '\'') => {
2537 self.append_raw_to_doctype_token(c);
2538 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
2539 self.set_doctype_token_public_id();
2540 self.state = State::DoctypePublicIdentifierSingleQuoted;
2541 }
2542 // U+003E GREATER-THAN SIGN (>)
2543 // This is a missing-doctype-public-identifier parse error. Set the current
2544 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
2545 // the current DOCTYPE token.
2546 Some(c @ '>') => {
2547 self.append_raw_to_doctype_token(c);
2548 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
2549 self.set_doctype_token_public_id();
2550 self.state = State::DoctypePublicIdentifierSingleQuoted;
2551 }
2552 // EOF
2553 // Parse error. Switch to the data state. Emit that DOCTYPE token. Reconsume the
2554 // EOF character.
2555 None => {
2556 self.emit_error(ErrorKind::EofInDoctype);
2557 self.state = State::Data;
2558 self.emit_doctype_token();
2559 self.reconsume()
2560 }
2561 // Anything else
2562 // Parse error. Switch to the bogus DOCTYPE state. Emit that DOCTYPE token.
2563 // Reconsume the EOF character.
2564 _ => {
2565 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier);
2566 self.reconsume_in_state(State::BogusDoctype);
2567 self.emit_doctype_token();
2568 self.reconsume()
2569 }
2570 }
2571 }
2572 State::AfterDoctypeSystemKeyword => {
2573 // Consume the next input character:
2574 match self.consume_next_char() {
2575 // U+0009 CHARACTER TABULATION (tab)
2576 // U+000A LINE FEED (LF)
2577 // U+000C FORM FEED (FF)
2578 // U+0020 SPACE
2579 // Switch to the before DOCTYPE system identifier state.
2580 Some(c) if is_whitespace(c) => {
2581 self.append_raw_to_doctype_token(c);
2582 self.state = State::BeforeDoctypeSystemIdentifier;
2583 }
2584 // U+0022 QUOTATION MARK (")
2585 // This is a missing-whitespace-after-doctype-system-keyword parse error.
2586 // Set the current DOCTYPE token's system identifier to the empty string
2587 // (not missing), then switch to the DOCTYPE system identifier
2588 // (double-quoted) state.
2589 Some(c @ '"') => {
2590 self.append_raw_to_doctype_token(c);
2591 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
2592 self.set_doctype_token_system_id();
2593 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2594 }
2595 // U+0027 APOSTROPHE (')
2596 // This is a missing-whitespace-after-doctype-system-keyword parse error.
2597 // Set the current DOCTYPE token's system identifier to the empty string
2598 // (not missing), then switch to the DOCTYPE system identifier
2599 // (single-quoted) state.
2600 Some(c @ '\'') => {
2601 self.append_raw_to_doctype_token(c);
2602 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
2603 self.set_doctype_token_system_id();
2604 self.state = State::DoctypeSystemIdentifierSingleQuoted;
2605 }
2606 // U+003E GREATER-THAN SIGN(>)
2607 // Parse error. Set the DOCTYPE token’s public identifier current DOCTYPE token
2608 // to the empty string (not missing), then switch to the DOCTYPE system
2609 // identifier (single-quoted) state.
2610 Some(c @ '>') => {
2611 self.append_raw_to_doctype_token(c);
2612 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
2613 self.set_doctype_token_system_id();
2614 self.state = State::DoctypeSystemIdentifierSingleQuoted;
2615 }
2616 // EOF
2617 // Parse error. Switch to the data state. Emit that DOCTYPE token. Reconsume the
2618 // EOF character.
2619 None => {
2620 self.emit_error(ErrorKind::EofInDoctype);
2621 self.state = State::Data;
2622 self.emit_doctype_token();
2623 self.reconsume()
2624 }
2625 // Anything else
2626 // Parse error. Switch to the bogus DOCTYPE state.
2627 Some(c) => {
2628 self.validate_input_stream_character(c);
2629 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2630 self.state = State::BogusComment
2631 }
2632 }
2633 }
2634 State::BeforeDoctypeSystemIdentifier => {
2635 // Consume the next input character:
2636 match self.consume_next_char() {
2637 // U+0009 CHARACTER TABULATION (tab)
2638 // U+000A LINE FEED (LF)
2639 // U+000C FORM FEED (FF)
2640 // U+0020 SPACE
2641 // Ignore the character.
2642 Some(c) if is_whitespace(c) => {
2643 self.append_raw_to_doctype_token(c);
2644 }
2645 // U+0022 QUOTATION MARK (")
2646 // Set the current DOCTYPE token's system identifier to the empty string
2647 // (not missing), then switch to the DOCTYPE system identifier
2648 // (double-quoted) state.
2649 Some(c @ '"') => {
2650 self.append_raw_to_doctype_token(c);
2651 self.set_doctype_token_system_id();
2652 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2653 }
2654 // U+0027 APOSTROPHE (')
2655 // Set the current DOCTYPE token's system identifier to the empty string
2656 // (not missing), then switch to the DOCTYPE system identifier
2657 // (single-quoted) state.
2658 Some(c @ '\'') => {
2659 self.append_raw_to_doctype_token(c);
2660 self.set_doctype_token_system_id();
2661 self.state = State::DoctypeSystemIdentifierSingleQuoted;
2662 }
2663 // U+003E GREATER-THAN SIGN (>)
2664 // This is a missing-doctype-system-identifier parse error. Set the current
2665 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
2666 // the current DOCTYPE token.
2667 Some(c @ '>') => {
2668 self.append_raw_to_doctype_token(c);
2669 self.emit_error(ErrorKind::EofInDoctype);
2670 self.state = State::Data;
2671 self.emit_doctype_token();
2672 }
2673 // EOF
2674 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2675 // character.
2676 None => {
2677 self.emit_error(ErrorKind::EofInDoctype);
2678 self.state = State::Data;
2679 self.emit_doctype_token();
2680 self.reconsume();
2681 }
2682 // Anything else
2683 // Parse error. Switch to the bogus DOCTYPE state.
2684 Some(c) => {
2685 self.validate_input_stream_character(c);
2686 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2687 self.state = State::BogusDoctype;
2688 }
2689 }
2690 }
2691 State::BeforeDoctypePublicIdentifier => {
2692 // Consume the next input character:
2693 match self.consume_next_char() {
2694 // U+0009 CHARACTER TABULATION (tab)
2695 // U+000A LINE FEED (LF)
2696 // U+000C FORM FEED (FF)
2697 // U+0020 SPACE
2698 // Ignore the character.
2699 Some(c) if is_whitespace(c) => {
2700 self.append_raw_to_doctype_token(c);
2701 }
2702 // U+0022 QUOTATION MARK (")
2703 // Set the current DOCTYPE token's public identifier to the empty string
2704 // (not missing), then switch to the DOCTYPE public identifier
2705 // (double-quoted) state.
2706 Some(c @ '"') => {
2707 self.append_raw_to_doctype_token(c);
2708 self.set_doctype_token_public_id();
2709 self.state = State::DoctypePublicIdentifierDoubleQuoted;
2710 }
2711 // U+0027 APOSTROPHE (')
2712 // Set the current DOCTYPE token's public identifier to the empty string
2713 // (not missing), then switch to the DOCTYPE public identifier
2714 // (single-quoted) state.
2715 Some(c @ '\'') => {
2716 self.append_raw_to_doctype_token(c);
2717 self.set_doctype_token_public_id();
2718 self.state = State::DoctypePublicIdentifierSingleQuoted;
2719 }
2720 // U+003E GREATER-THAN SIGN(>)
2721 // Parse error. Switch to data state. Emit current DOCTYPE token.
2722 Some(c @ '>') => {
2723 self.append_raw_to_doctype_token(c);
2724 self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
2725 self.state = State::Data;
2726 self.emit_doctype_token();
2727 }
2728 // EOF
2729 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
2730 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
2731 // end-of-file token.
2732 None => {
2733 self.emit_error(ErrorKind::EofInDoctype);
2734 self.state = State::Data;
2735 self.emit_doctype_token();
2736 self.reconsume();
2737 }
2738 // Anything else
2739 // Parse error. Switch to the bogus DOCTYPE state.
2740 Some(c) => {
2741 self.validate_input_stream_character(c);
2742 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2743 self.state = State::BogusDoctype;
2744 }
2745 }
2746 }
2747 State::DoctypePublicIdentifierSingleQuoted => {
2748 // Consume the next input character:
2749 match self.consume_next_char() {
2750 // U+0027 APOSTROPHE (')
2751 // Switch to the after DOCTYPE public identifier state.
2752 Some(c @ '\'') => {
2753 self.append_raw_to_doctype_token(c);
2754 self.state = State::AfterDoctypePublicIdentifier;
2755 }
2756 // U+003E GREATER-THAN SIGN(>)
2757 // Parse error. Switch to data state. Emit current DOCTYPE token.
2758 Some(c @ '>') => {
2759 self.append_raw_to_doctype_token(c);
2760 self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
2761 self.state = State::Data;
2762 self.emit_doctype_token();
2763 }
2764 // EOF
2765 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2766 // character.
2767 None => {
2768 self.emit_error(ErrorKind::EofInDoctype);
2769 self.state = State::Data;
2770 self.emit_doctype_token();
2771 self.reconsume();
2772 }
2773 // Anything else
2774 // Append the current input character to the current DOCTYPE token’s public
2775 // identifier.
2776 Some(c) => {
2777 self.validate_input_stream_character(c);
2778 self.append_raw_to_doctype_token(c);
2779 self.append_to_doctype_token(None, Some(c), None);
2780 }
2781 }
2782 }
2783 State::DoctypePublicIdentifierDoubleQuoted => {
2784 // Consume the next input character:
2785 match self.consume_next_char() {
2786 // U+0022 QUOTATION MARK (")
2787 // Switch to the after DOCTYPE public identifier state.
2788 Some(c @ '"') => {
2789 self.append_raw_to_doctype_token(c);
2790 self.state = State::AfterDoctypePublicIdentifier;
2791 }
2792 // U+003E GREATER-THAN SIGN(>)
2793 // Parse error. Switch to data state. Emit current DOCTYPE token.
2794 Some(c @ '>') => {
2795 self.append_raw_to_doctype_token(c);
2796 self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
2797 self.state = State::Data;
2798 self.emit_doctype_token();
2799 }
2800 // EOF
2801 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2802 // character.
2803 None => {
2804 self.emit_error(ErrorKind::EofInDoctype);
2805 self.state = State::Data;
2806 self.emit_doctype_token();
2807 self.reconsume();
2808 }
2809 // Anything else
2810 // Append the current input character to the current DOCTYPE token’s public
2811 // identifier.
2812 Some(c) => {
2813 self.validate_input_stream_character(c);
2814 self.append_raw_to_doctype_token(c);
2815 self.append_to_doctype_token(None, Some(c), None);
2816 }
2817 }
2818 }
2819 State::AfterDoctypePublicIdentifier => {
2820 // Consume the next input character:
2821 match self.consume_next_char() {
2822 // U+0009 CHARACTER TABULATION (tab)
2823 // U+000A LINE FEED (LF)
2824 // U+000C FORM FEED (FF)
2825 // U+0020 SPACE
2826 // Switch to the between DOCTYPE public and system identifiers state.
2827 Some(c) if is_whitespace(c) => {
2828 self.append_raw_to_doctype_token(c);
2829 self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
2830 }
2831 // U+0027 APOSTROPHE (')
2832 // Parse error. Set the DOCTYPE token’s system identifier to the empty string
2833 // (not missing) then switch to the DOCTYPE system identifier (single-quoted)
2834 // state.
2835 Some(c @ '\'') => {
2836 self.append_raw_to_doctype_token(c);
2837 self.emit_error(
2838 ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
2839 );
2840 self.set_doctype_token_system_id();
2841 self.state = State::DoctypeSystemIdentifierSingleQuoted;
2842 }
2843 // U+0022 QUOTATION MARK (")
2844 // Parse error. Set the DOCTYPE token’s system identifier to the empty string
2845 // (not missing) then switch to the DOCTYPE system identifier (double-quoted)
2846 // state.
2847 Some(c @ '"') => {
2848 self.append_raw_to_doctype_token(c);
2849 self.emit_error(
2850 ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
2851 );
2852 self.set_doctype_token_system_id();
2853 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2854 }
2855 // U+003E GREATER-THAN SIGN (>)
2856 // Switch to the data state. Emit the current DOCTYPE token.
2857 Some(c @ '>') => {
2858 self.append_raw_to_doctype_token(c);
2859 self.state = State::Data;
2860 self.emit_doctype_token();
2861 }
2862 // EOF
2863 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2864 // character.
2865 None => {
2866 self.emit_error(ErrorKind::EofInDoctype);
2867 self.state = State::Data;
2868 self.emit_doctype_token();
2869 self.reconsume();
2870 }
2871 // Anything else
2872 // Parse error. Switch to bogus DOCTYPE state.
2873 Some(c) => {
2874 self.validate_input_stream_character(c);
2875 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2876 self.state = State::BogusComment;
2877 }
2878 }
2879 }
2880 State::BetweenDoctypePublicAndSystemIdentifiers => {
2881 // Consume the next input character:
2882 match self.consume_next_char() {
2883 // U+0009 CHARACTER TABULATION (tab)
2884 // U+000A LINE FEED (LF)
2885 // U+000C FORM FEED (FF)
2886 // U+0020 SPACE
2887 // Ignore the character.
2888 Some(c) if is_whitespace(c) => {
2889 self.append_raw_to_doctype_token(c);
2890 }
2891 // U+003E GREATER-THAN SIGN (>)
2892 // Switch to the data state. Emit the current DOCTYPE token.
2893 Some(c @ '>') => {
2894 self.append_raw_to_doctype_token(c);
2895 self.state = State::Data;
2896 self.emit_doctype_token();
2897 }
2898 // U+0027 APOSTROPHE(')
2899 // Set the DOCTYPE token’s system identifier to the empty string (not missing)
2900 // then switch to the DOCTYPE system identifier (single-quoted) state.
2901 Some(c @ '\'') => {
2902 self.append_raw_to_doctype_token(c);
2903 self.set_doctype_token_system_id();
2904 self.state = State::DoctypeSystemIdentifierSingleQuoted;
2905 }
2906 // U+0022 QUOTATION MARK(")
2907 // Set the DOCTYPE token’s system identifier to the empty string (not missing)
2908 // then switch to the DOCTYPE system identifier (double-quoted) state.
2909 Some(c @ '"') => {
2910 self.append_raw_to_doctype_token(c);
2911 self.set_doctype_token_system_id();
2912 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
2913 }
2914 // EOF
2915 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
2916 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
2917 // end-of-file token.
2918 None => {
2919 self.emit_error(ErrorKind::EofInDoctype);
2920 self.state = State::Data;
2921 self.emit_doctype_token();
2922 self.reconsume();
2923 }
2924 // Anything else
2925 // Parse error. Switch to Bogus DOCTYPE state.
2926 Some(c) => {
2927 self.validate_input_stream_character(c);
2928 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
2929 self.state = State::BogusDoctype;
2930 }
2931 }
2932 }
2933 State::DoctypeSystemIdentifierSingleQuoted => {
2934 // Consume the next input character:
2935 match self.consume_next_char() {
2936 // U+0027 APOSTROPHE (')
2937 // Switch to the after DOCTYPE system identifier state.
2938 Some(c @ '\'') => {
2939 self.append_raw_to_doctype_token(c);
2940 self.state = State::AfterDoctypeSystemIdentifier;
2941 }
2942 // U+003E GREATER-THAN SIGN (>)
2943 // Parse error. Switch to data state. Emit current DOCTYPE token.
2944 Some(c @ '>') => {
2945 self.append_raw_to_doctype_token(c);
2946 self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
2947 self.state = State::Data;
2948 self.emit_doctype_token();
2949 }
2950 // EOF
2951 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2952 // character.
2953 None => {
2954 self.emit_error(ErrorKind::EofInDoctype);
2955 self.state = State::Data;
2956 self.emit_doctype_token();
2957 self.reconsume();
2958 }
2959 // Anything else
2960 // Append the current input character to the current DOCTYPE token's system
2961 // identifier.
2962 Some(c) => {
2963 self.validate_input_stream_character(c);
2964 self.append_raw_to_doctype_token(c);
2965 self.append_to_doctype_token(None, None, Some(c));
2966 }
2967 }
2968 }
2969 State::DoctypeSystemIdentifierDoubleQuoted => {
2970 // Consume the next input character:
2971 match self.consume_next_char() {
                    // U+0022 QUOTATION MARK (")
2973 // Switch to the after DOCTYPE system identifier state.
2974 Some(c @ '"') => {
2975 self.append_raw_to_doctype_token(c);
2976 self.state = State::AfterDoctypeSystemIdentifier;
2977 }
2978 // U+003E GREATER-THAN SIGN (>)
2979 // Parse error. Switch to data state. Emit current DOCTYPE token.
2980 Some(c @ '>') => {
2981 self.append_raw_to_doctype_token(c);
2982 self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
2983 self.state = State::Data;
2984 self.emit_doctype_token();
2985 }
2986 // EOF
2987 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
2988 // character.
2989 None => {
2990 self.emit_error(ErrorKind::EofInDoctype);
2991 self.state = State::Data;
2992 self.emit_doctype_token();
2993 self.reconsume();
2994 }
2995 // Anything else
2996 // Append the current input character to the current DOCTYPE token's system
2997 // identifier.
2998 Some(c) => {
2999 self.validate_input_stream_character(c);
3000 self.append_raw_to_doctype_token(c);
3001 self.append_to_doctype_token(None, None, Some(c));
3002 }
3003 }
3004 }
3005 State::AfterDoctypeSystemIdentifier => {
3006 // Consume the next input character:
3007 match self.consume_next_char() {
3008 // U+0009 CHARACTER TABULATION (tab)
3009 // U+000A LINE FEED (LF)
3010 // U+000C FORM FEED (FF)
3011 // U+0020 SPACE
3012 // Ignore the character.
3013 Some(c) if is_whitespace(c) => {
3014 self.append_raw_to_doctype_token(c);
3015 }
3016 // U+003E GREATER-THAN SIGN (>)
3017 // Switch to the data state. Emit the current DOCTYPE token.
3018 Some(c @ '>') => {
3019 self.append_raw_to_doctype_token(c);
3020 self.state = State::Data;
3021 self.emit_doctype_token();
3022 }
3023 // U+005B LEFT SQUARE BRACKET ([)
3024 // Switch to the doctype internal subset state.
3025 Some(c @ '[') => {
3026 self.append_raw_to_doctype_token(c);
3027 self.state = State::DoctypeTypeInternalSubSet;
3028 }
3029 // EOF
3030 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
3031 // character.
3032 None => {
3033 self.emit_error(ErrorKind::EofInDoctype);
3034 self.state = State::Data;
3035 self.emit_doctype_token();
3036 self.reconsume();
3037 }
3038 // Anything else
3039 // Parse error. Switch to Bogus DOCTYPE state.
3040 Some(c) => {
3041 self.validate_input_stream_character(c);
3042 self.emit_error(ErrorKind::UnexpectedCharacterAfterDoctypeSystemIdentifier);
3043 self.state = State::BogusDoctype;
3044 }
3045 }
3046 }
3047 State::DoctypeTypeInternalSubSet => {
3048 // Consume the next input character:
3049 match self.consume_next_char() {
3050 // U+005D RIGHT SQUARE BRACKET (])
                    // Switch to the after DOCTYPE name state.
3052 Some(c @ ']') => {
3053 self.append_raw_to_doctype_token(c);
3054 self.state = State::AfterDoctypeName;
3055 }
3056 // EOF
3057 // Parse error. Switch to the data state. Emit DOCTYPE token. Reconsume the EOF
3058 // character.
3059 None => {
3060 self.emit_error(ErrorKind::EofInDoctype);
3061 self.state = State::Data;
3062 self.emit_doctype_token();
3063 self.reconsume();
3064 }
3065 // Anything else
                    // Append the current input character to the current DOCTYPE token's raw
                    // text only; it is not added to the name or to either identifier.
3068 Some(c) => {
3069 // TODO improve parse legacy declarations
3070 self.validate_input_stream_character(c);
3071 self.append_raw_to_doctype_token(c);
3072 }
3073 }
3074 }
3075 State::BogusDoctype => {
3076 // Consume the next input character:
3077 match self.consume_next_char() {
3078 // U+003E GREATER-THAN SIGN(>)
3079 // Switch to data state. Emit DOCTYPE token.
3080 Some(c @ '>') => {
3081 self.append_raw_to_doctype_token(c);
3082 self.state = State::Data;
3083 self.emit_doctype_token();
3084 }
3085 // EOF
3086 // Switch to the data state. Emit DOCTYPE token. Reconsume the EOF character.
3087 None => {
3088 self.state = State::Data;
3089 self.emit_doctype_token();
3090 self.reconsume();
3091 }
3092 // Anything else
3093 // Ignore the character.
3094 Some(c) => {
3095 self.validate_input_stream_character(c);
3096 self.append_raw_to_doctype_token(c);
3097 }
3098 }
3099 }
3100 }
3101
3102 Ok(())
3103 }
3104
3105 #[inline(always)]
3106 fn skip_next_lf(&mut self, c: char) {
3107 if c == '\r' && self.input.cur() == Some('\n') {
3108 unsafe {
3109 // Safety: cur() is Some('\n')
3110 self.input.bump();
3111 }
3112 }
3113 }
3114}
3115
/// Reports whether `c` is one of the characters of the XML `S` production:
///
/// ```text
/// S ::= (#x20 | #x9 | #xD | #xA)+
/// ```
///
/// That is: space, tab, carriage return or line feed. Form feed (U+000C) is
/// not part of the set.
#[inline(always)]
fn is_whitespace(c: char) -> bool {
    const XML_WHITESPACE: [char; 4] = [' ', '\t', '\r', '\n'];

    XML_WHITESPACE.contains(&c)
}
3123
/// Returns `true` when the code point `c` lies in U+0000..=U+001F or
/// U+007F..=U+009F, with the exception of tab (U+0009), line feed (U+000A),
/// form feed (U+000C) and carriage return (U+000D), which are never reported
/// as controls.
#[inline(always)]
fn is_control(c: u32) -> bool {
    match c {
        // Exempted characters come first so they win over the range below.
        // (U+0020 needs no exemption: it is outside both control ranges.)
        0x09 | 0x0a | 0x0c | 0x0d => false,
        0x00..=0x1f | 0x7f..=0x9f => true,
        _ => false,
    }
}
3128
/// Returns `true` when the code point `c` falls inside the UTF-16 surrogate
/// block U+D800..=U+DFFF.
#[inline(always)]
fn is_surrogate(c: u32) -> bool {
    (0xd800..=0xdfff).contains(&c)
}
3133
3134// A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
3135// inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE,
3136// U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
3137// U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE,
3138// U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE,
3139// U+FFFFF, U+10FFFE, or U+10FFFF.
/// Returns `true` when the code point `c` is a Unicode noncharacter: either
/// inside U+FDD0..=U+FDEF, or one of the last two code points (`..FFFE`,
/// `..FFFF`) of any plane from 0 through 0x10.
#[inline(always)]
fn is_noncharacter(c: u32) -> bool {
    // Contiguous block in the Arabic Presentation Forms-A area.
    if (0xfdd0..=0xfdef).contains(&c) {
        return true;
    }

    // The remaining noncharacters are exactly the code points whose low 16
    // bits are 0xFFFE or 0xFFFF; masking with 0xFFFE ignores the lowest bit.
    // The upper bound keeps values beyond U+10FFFF out.
    c <= 0x10ffff && (c & 0xfffe) == 0xfffe
}
3182
/// Returns `true` when `c` is an ASCII upper-case letter (`A` through `Z`).
#[inline(always)]
fn is_ascii_upper_alpha(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
3187
/// Returns `true` when `c` is a hexadecimal digit written with upper-case
/// letters: `0`-`9` or `A`-`F`. Lower-case `a`-`f` are rejected.
#[inline(always)]
fn is_upper_hex_digit(c: char) -> bool {
    c.is_ascii_digit() || ('A'..='F').contains(&c)
}
3192
/// Returns `true` when `c` is a hexadecimal digit written with lower-case
/// letters: `0`-`9` or `a`-`f`. Upper-case `A`-`F` are rejected.
#[inline(always)]
fn is_lower_hex_digit(c: char) -> bool {
    c.is_ascii_digit() || ('a'..='f').contains(&c)
}
3197
3198// NameStartChar ::=
3199// ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] |
3200// [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] |
3201// [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] |
3202// [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
/// Returns `true` when `c` may begin an XML name, i.e. it matches the
/// `NameStartChar` production of the XML grammar.
///
/// The Latin-1 letter ranges are deliberately split around U+00D7 (×) and
/// U+00F7 (÷): both are mathematical operators, not letters, and the grammar
/// excludes them from names.
#[inline(always)]
fn is_name_start_char(c: char) -> bool {
    matches!(
        c,
        ':' | 'A'..='Z' | '_' | 'a'..='z'
            | '\u{c0}'..='\u{d6}'
            // U+00D7 MULTIPLICATION SIGN is excluded.
            | '\u{d8}'..='\u{f6}'
            // U+00F7 DIVISION SIGN is excluded.
            | '\u{f8}'..='\u{2ff}'
            | '\u{370}'..='\u{37d}'
            | '\u{37f}'..='\u{1fff}'
            | '\u{200c}'..='\u{200d}'
            | '\u{2070}'..='\u{218f}'
            | '\u{2c00}'..='\u{2fef}'
            | '\u{3001}'..='\u{d7ff}'
            | '\u{f900}'..='\u{fdcf}'
            | '\u{fdf0}'..='\u{fffd}'
            | '\u{10000}'..='\u{effff}'
    )
}
3213
3214// NameChar ::=
3215// NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] |
3216// [#x203F-#x2040]
3217#[inline(always)]
3218fn is_name_char(c: char) -> bool {
3219 match c {
3220 '-' | '.' | '0'..='9' => true,
3221 _ if matches!(c as u32, 0xb7 | 0x0300..=0x036f | 0x203f..=0x2040) => true,
3222 _ if is_name_start_char(c) => true,
3223 _ => false,
3224 }
3225}