swc_html_parser/lexer/mod.rs
1use std::{cell::RefCell, char::REPLACEMENT_CHARACTER, collections::VecDeque, mem::take, rc::Rc};
2
3use rustc_hash::FxHashSet;
4use swc_atoms::{atom, Atom};
5use swc_common::{input::Input, BytePos, Span};
6use swc_html_ast::{AttributeToken, Raw, Token, TokenAndSpan};
7use swc_html_utils::{Entity, HTML_ENTITIES};
8
9use crate::{
10 error::{Error, ErrorKind},
11 parser::input::ParserInput,
12};
13
/// Tokenizer states of the lexer's state machine.
///
/// `Lexer::run` dispatches on the current state; the states handled there
/// link to the matching sections of the WHATWG HTML parsing specification
/// (for example `#data-state`, `#tag-open-state`), and the variant names
/// follow that naming scheme.
///
/// Note: the `Hexademical*` variants are spelled this way in the public API;
/// the spelling is kept so existing users of the enum keep compiling.
#[derive(Debug, Clone)]
pub enum State {
    // Content states.
    Data,
    Rcdata,
    Rawtext,
    ScriptData,
    PlainText,
    // Tag states.
    TagOpen,
    EndTagOpen,
    TagName,
    // RCDATA / RAWTEXT end-tag detection.
    RcdataLessThanSign,
    RcdataEndTagOpen,
    RcdataEndTagName,
    RawtextLessThanSign,
    RawtextEndTagOpen,
    RawtextEndTagName,
    // Script data states.
    ScriptDataLessThanSign,
    ScriptDataEndTagOpen,
    ScriptDataEndTagName,
    ScriptDataEscapeStart,
    ScriptDataEscapeStartDash,
    ScriptDataEscaped,
    ScriptDataEscapedDash,
    ScriptDataEscapedDashDash,
    ScriptDataEscapedLessThanSign,
    ScriptDataEscapedEndTagOpen,
    ScriptDataEscapedEndTagName,
    ScriptDataDoubleEscapeStart,
    ScriptDataDoubleEscaped,
    ScriptDataDoubleEscapedDash,
    ScriptDataDoubleEscapedDashDash,
    ScriptDataDoubleEscapedLessThanSign,
    ScriptDataDoubleEscapeEnd,
    // Attribute states.
    BeforeAttributeName,
    AttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    AttributeValueDoubleQuoted,
    AttributeValueSingleQuoted,
    AttributeValueUnquoted,
    AfterAttributeValueQuoted,
    SelfClosingStartTag,
    // Comment states.
    BogusComment,
    MarkupDeclarationOpen,
    CommentStart,
    CommentStartDash,
    Comment,
    CommentLessThanSign,
    CommentLessThanSignBang,
    CommentLessThanSignBangDash,
    CommentLessThanSignBangDashDash,
    CommentEndDash,
    CommentEnd,
    CommentEndBang,
    // DOCTYPE states.
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypePublicKeyword,
    BeforeDoctypePublicIdentifier,
    DoctypePublicIdentifierDoubleQuoted,
    DoctypePublicIdentifierSingleQuoted,
    AfterDoctypePublicIdentifier,
    BetweenDoctypePublicAndSystemIdentifiers,
    AfterDoctypeSystemKeyword,
    BeforeDoctypeSystemIdentifier,
    DoctypeSystemIdentifierDoubleQuoted,
    DoctypeSystemIdentifierSingleQuoted,
    AfterDoctypeSystemIdentifier,
    BogusDoctype,
    // CDATA section states.
    CdataSection,
    CdataSectionBracket,
    CdataSectionEnd,
    // Character reference states.
    CharacterReference,
    NamedCharacterReference,
    AmbiguousAmpersand,
    NumericCharacterReference,
    HexademicalCharacterReferenceStart,
    DecimalCharacterReferenceStart,
    HexademicalCharacterReference,
    DecimalCharacterReference,
    NumericCharacterReferenceEnd,
}
97
/// Result type used by the lexer internals; the error side is a bare
/// [`ErrorKind`] (for example `ErrorKind::Eof` once the input is exhausted).
pub(crate) type LexResult<T> = Result<T, ErrorKind>;
99
/// HTML tokenizer driven by the [`State`] machine.
///
/// Tokens are produced into `pending_tokens` and handed out one at a time
/// through the `Iterator` implementation.
pub struct Lexer<'a, I>
where
    I: Input<'a>,
{
    // Character source being tokenized.
    input: I,
    // Character captured by the most recent `consume` call (`None` at end of input).
    cur: Option<char>,
    // Input position recorded by `consume` just before it advanced.
    cur_pos: BytePos,
    // End position of the most recently emitted token; the next token's span starts here.
    last_token_pos: BytePos,
    // Set once the end-of-file token has been dequeued.
    finished: bool,
    // State that `run` dispatches on.
    state: State,
    // State recorded before switching to the character reference state.
    return_state: State,
    // Parse errors collected so far; drained by `take_errors`.
    errors: Vec<Error>,
    // Name of the last emitted start tag (also settable by the parser).
    last_start_tag_name: Option<Atom>,
    // Tokens emitted but not yet handed out.
    pending_tokens: VecDeque<TokenAndSpan>,
    // Scratch buffer for the processed value being built (names, values, comment data).
    buf: Rc<RefCell<String>>,
    // Scratch buffer for the corresponding raw source text.
    sub_buf: Rc<RefCell<String>>,
    // Doctype or tag token currently under construction.
    current_token: Option<Token>,
    // Attribute names already seen on the current tag, for duplicate detection.
    attributes_validator: FxHashSet<Atom>,
    // Position where the attribute currently being built started.
    attribute_start_position: Option<BytePos>,
    // NOTE(review): not used in the visible part of the file; presumably
    // accumulates numeric character reference digits — confirm.
    character_reference_code: Option<Vec<(u8, u32, Option<char>)>>,
    // Temporary buffer whose contents are later flushed as characters or
    // appended to an attribute value.
    temporary_buffer: String,
    // Value supplied by the parser through
    // `set_adjusted_current_node_to_html_namespace`.
    is_adjusted_current_node_is_element_in_html_namespace: Option<bool>,
    // Ties the otherwise unused lifetime `'a` to the struct.
    phantom: std::marker::PhantomData<&'a ()>,
}
124
125impl<'a, I> Lexer<'a, I>
126where
127 I: Input<'a>,
128{
129 pub fn new(input: I) -> Self {
130 let start_pos = input.last_pos();
131
132 let mut lexer = Lexer {
133 input,
134 cur: None,
135 cur_pos: start_pos,
136 last_token_pos: start_pos,
137 finished: false,
138 state: State::Data,
139 return_state: State::Data,
140 errors: Vec::new(),
141 last_start_tag_name: None,
142 pending_tokens: VecDeque::with_capacity(16),
143 buf: Rc::new(RefCell::new(String::with_capacity(256))),
144 sub_buf: Rc::new(RefCell::new(String::with_capacity(256))),
145 current_token: None,
146 attributes_validator: Default::default(),
147 attribute_start_position: None,
148 character_reference_code: None,
149 // Do this without a new allocation.
150 temporary_buffer: String::with_capacity(33),
151 is_adjusted_current_node_is_element_in_html_namespace: None,
152 phantom: std::marker::PhantomData,
153 };
154
155 // A leading Byte Order Mark (BOM) causes the character encoding argument to be
156 // ignored and will itself be skipped.
157 if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') {
158 unsafe {
159 // Safety: We know that the current character is '\u{feff}'.
160 lexer.input.bump();
161 }
162 }
163
164 lexer
165 }
166}
167
168impl<'a, I: Input<'a>> Iterator for Lexer<'a, I> {
169 type Item = TokenAndSpan;
170
171 fn next(&mut self) -> Option<Self::Item> {
172 let token_and_span = self.read_token_and_span();
173
174 match token_and_span {
175 Ok(token_and_span) => {
176 return Some(token_and_span);
177 }
178 Err(..) => {
179 return None;
180 }
181 }
182 }
183}
184
/// Hooks the parser uses to query positions, collect errors and feed
/// tree-construction information back into the tokenizer.
impl<'a, I> ParserInput for Lexer<'a, I>
where
    I: Input<'a>,
{
    /// Returns the input's current position.
    fn start_pos(&mut self) -> BytePos {
        self.input.cur_pos()
    }

    /// Returns the input's last position.
    fn last_pos(&mut self) -> BytePos {
        self.input.last_pos()
    }

    /// Hands over all errors collected so far, leaving the internal list empty.
    fn take_errors(&mut self) -> Vec<Error> {
        take(&mut self.errors)
    }

    /// Overrides the remembered name of the last start tag, which is what
    /// end tags are compared against when checking for an appropriate end tag.
    fn set_last_start_tag_name(&mut self, tag_name: &Atom) {
        self.last_start_tag_name = Some(tag_name.clone());
    }

    /// Records whether the adjusted current node is an element in the HTML
    /// namespace, as reported by the parser.
    fn set_adjusted_current_node_to_html_namespace(&mut self, value: bool) {
        self.is_adjusted_current_node_is_element_in_html_namespace = Some(value);
    }

    /// Forces the tokenizer into `state`.
    fn set_input_state(&mut self, state: State) {
        self.state = state;
    }
}
213
214impl<'a, I> Lexer<'a, I>
215where
216 I: Input<'a>,
217{
/// Peeks at the next input character without consuming it.
#[inline(always)]
fn next(&mut self) -> Option<char> {
    self.input.cur()
}
222
223 // Any occurrences of surrogates are surrogate-in-input-stream parse errors. Any
224 // occurrences of noncharacters are noncharacter-in-input-stream parse errors
225 // and any occurrences of controls other than ASCII whitespace and U+0000 NULL
226 // characters are control-character-in-input-stream parse errors.
227 //
228 // Postpone validation for each character for perf reasons and do it in
229 // `anything else`
230 #[inline(always)]
231 fn validate_input_stream_character(&mut self, c: char) {
232 let code = c as u32;
233
234 if is_surrogate(code) {
235 self.emit_error(ErrorKind::SurrogateInInputStream);
236 } else if is_allowed_control_character(code) {
237 self.emit_error(ErrorKind::ControlCharacterInInputStream);
238 } else if is_noncharacter(code) {
239 self.emit_error(ErrorKind::NoncharacterInInputStream);
240 }
241 }
242
243 #[inline(always)]
244 fn consume(&mut self) {
245 self.cur = self.input.cur();
246 self.cur_pos = self.input.cur_pos();
247
248 if self.cur.is_some() {
249 unsafe {
250 // Safety: self.cur is Some()
251 self.input.bump();
252 }
253 }
254 }
255
/// Rewinds the input to `cur_pos`, i.e. to where the current character was
/// read from, so that it will be consumed again.
#[inline(always)]
fn reconsume(&mut self) {
    unsafe {
        // Safety: self.cur_pos is valid position because we got it from self.input
        self.input.reset_to(self.cur_pos);
    }
}
263
/// Switches to `state` and rewinds the input so the current character is
/// consumed again in that state.
#[inline(always)]
fn reconsume_in_state(&mut self, state: State) {
    self.state = state;
    self.reconsume();
}
269
270 #[inline(always)]
271 fn consume_next_char(&mut self) -> Option<char> {
272 // The next input character is the first character in the input stream that has
273 // not yet been consumed or explicitly ignored by the requirements in this
274 // section. Initially, the next input character is the first character in the
275 // input. The current input character is the last character to have been
276 // consumed.
277 let c = self.next();
278
279 self.consume();
280
281 c
282 }
283
284 #[cold]
285 fn emit_error(&mut self, kind: ErrorKind) {
286 self.errors.push(Error::new(
287 Span::new(self.cur_pos, self.input.cur_pos()),
288 kind,
289 ));
290 }
291
292 #[inline(always)]
293 fn emit_token(&mut self, token: Token) {
294 let cur_pos = self.input.cur_pos();
295
296 let span = Span::new(self.last_token_pos, cur_pos);
297
298 self.last_token_pos = cur_pos;
299 self.pending_tokens.push_back(TokenAndSpan { span, token });
300 }
301
/// Returns `true` when the return state is one of the three attribute value
/// states, i.e. the character reference being processed belongs to an
/// attribute value.
#[inline(always)]
fn is_consumed_as_part_of_an_attribute(&mut self) -> bool {
    matches!(
        self.return_state,
        State::AttributeValueSingleQuoted
            | State::AttributeValueDoubleQuoted
            | State::AttributeValueUnquoted
    )
}
311
312 // An appropriate end tag token is an end tag token whose tag name matches the
313 // tag name of the last start tag to have been emitted from this tokenizer, if
314 // any. If no start tag has been emitted from this tokenizer, then no end tag
315 // token is appropriate.
316 #[inline(always)]
317 fn current_end_tag_token_is_an_appropriate_end_tag_token(&mut self) -> bool {
318 if let Some(last_start_tag_name) = &self.last_start_tag_name {
319 let b = self.buf.clone();
320 let buf = b.borrow();
321
322 return *last_start_tag_name == *buf;
323 }
324
325 false
326 }
327
328 #[inline(always)]
329 fn emit_temporary_buffer_as_character_tokens(&mut self) {
330 for c in take(&mut self.temporary_buffer).chars() {
331 self.emit_token(Token::Character {
332 value: c,
333 raw: Some(Raw::Same),
334 });
335 }
336 }
337
338 fn flush_code_points_consumed_as_character_reference(&mut self, raw: Option<String>) {
339 if self.is_consumed_as_part_of_an_attribute() {
340 let b = self.buf.clone();
341 let mut buf = b.borrow_mut();
342 let b = self.sub_buf.clone();
343 let mut sub_buf = b.borrow_mut();
344
345 // When the length of raw is more than the length of temporary buffer we emit a
346 // raw character in the first character token
347 let mut once_raw = raw;
348 let mut once_emitted = false;
349
350 for c in take(&mut self.temporary_buffer).chars() {
351 buf.push(c);
352
353 let raw = match once_raw {
354 Some(_) => {
355 once_emitted = true;
356 once_raw.take()
357 }
358 _ => {
359 if once_emitted {
360 None
361 } else {
362 Some(String::from(c))
363 }
364 }
365 };
366
367 if let Some(raw) = raw {
368 sub_buf.push_str(&raw);
369 }
370 }
371 } else {
372 // When the length of raw is more than the length of temporary buffer we emit a
373 // raw character in the first character token
374 let mut once_raw = raw;
375
376 let is_value_eq_raw = if let Some(raw) = &once_raw {
377 *raw == self.temporary_buffer
378 } else {
379 true
380 };
381
382 for c in take(&mut self.temporary_buffer).chars() {
383 self.emit_token(Token::Character {
384 value: c,
385 raw: if is_value_eq_raw {
386 Some(Raw::Same)
387 } else {
388 once_raw.take().map(|x| Raw::Atom(Atom::new(x)))
389 },
390 });
391 }
392 }
393 }
394
/// Starts a new DOCTYPE token with every field unset and the force-quirks
/// flag off, replacing any token that was under construction.
#[inline(always)]
fn create_doctype_token(&mut self) {
    self.current_token = Some(Token::Doctype {
        name: None,
        force_quirks: false,
        public_id: None,
        system_id: None,
        raw: None,
    });
}
405
406 fn append_raw_to_doctype_token(&mut self, c: char) {
407 let b = self.sub_buf.clone();
408 let mut sub_buf = b.borrow_mut();
409
410 let is_cr = c == '\r';
411
412 if is_cr {
413 sub_buf.push(c);
414
415 if self.input.cur() == Some('\n') {
416 unsafe {
417 // Safety: cur() is Some('\n')
418 self.input.bump();
419 }
420
421 sub_buf.push('\n');
422 }
423 } else {
424 sub_buf.push(c);
425 }
426 }
427
428 fn append_to_doctype_token(
429 &mut self,
430 name: Option<char>,
431 public_id: Option<char>,
432 system_id: Option<char>,
433 ) {
434 let b = self.buf.clone();
435 let mut buf = b.borrow_mut();
436
437 if let Some(name) = name {
438 buf.push(name);
439 }
440
441 if let Some(public_id) = public_id {
442 buf.push(public_id);
443 }
444
445 if let Some(system_id) = system_id {
446 buf.push(system_id);
447 }
448 }
449
450 fn consume_and_append_to_doctype_token_name<F>(&mut self, c: char, f: F)
451 where
452 F: Fn(char) -> bool,
453 {
454 let b = self.buf.clone();
455 let mut buf = b.borrow_mut();
456 let b = self.sub_buf.clone();
457 let mut sub_buf = b.borrow_mut();
458
459 buf.push(c.to_ascii_lowercase());
460 sub_buf.push(c);
461
462 let value = self.input.uncons_while(f);
463
464 buf.push_str(&value.to_ascii_lowercase());
465 sub_buf.push_str(value);
466 }
467
468 fn consume_and_append_to_doctype_token_public_id<F>(&mut self, c: char, f: F)
469 where
470 F: Fn(char) -> bool,
471 {
472 let b = self.buf.clone();
473 let mut buf = b.borrow_mut();
474 let b = self.sub_buf.clone();
475 let mut sub_buf = b.borrow_mut();
476
477 let is_cr = c == '\r';
478
479 if is_cr {
480 buf.push('\n');
481 sub_buf.push(c);
482
483 if self.input.cur() == Some('\n') {
484 unsafe {
485 // Safety: cur() is Some('\n')
486 self.input.bump();
487 }
488
489 sub_buf.push('\n');
490 }
491 } else {
492 buf.push(c);
493 sub_buf.push(c);
494 }
495
496 let value = self.input.uncons_while(f);
497
498 buf.push_str(value);
499 sub_buf.push_str(value);
500 }
501
502 fn consume_and_append_to_doctype_token_system_id<F>(&mut self, c: char, f: F)
503 where
504 F: Fn(char) -> bool,
505 {
506 let b = self.buf.clone();
507 let mut buf = b.borrow_mut();
508 let b = self.sub_buf.clone();
509 let mut sub_buf = b.borrow_mut();
510
511 let is_cr = c == '\r';
512
513 if is_cr {
514 buf.push('\n');
515 sub_buf.push(c);
516
517 if self.input.cur() == Some('\n') {
518 unsafe {
519 // Safety: cur() is Some('\n')
520 self.input.bump();
521 }
522
523 sub_buf.push('\n');
524 }
525 } else {
526 buf.push(c);
527 sub_buf.push(c);
528 }
529
530 let value = self.input.uncons_while(f);
531
532 buf.push_str(value);
533 sub_buf.push_str(value);
534 }
535
/// Turns on the force-quirks flag of the DOCTYPE token under construction.
/// Does nothing if the current token is not a DOCTYPE token.
#[inline(always)]
fn set_doctype_token_force_quirks(&mut self) {
    if let Some(Token::Doctype { force_quirks, .. }) = &mut self.current_token {
        *force_quirks = true;
    }
}
542
543 #[inline(always)]
544 fn set_doctype_token_name(&mut self, c: char) {
545 let b = self.buf.clone();
546 let mut buf = b.borrow_mut();
547
548 buf.push(c);
549 }
550
/// Sets the public identifier of the DOCTYPE token under construction to
/// the empty string (as opposed to missing). Does nothing if the current
/// token is not a DOCTYPE token.
#[inline(always)]
fn set_doctype_token_public_id(&mut self) {
    if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token {
        *public_id = Some(atom!(""));
    }
}
557
/// Sets the system identifier of the DOCTYPE token under construction to
/// the empty string (as opposed to missing). Does nothing if the current
/// token is not a DOCTYPE token.
#[inline(always)]
fn set_doctype_token_system_id(&mut self) {
    if let Some(Token::Doctype { system_id, .. }) = &mut self.current_token {
        // For reference, the longest known system id is
        // `http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd`.
        *system_id = Some(atom!(""));
    }
}
565
566 fn finish_doctype_token_name(&mut self) {
567 if let Some(Token::Doctype { name, .. }) = &mut self.current_token {
568 let b = self.buf.clone();
569 let mut buf = b.borrow_mut();
570
571 *name = Some(buf.clone().into());
572
573 buf.clear();
574 }
575 }
576
577 fn finish_doctype_token_public_id(&mut self) {
578 if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token {
579 let b = self.buf.clone();
580 let mut buf = b.borrow_mut();
581
582 *public_id = Some(buf.clone().into());
583
584 buf.clear();
585 }
586 }
587
588 fn finish_doctype_token_system_id(&mut self) {
589 if let Some(Token::Doctype { system_id, .. }) = &mut self.current_token {
590 let b = self.buf.clone();
591 let mut buf = b.borrow_mut();
592
593 *system_id = Some(buf.clone().into());
594
595 buf.clear();
596 }
597 }
598
599 fn emit_doctype_token(&mut self) {
600 if let Some(mut token @ Token::Doctype { .. }) = self.current_token.take() {
601 let b = self.sub_buf.clone();
602 let mut sub_buf = b.borrow_mut();
603
604 match &mut token {
605 Token::Doctype { raw, .. } => {
606 *raw = Some(Atom::new(sub_buf.clone()));
607 }
608 _ => {
609 unreachable!();
610 }
611 }
612
613 sub_buf.clear();
614
615 self.emit_token(token);
616 }
617 }
618
/// Starts a new start tag token with an empty name, no attributes and the
/// self-closing flag off, replacing any token that was under construction.
#[inline(always)]
fn create_start_tag_token(&mut self) {
    self.current_token = Some(Token::StartTag {
        // For reference, the longest known tag name is
        // `feComponentTransfer` (SVG).
        tag_name: atom!(""),
        raw_tag_name: None,
        is_self_closing: false,
        attributes: Vec::new(),
    });
}
629
/// Starts a new end tag token with an empty name, no attributes and the
/// self-closing flag off, replacing any token that was under construction.
#[inline(always)]
fn create_end_tag_token(&mut self) {
    self.current_token = Some(Token::EndTag {
        // For reference, the longest known tag name is
        // `feComponentTransfer` (SVG).
        tag_name: atom!(""),
        raw_tag_name: None,
        is_self_closing: false,
        // In valid HTML end tags have no attributes; any that do appear are
        // still collected here so they can be reported when the tag is emitted.
        attributes: Vec::new(),
    });
}
641
642 fn append_to_tag_token_name(&mut self, c: char, raw_c: char) {
643 if let Some(Token::StartTag { .. } | Token::EndTag { .. }) = &mut self.current_token {
644 let b = self.buf.clone();
645 let mut buf = b.borrow_mut();
646 let b = self.sub_buf.clone();
647 let mut sub_buf = b.borrow_mut();
648
649 buf.push(c);
650 sub_buf.push(raw_c);
651 }
652 }
653
654 fn consume_and_append_to_tag_token_name<F>(&mut self, c: char, f: F)
655 where
656 F: Fn(char) -> bool,
657 {
658 let b = self.buf.clone();
659 let mut buf = b.borrow_mut();
660 let b = self.sub_buf.clone();
661 let mut sub_buf = b.borrow_mut();
662
663 buf.push(c.to_ascii_lowercase());
664 sub_buf.push(c);
665
666 let value = self.input.uncons_while(f);
667
668 buf.push_str(&value.to_ascii_lowercase());
669 sub_buf.push_str(value);
670 }
671
672 fn finish_tag_token_name(&mut self) {
673 if let Some(
674 Token::StartTag {
675 tag_name,
676 raw_tag_name,
677 ..
678 }
679 | Token::EndTag {
680 tag_name,
681 raw_tag_name,
682 ..
683 },
684 ) = &mut self.current_token
685 {
686 let b = self.buf.clone();
687 let mut buf = b.borrow_mut();
688 let b = self.sub_buf.clone();
689 let mut sub_buf = b.borrow_mut();
690
691 *tag_name = buf.clone().into();
692 *raw_tag_name = Some(Atom::new(sub_buf.clone()));
693
694 buf.clear();
695 sub_buf.clear();
696 }
697 }
698
699 fn start_new_attribute_token(&mut self) {
700 if let Some(Token::StartTag { attributes, .. } | Token::EndTag { attributes, .. }) =
701 &mut self.current_token
702 {
703 attributes.push(AttributeToken {
704 span: Default::default(),
705 name: atom!(""),
706 raw_name: None,
707 value: None,
708 raw_value: None,
709 });
710
711 self.attribute_start_position = Some(self.cur_pos);
712 }
713 }
714
715 fn append_to_attribute_token_name(&mut self, c: char, raw_c: char) {
716 let b = self.buf.clone();
717 let mut buf = b.borrow_mut();
718 let b = self.sub_buf.clone();
719 let mut sub_buf = b.borrow_mut();
720
721 buf.push(c);
722 sub_buf.push(raw_c);
723 }
724
725 fn consume_and_append_to_attribute_token_name<F>(&mut self, c: char, f: F)
726 where
727 F: FnMut(char) -> bool,
728 {
729 let b = self.buf.clone();
730 let mut buf = b.borrow_mut();
731 let b = self.sub_buf.clone();
732 let mut sub_buf = b.borrow_mut();
733
734 buf.push(c.to_ascii_lowercase());
735 sub_buf.push(c);
736
737 let value = self.input.uncons_while(f);
738
739 buf.push_str(&value.to_ascii_lowercase());
740 sub_buf.push_str(value);
741 }
742
743 fn consume_and_append_to_attribute_token_name_and_temp_buf<F>(&mut self, c: char, f: F)
744 where
745 F: FnMut(char) -> bool,
746 {
747 let b = self.buf.clone();
748 let mut buf = b.borrow_mut();
749 let b = self.sub_buf.clone();
750 let mut sub_buf = b.borrow_mut();
751
752 buf.push(c.to_ascii_lowercase());
753 sub_buf.push(c);
754
755 self.temporary_buffer.push(c);
756
757 let value = self.input.uncons_while(f);
758
759 buf.push_str(&value.to_ascii_lowercase());
760 sub_buf.push_str(value);
761
762 self.temporary_buffer.push_str(value);
763 }
764
765 fn finish_attribute_token_name(&mut self) {
766 if let Some(attribute_start_position) = self.attribute_start_position {
767 if let Some(
768 Token::StartTag {
769 ref mut attributes, ..
770 }
771 | Token::EndTag {
772 ref mut attributes, ..
773 },
774 ) = self.current_token
775 {
776 if let Some(last) = attributes.last_mut() {
777 let b = self.buf.clone();
778 let mut buf = b.borrow_mut();
779 let b = self.sub_buf.clone();
780 let mut sub_buf = b.borrow_mut();
781
782 let name: Atom = buf.clone().into();
783 let raw_name = Atom::new(sub_buf.clone());
784 let span = Span::new(attribute_start_position, self.cur_pos);
785
786 if self.attributes_validator.contains(&name) {
787 self.errors
788 .push(Error::new(span, ErrorKind::DuplicateAttribute));
789 }
790
791 self.attributes_validator.insert(name.clone());
792
793 last.name = name;
794 last.raw_name = Some(raw_name);
795
796 buf.clear();
797 sub_buf.clear();
798
799 last.span = span;
800 }
801 }
802 }
803 }
804
805 fn append_to_attribute_token_value(&mut self, c: Option<char>, raw_c: Option<char>) {
806 let b = self.buf.clone();
807 let mut buf = b.borrow_mut();
808 let b = self.sub_buf.clone();
809 let mut sub_buf = b.borrow_mut();
810
811 let is_cr = raw_c == Some('\r');
812
813 if is_cr {
814 buf.push('\n');
815 sub_buf.push('\r');
816
817 if self.input.cur() == Some('\n') {
818 unsafe {
819 // Safety: cur() is Some('\n')
820 self.input.bump();
821 }
822
823 sub_buf.push('\n');
824 }
825 } else {
826 if let Some(c) = c {
827 buf.push(c);
828 }
829
830 if let Some(raw_c) = raw_c {
831 sub_buf.push(raw_c);
832 }
833 }
834 }
835
836 fn consume_and_append_to_attribute_token_value<F>(&mut self, c: char, f: F)
837 where
838 F: FnMut(char) -> bool,
839 {
840 let b = self.buf.clone();
841 let mut buf = b.borrow_mut();
842 let b = self.sub_buf.clone();
843 let mut sub_buf = b.borrow_mut();
844
845 let is_cr = c == '\r';
846
847 if is_cr {
848 buf.push('\n');
849 sub_buf.push(c);
850
851 if self.input.cur() == Some('\n') {
852 unsafe {
853 // Safety: cur() is Some('\n')
854 self.input.bump();
855 }
856
857 sub_buf.push('\n');
858 }
859 } else {
860 buf.push(c);
861 sub_buf.push(c);
862 }
863
864 let value = self.input.uncons_while(f);
865
866 buf.push_str(value);
867 sub_buf.push_str(value);
868 }
869
870 fn finish_attribute_token_value(&mut self) {
871 if let Some(attribute_start_position) = self.attribute_start_position {
872 if let Some(
873 Token::StartTag {
874 ref mut attributes, ..
875 }
876 | Token::EndTag {
877 ref mut attributes, ..
878 },
879 ) = self.current_token
880 {
881 if let Some(last) = attributes.last_mut() {
882 let b = self.buf.clone();
883 let mut buf = b.borrow_mut();
884 let b = self.sub_buf.clone();
885 let mut sub_buf = b.borrow_mut();
886
887 if !buf.is_empty() {
888 last.value = Some(buf.clone().into());
889 } else if !sub_buf.is_empty() {
890 last.value = Some("".into());
891 }
892
893 buf.clear();
894
895 if !sub_buf.is_empty() {
896 last.raw_value = Some(Atom::new(sub_buf.clone()));
897
898 sub_buf.clear();
899 }
900
901 last.span = Span::new(attribute_start_position, self.cur_pos);
902 }
903 }
904 }
905 }
906
907 fn emit_tag_token(&mut self) {
908 if let Some(current_tag_token) = self.current_token.take() {
909 self.attributes_validator.clear();
910
911 match ¤t_tag_token {
912 Token::StartTag { ref tag_name, .. } => {
913 self.last_start_tag_name = Some(tag_name.clone());
914 }
915 Token::EndTag {
916 ref is_self_closing,
917 ref attributes,
918 ..
919 } => {
920 if !attributes.is_empty() {
921 self.emit_error(ErrorKind::EndTagWithAttributes);
922 }
923
924 if *is_self_closing {
925 self.emit_error(ErrorKind::EndTagWithTrailingSolidus);
926 }
927 }
928 _ => {
929 unreachable!();
930 }
931 }
932
933 self.emit_token(current_tag_token);
934 }
935 }
936
937 #[inline(always)]
938 fn create_comment_token(&mut self, raw_start: &str) {
939 let b = self.sub_buf.clone();
940 let mut sub_buf = b.borrow_mut();
941
942 sub_buf.push_str(raw_start);
943 }
944
945 #[inline(always)]
946 fn create_comment_token_with_cdata(&mut self) {
947 let b = self.buf.clone();
948 let mut buf = b.borrow_mut();
949 let b = self.sub_buf.clone();
950 let mut sub_buf = b.borrow_mut();
951
952 buf.push_str("[CDATA[");
953 sub_buf.push_str("<!");
954 sub_buf.push_str("[CDATA[");
955 }
956
957 fn append_to_comment_token(&mut self, c: char, raw_c: char) {
958 let b = self.buf.clone();
959 let mut buf = b.borrow_mut();
960 let b = self.sub_buf.clone();
961 let mut sub_buf = b.borrow_mut();
962
963 buf.push(c);
964 sub_buf.push(raw_c);
965 }
966
967 fn consume_and_append_to_comment_token<F>(&mut self, c: char, f: F)
968 where
969 F: Fn(char) -> bool,
970 {
971 let b = self.buf.clone();
972 let mut buf = b.borrow_mut();
973 let b = self.sub_buf.clone();
974 let mut sub_buf = b.borrow_mut();
975
976 let is_cr = c == '\r';
977
978 if is_cr {
979 buf.push('\n');
980 sub_buf.push(c);
981
982 if self.input.cur() == Some('\n') {
983 unsafe {
984 // Safety: cur() is Some('\n')
985 self.input.bump();
986 }
987
988 sub_buf.push('\n');
989 }
990 } else {
991 buf.push(c);
992 sub_buf.push(c);
993 }
994
995 let value = self.input.uncons_while(f);
996
997 buf.push_str(value);
998 sub_buf.push_str(value);
999 }
1000
1001 fn emit_comment_token(&mut self, raw_end: Option<&str>) {
1002 let b = self.buf.clone();
1003 let mut buf = b.borrow_mut();
1004 let b = self.sub_buf.clone();
1005 let mut sub_buf = b.borrow_mut();
1006
1007 if let Some(raw_end) = raw_end {
1008 sub_buf.push_str(raw_end);
1009 }
1010
1011 self.emit_token(Token::Comment {
1012 data: buf.clone().into(),
1013 raw: Some(Atom::new(sub_buf.clone())),
1014 });
1015
1016 buf.clear();
1017 sub_buf.clear();
1018 }
1019
/// Emits a character token for `value` whose raw form is marked as
/// identical to the value (`Raw::Same`).
#[inline(always)]
fn emit_character_token(&mut self, value: char) {
    self.emit_token(Token::Character {
        value,
        raw: Some(Raw::Same),
    });
}
1027
1028 #[inline(always)]
1029 fn emit_character_token_with_raw(&mut self, c: char, raw_c: char) {
1030 let b = self.buf.clone();
1031 let mut buf = b.borrow_mut();
1032
1033 buf.push(raw_c);
1034
1035 self.emit_token(Token::Character {
1036 value: c,
1037 raw: Some(Raw::Atom(Atom::new(&**buf))),
1038 });
1039
1040 buf.clear();
1041 }
1042
1043 fn handle_raw_and_emit_character_token(&mut self, c: char) {
1044 let is_cr = c == '\r';
1045
1046 if is_cr {
1047 let b = self.buf.clone();
1048 let mut buf = b.borrow_mut();
1049
1050 buf.push(c);
1051
1052 if self.input.cur() == Some('\n') {
1053 unsafe {
1054 // Safety: cur() is Some('\n')
1055 self.input.bump();
1056 }
1057 buf.push('\n');
1058 }
1059
1060 self.emit_token(Token::Character {
1061 value: '\n',
1062 raw: Some(Raw::Atom(Atom::new(&**buf))),
1063 });
1064
1065 buf.clear();
1066 } else {
1067 self.emit_token(Token::Character {
1068 value: c,
1069 raw: Some(Raw::Same),
1070 });
1071 }
1072 }
1073
1074 fn read_token_and_span(&mut self) -> LexResult<TokenAndSpan> {
1075 if self.finished {
1076 return Err(ErrorKind::Eof);
1077 } else {
1078 while self.pending_tokens.is_empty() {
1079 self.run()?;
1080 }
1081 }
1082
1083 let token_and_span = self.pending_tokens.pop_front().unwrap();
1084
1085 match token_and_span.token {
1086 Token::Eof => {
1087 self.finished = true;
1088
1089 return Err(ErrorKind::Eof);
1090 }
1091 _ => {
1092 return Ok(token_and_span);
1093 }
1094 }
1095 }
1096
1097 fn run(&mut self) -> LexResult<()> {
1098 match self.state {
1099 // https://html.spec.whatwg.org/multipage/parsing.html#data-state
1100 State::Data => {
1101 // Consume the next input character:
1102 match self.consume_next_char() {
1103 // U+0026 AMPERSAND (&)
1104 // Set the return state to the data state. Switch to the character reference
1105 // state.
1106 Some('&') => {
1107 self.return_state = State::Data;
1108 self.state = State::CharacterReference;
1109 }
1110 // U+003C LESS-THAN SIGN (<)
1111 // Switch to the tag open state.
1112 Some('<') => {
1113 self.state = State::TagOpen;
1114 }
1115 // U+0000 NULL
1116 // This is an unexpected-null-character parse error. Emit the current input
1117 // character as a character token.
1118 Some(c @ '\x00') => {
1119 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1120 self.emit_character_token(c);
1121 }
1122 // EOF
1123 // Emit an end-of-file token.
1124 None => {
1125 self.emit_token(Token::Eof);
1126
1127 return Ok(());
1128 }
1129 // Anything else
1130 // Emit the current input character as a character token.
1131 Some(c) => {
1132 self.validate_input_stream_character(c);
1133 self.handle_raw_and_emit_character_token(c);
1134 }
1135 }
1136 }
1137 // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1138 State::Rcdata => {
1139 // Consume the next input character:
1140 match self.consume_next_char() {
1141 // U+0026 AMPERSAND (&)
1142 // Set the return state to the RCDATA state. Switch to the character
1143 // reference state.
1144 Some('&') => {
1145 self.return_state = State::Rcdata;
1146 self.state = State::CharacterReference;
1147 }
1148 // U+003C LESS-THAN SIGN (<)
1149 // Switch to the RCDATA less-than sign state.
1150 Some('<') => {
1151 self.state = State::RcdataLessThanSign;
1152 }
1153 // U+0000 NULL
1154 // This is an unexpected-null-character parse error. Emit a U+FFFD
1155 // REPLACEMENT CHARACTER character token.
1156 Some(c @ '\x00') => {
1157 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1158 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1159 }
1160 // EOF
1161 // Emit an end-of-file token.
1162 None => {
1163 self.emit_token(Token::Eof);
1164
1165 return Ok(());
1166 }
1167 // Anything else
1168 // Emit the current input character as a character token.
1169 Some(c) => {
1170 self.validate_input_stream_character(c);
1171 self.handle_raw_and_emit_character_token(c);
1172 }
1173 }
1174 }
1175 // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1176 State::Rawtext => {
1177 // Consume the next input character:
1178 match self.consume_next_char() {
1179 // U+003C LESS-THAN SIGN (<)
1180 // Switch to the RAWTEXT less-than sign state.
1181 Some('<') => self.state = State::RawtextLessThanSign,
1182 // U+0000 NULL
1183 // This is an unexpected-null-character parse error. Emit a U+FFFD
1184 // REPLACEMENT CHARACTER character token.
1185 Some(c @ '\x00') => {
1186 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1187 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1188 }
1189 // EOF
1190 // Emit an end-of-file token.
1191 None => {
1192 self.emit_token(Token::Eof);
1193
1194 return Ok(());
1195 }
1196 // Anything else
1197 // Emit the current input character as a character token.
1198 Some(c) => {
1199 self.validate_input_stream_character(c);
1200 self.handle_raw_and_emit_character_token(c);
1201 }
1202 }
1203 }
1204 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1205 State::ScriptData => {
1206 // Consume the next input character:
1207 match self.consume_next_char() {
1208 // U+003C LESS-THAN SIGN (<)
1209 // Switch to the script data less-than sign state.
1210 Some('<') => self.state = State::ScriptDataLessThanSign,
1211 // U+0000 NULL
1212 // This is an unexpected-null-character parse error. Emit a U+FFFD
1213 // REPLACEMENT CHARACTER character token.
1214 Some(c @ '\x00') => {
1215 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1216 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1217 }
1218 // EOF
1219 // Emit an end-of-file token.
1220 None => {
1221 self.emit_token(Token::Eof);
1222
1223 return Ok(());
1224 }
1225 // Anything else
1226 // Emit the current input character as a character token.
1227 Some(c) => {
1228 self.validate_input_stream_character(c);
1229 self.handle_raw_and_emit_character_token(c);
1230 }
1231 }
1232 }
1233 // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
1234 State::PlainText => {
1235 // Consume the next input character:
1236 match self.consume_next_char() {
1237 // U+0000 NULL
1238 // This is an unexpected-null-character parse error. Emit a U+FFFD
1239 // REPLACEMENT CHARACTER character token.
1240 Some(c @ '\x00') => {
1241 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1242 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1243 }
1244 // EOF
1245 // Emit an end-of-file token.
1246 None => {
1247 self.emit_token(Token::Eof);
1248
1249 return Ok(());
1250 }
1251 // Anything else
1252 // Emit the current input character as a character token.
1253 Some(c) => {
1254 self.validate_input_stream_character(c);
1255 self.handle_raw_and_emit_character_token(c);
1256 }
1257 }
1258 }
1259 // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1260 State::TagOpen => {
1261 // Consume the next input character:
1262 match self.consume_next_char() {
1263 // U+002F SOLIDUS (/)
1264 // Switch to the end tag open state.
1265 Some('/') => {
1266 self.state = State::EndTagOpen;
1267 }
1268 // U+0021 EXCLAMATION MARK (!)
1269 // Switch to the markup declaration open state.
1270 Some('!') => {
1271 self.state = State::MarkupDeclarationOpen;
1272 }
1273 // ASCII alpha
1274 // Create a new start tag token, set its tag name to the empty string.
1275 // Reconsume in the tag name state.
1276 Some(c) if is_ascii_alpha(c) => {
1277 self.create_start_tag_token();
1278 self.reconsume_in_state(State::TagName);
1279 }
1280 // U+003F QUESTION MARK (?)
1281 // This is an unexpected-question-mark-instead-of-tag-name parse error.
1282 // Create a comment token whose data is the empty string. Reconsume in the
1283 // bogus comment state.
1284 Some('?') => {
1285 self.emit_error(ErrorKind::UnexpectedQuestionMarkInsteadOfTagName);
1286 self.create_comment_token("<");
1287 self.reconsume_in_state(State::BogusComment);
1288 }
1289 // EOF
1290 // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1291 // character token and an end-of-file token.
1292 None => {
1293 self.emit_error(ErrorKind::EofBeforeTagName);
1294 self.emit_character_token('<');
1295 self.emit_token(Token::Eof);
1296
1297 return Ok(());
1298 }
1299 // Anything else
1300 // This is an invalid-first-character-of-tag-name parse error. Emit a U+003C
1301 // LESS-THAN SIGN character token. Reconsume in the data state.
1302 _ => {
1303 self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1304 self.emit_character_token('<');
1305 self.reconsume_in_state(State::Data);
1306 }
1307 }
1308 }
1309 // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1310 State::EndTagOpen => {
1311 // Consume the next input character:
1312 match self.consume_next_char() {
1313 // ASCII alpha
1314 // Create a new end tag token, set its tag name to the empty string.
1315 // Reconsume in the tag name state.
1316 Some(c) if is_ascii_alpha(c) => {
1317 self.create_end_tag_token();
1318 self.reconsume_in_state(State::TagName);
1319 }
1320 // U+003E GREATER-THAN SIGN (>)
1321 // This is a missing-end-tag-name parse error. Switch to the data state.
1322 Some('>') => {
1323 self.emit_error(ErrorKind::MissingEndTagName);
1324 self.state = State::Data;
1325 }
1326 // EOF
1327 // This is an eof-before-tag-name parse error. Emit a U+003C LESS-THAN SIGN
1328 // character token, a U+002F SOLIDUS character token and an end-of-file
1329 // token.
1330 None => {
1331 self.emit_error(ErrorKind::EofBeforeTagName);
1332 self.emit_character_token('<');
1333 self.emit_character_token('/');
1334 self.emit_token(Token::Eof);
1335
1336 return Ok(());
1337 }
1338 // Anything else
1339 // This is an invalid-first-character-of-tag-name parse error. Create a
1340 // comment token whose data is the empty string. Reconsume in the bogus
1341 // comment state.
1342 _ => {
1343 self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
1344 self.create_comment_token("</");
1345 self.reconsume_in_state(State::BogusComment);
1346 }
1347 }
1348 }
1349 // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
1350 State::TagName => {
1351 // Consume the next input character:
1352 match self.consume_next_char() {
1353 // U+0009 CHARACTER TABULATION (tab)
1354 // U+000A LINE FEED (LF)
1355 // U+000C FORM FEED (FF)
1356 // U+0020 SPACE
1357 // Switch to the before attribute name state.
1358 Some(c) if is_spacy(c) => {
1359 self.finish_tag_token_name();
1360 self.skip_whitespaces(c);
1361 self.state = State::BeforeAttributeName;
1362 }
1363 // U+002F SOLIDUS (/)
1364 // Switch to the self-closing start tag state.
1365 Some('/') => {
1366 self.finish_tag_token_name();
1367 self.state = State::SelfClosingStartTag;
1368 }
1369 // U+003E GREATER-THAN SIGN (>)
1370 // Switch to the data state. Emit the current tag token.
1371 Some('>') => {
1372 self.finish_tag_token_name();
1373 self.state = State::Data;
1374 self.emit_tag_token();
1375 }
1376 // ASCII upper alpha
1377 // Append the lowercase version of the current input character (add 0x0020
1378 // to the character's code point) to the current tag token's tag name.
1379 Some(c) if is_ascii_upper_alpha(c) => {
1380 self.consume_and_append_to_tag_token_name(c, is_ascii_upper_alpha);
1381 }
1382 // U+0000 NULL
1383 // This is an unexpected-null-character parse error. Append a U+FFFD
1384 // REPLACEMENT CHARACTER character to the current tag token's tag name.
1385 Some(c @ '\x00') => {
1386 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1387 self.append_to_tag_token_name(REPLACEMENT_CHARACTER, c);
1388 }
1389 // EOF
1390 // This is an eof-in-tag parse error. Emit an end-of-file token.
1391 None => {
1392 self.finish_tag_token_name();
1393 self.emit_error(ErrorKind::EofInTag);
1394 self.emit_token(Token::Eof);
1395
1396 return Ok(());
1397 }
1398 // Anything else
1399 // Append the current input character to the current tag token's tag name.
1400 Some(c) => {
1401 self.validate_input_stream_character(c);
1402 self.consume_and_append_to_tag_token_name(c, |c| {
1403 if !is_allowed_character(c) {
1404 return false;
1405 }
1406
1407 // List of characters from above to stop consumption and a certain
1408 // branch took control
1409 !is_spacy(c)
1410 && !matches!(c, '/' | '>' | '\x00')
1411 && !is_ascii_upper_alpha(c)
1412 });
1413 }
1414 }
1415 }
1416 // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
1417 State::RcdataLessThanSign => {
1418 // Consume the next input character:
1419 match self.consume_next_char() {
1420 // U+002F SOLIDUS (/)
1421 // Set the temporary buffer to the empty string. Switch to the RCDATA end
1422 // tag open state.
1423 Some('/') => {
1424 self.temporary_buffer.clear();
1425 self.state = State::RcdataEndTagOpen;
1426 }
1427 // Anything else
1428 // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RCDATA
1429 // state.
1430 _ => {
1431 self.emit_character_token('<');
1432 self.reconsume_in_state(State::Rcdata);
1433 }
1434 }
1435 }
1436 // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1437 State::RcdataEndTagOpen => {
1438 // Consume the next input character:
1439 match self.consume_next_char() {
1440 // ASCII alpha
1441 // Create a new end tag token, set its tag name to the empty string.
1442 // Reconsume in the RCDATA end tag name state.
1443 Some(c) if is_ascii_alpha(c) => {
1444 self.create_end_tag_token();
1445 self.reconsume_in_state(State::RcdataEndTagName);
1446 }
1447 // Anything else
1448 // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
1449 // character token. Reconsume in the RCDATA state.
1450 _ => {
1451 self.emit_character_token('<');
1452 self.emit_character_token('/');
1453 self.reconsume_in_state(State::Rcdata);
1454 }
1455 }
1456 }
1457 // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
1458 State::RcdataEndTagName => {
1459 let anything_else = |lexer: &mut Lexer<'a, I>| {
1460 lexer.finish_tag_token_name();
1461 lexer.emit_character_token('<');
1462 lexer.emit_character_token('/');
1463 lexer.emit_temporary_buffer_as_character_tokens();
1464 lexer.reconsume_in_state(State::Rcdata);
1465 };
1466
1467 // Consume the next input character:
1468 match self.consume_next_char() {
1469 // U+0009 CHARACTER TABULATION (tab)
1470 // U+000A LINE FEED (LF)
1471 // U+000C FORM FEED (FF)
1472 // U+0020 SPACE
1473 // If the current end tag token is an appropriate end tag token, then switch
1474 // to the before attribute name state. Otherwise, treat it as per the
1475 // "anything else" entry below.
1476 Some(c) if is_spacy(c) => {
1477 self.skip_whitespaces(c);
1478
1479 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1480 self.finish_tag_token_name();
1481 self.state = State::BeforeAttributeName;
1482 } else {
1483 anything_else(self);
1484 }
1485 }
1486 // U+002F SOLIDUS (/)
1487 // If the current end tag token is an appropriate end tag token, then switch
1488 // to the self-closing start tag state. Otherwise, treat it as per the
1489 // "anything else" entry below.
1490 Some('/') => {
1491 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1492 self.finish_tag_token_name();
1493 self.state = State::SelfClosingStartTag;
1494 } else {
1495 anything_else(self);
1496 }
1497 }
1498 // U+003E GREATER-THAN SIGN (>)
1499 // If the current end tag token is an appropriate end tag token, then switch
1500 // to the data state and emit the current tag token. Otherwise, treat it as
1501 // per the "anything else" entry below.
1502 Some('>') => {
1503 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1504 self.finish_tag_token_name();
1505 self.state = State::Data;
1506 self.emit_tag_token();
1507 } else {
1508 anything_else(self);
1509 }
1510 }
1511 // ASCII upper alpha
1512 // Append the lowercase version of the current input character (add 0x0020
1513 // to the character's code point) to the current tag token's tag name.
1514 // Append the current input character to the temporary buffer.
1515 Some(c) if is_ascii_upper_alpha(c) => {
1516 self.consume_and_append_to_attribute_token_name_and_temp_buf(
1517 c,
1518 is_ascii_upper_alpha,
1519 );
1520 }
1521 // ASCII lower alpha
1522 // Append the current input character to the current tag token's tag name.
1523 // Append the current input character to the temporary buffer.
1524 Some(c) if is_ascii_lower_alpha(c) => {
1525 self.consume_and_append_to_attribute_token_name_and_temp_buf(
1526 c,
1527 is_ascii_lower_alpha,
1528 );
1529 }
1530 // Anything else
1531 // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
1532 // token, and a character token for each of the characters in the temporary
1533 // buffer (in the order they were added to the buffer). Reconsume in the
1534 // RCDATA state.
1535 _ => {
1536 anything_else(self);
1537 }
1538 }
1539 }
1540 // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
1541 State::RawtextLessThanSign => {
1542 // Consume the next input character:
1543 match self.consume_next_char() {
1544 // U+002F SOLIDUS (/)
1545 // Set the temporary buffer to the empty string. Switch to the RAWTEXT end
1546 // tag open state.
1547 Some('/') => {
1548 self.temporary_buffer.clear();
1549 self.state = State::RawtextEndTagOpen;
1550 }
1551 // Anything else
1552 // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RAWTEXT
1553 // state.
1554 _ => {
1555 self.emit_character_token('<');
1556 self.reconsume_in_state(State::Rawtext);
1557 }
1558 }
1559 }
1560 // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1561 State::RawtextEndTagOpen => {
1562 // Consume the next input character:
1563 match self.consume_next_char() {
1564 // ASCII alpha
1565 // Create a new end tag token, set its tag name to the empty string.
1566 // Reconsume in the RAWTEXT end tag name state.
1567 Some(c) if is_ascii_alpha(c) => {
1568 self.create_end_tag_token();
1569 self.reconsume_in_state(State::RawtextEndTagName);
1570 }
1571 // Anything else
1572 // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
1573 // character token. Reconsume in the RAWTEXT state.
1574 _ => {
1575 self.emit_character_token('<');
1576 self.emit_character_token('/');
1577 self.reconsume_in_state(State::Rawtext);
1578 }
1579 }
1580 }
1581 // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
1582 State::RawtextEndTagName => {
1583 let anything_else = |lexer: &mut Lexer<'a, I>| {
1584 lexer.finish_tag_token_name();
1585 lexer.emit_character_token('<');
1586 lexer.emit_character_token('/');
1587 lexer.emit_temporary_buffer_as_character_tokens();
1588 lexer.reconsume_in_state(State::Rawtext);
1589 };
1590
1591 // Consume the next input character:
1592 match self.consume_next_char() {
1593 // U+0009 CHARACTER TABULATION (tab)
1594 // U+000A LINE FEED (LF)
1595 // U+000C FORM FEED (FF)
1596 // U+0020 SPACE
1597 // If the current end tag token is an appropriate end tag token, then switch
1598 // to the before attribute name state. Otherwise, treat it as per the
1599 // "anything else" entry below.
1600 Some(c) if is_spacy(c) => {
1601 self.skip_whitespaces(c);
1602
1603 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1604 self.finish_tag_token_name();
1605 self.state = State::BeforeAttributeName;
1606 } else {
1607 anything_else(self);
1608 }
1609 }
1610 // U+002F SOLIDUS (/)
1611 // If the current end tag token is an appropriate end tag token, then switch
1612 // to the self-closing start tag state. Otherwise, treat it as per the
1613 // "anything else" entry below.
1614 Some('/') => {
1615 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1616 self.finish_tag_token_name();
1617 self.state = State::SelfClosingStartTag;
1618 } else {
1619 anything_else(self);
1620 }
1621 }
1622 // U+003E GREATER-THAN SIGN (>)
1623 // If the current end tag token is an appropriate end tag token, then switch
1624 // to the data state and emit the current tag token. Otherwise, treat it as
1625 // per the "anything else" entry below.
1626 Some('>') => {
1627 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1628 self.finish_tag_token_name();
1629 self.state = State::Data;
1630 self.emit_tag_token();
1631 } else {
1632 anything_else(self);
1633 }
1634 }
1635 // ASCII upper alpha
1636 // Append the lowercase version of the current input character (add 0x0020
1637 // to the character's code point) to the current tag token's tag name.
1638 // Append the current input character to the temporary buffer.
1639 Some(c) if is_ascii_upper_alpha(c) => {
1640 self.consume_and_append_to_attribute_token_name_and_temp_buf(
1641 c,
1642 is_ascii_upper_alpha,
1643 );
1644 }
1645 // ASCII lower alpha
1646 // Append the current input character to the current tag token's tag name.
1647 // Append the current input character to the temporary buffer.
1648 Some(c) if is_ascii_lower_alpha(c) => {
1649 self.consume_and_append_to_attribute_token_name_and_temp_buf(
1650 c,
1651 is_ascii_lower_alpha,
1652 );
1653 }
1654 // Anything else
1655 // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
1656 // token, and a character token for each of the characters in the temporary
1657 // buffer (in the order they were added to the buffer). Reconsume in the
1658 // RAWTEXT state.
1659 _ => {
1660 anything_else(self);
1661 }
1662 }
1663 }
1664 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1665 State::ScriptDataLessThanSign => {
1666 // Consume the next input character:
1667 match self.consume_next_char() {
1668 // U+002F SOLIDUS (/)
1669 // Set the temporary buffer to the empty string. Switch to the script data
1670 // end tag open state.
1671 Some('/') => {
1672 self.temporary_buffer.clear();
1673 self.state = State::ScriptDataEndTagOpen;
1674 }
1675 // U+0021 EXCLAMATION MARK (!)
1676 // Switch to the script data escape start state. Emit a U+003C LESS-THAN
1677 // SIGN character token and a U+0021 EXCLAMATION MARK character token.
1678 Some('!') => {
1679 self.state = State::ScriptDataEscapeStart;
1680 self.emit_character_token('<');
1681 self.emit_character_token('!');
1682 }
1683 // Anything else
1684 // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script
1685 // data state.
1686 _ => {
1687 self.emit_character_token('<');
1688 self.reconsume_in_state(State::ScriptData);
1689 }
1690 }
1691 }
1692 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1693 State::ScriptDataEndTagOpen => {
1694 // Consume the next input character:
1695 match self.consume_next_char() {
1696 // ASCII alpha
1697 // Create a new end tag token, set its tag name to the empty string.
1698 // Reconsume in the script data end tag name state.
1699 Some(c) if is_ascii_alpha(c) => {
1700 self.create_end_tag_token();
1701 self.reconsume_in_state(State::ScriptDataEndTagName);
1702 }
1703 // Anything else
1704 // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
1705 // character token. Reconsume in the script data state.
1706 _ => {
1707 self.emit_character_token('<');
1708 self.emit_character_token('/');
1709 self.reconsume_in_state(State::ScriptData);
1710 }
1711 }
1712 }
1713 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1714 State::ScriptDataEndTagName => {
1715 let anything_else = |lexer: &mut Lexer<'a, I>| {
1716 lexer.finish_tag_token_name();
1717 lexer.emit_character_token('<');
1718 lexer.emit_character_token('/');
1719 lexer.emit_temporary_buffer_as_character_tokens();
1720 lexer.reconsume_in_state(State::ScriptData);
1721 };
1722
1723 // Consume the next input character:
1724 match self.consume_next_char() {
1725 // U+0009 CHARACTER TABULATION (tab)
1726 // U+000A LINE FEED (LF)
1727 // U+000C FORM FEED (FF)
1728 // U+0020 SPACE
1729 // If the current end tag token is an appropriate end tag token, then switch
1730 // to the before attribute name state. Otherwise, treat it as per the
1731 // "anything else" entry below.
1732 Some(c) if is_spacy(c) => {
1733 self.skip_whitespaces(c);
1734
1735 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1736 self.finish_tag_token_name();
1737 self.state = State::BeforeAttributeName;
1738 } else {
1739 anything_else(self);
1740 }
1741 }
1742 // U+002F SOLIDUS (/)
1743 // If the current end tag token is an appropriate end tag token, then switch
1744 // to the self-closing start tag state. Otherwise, treat it as per the
1745 // "anything else" entry below.
1746 Some('/') => {
1747 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1748 self.finish_tag_token_name();
1749 self.state = State::SelfClosingStartTag;
1750 } else {
1751 anything_else(self);
1752 }
1753 }
1754 // U+003E GREATER-THAN SIGN (>)
1755 // If the current end tag token is an appropriate end tag token, then switch
1756 // to the data state and emit the current tag token. Otherwise, treat it as
1757 // per the "anything else" entry below.
1758 Some('>') => {
1759 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
1760 self.finish_tag_token_name();
1761 self.state = State::Data;
1762 self.emit_tag_token();
1763 } else {
1764 anything_else(self);
1765 }
1766 }
1767 // ASCII upper alpha
1768 // Append the lowercase version of the current input character (add 0x0020
1769 // to the character's code point) to the current tag token's tag name.
1770 // Append the current input character to the temporary buffer.
1771 Some(c) if is_ascii_upper_alpha(c) => {
1772 self.consume_and_append_to_attribute_token_name_and_temp_buf(
1773 c,
1774 is_ascii_upper_alpha,
1775 );
1776 }
1777 // ASCII lower alpha
1778 // Append the current input character to the current tag token's tag name.
1779 // Append the current input character to the temporary buffer.
1780 Some(c) if is_ascii_lower_alpha(c) => {
1781 self.consume_and_append_to_attribute_token_name_and_temp_buf(
1782 c,
1783 is_ascii_lower_alpha,
1784 );
1785 }
1786 // Anything else
1787 // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
1788 // token, and a character token for each of the characters in the temporary
1789 // buffer (in the order they were added to the buffer). Reconsume in the
1790 // script data state.
1791 _ => {
1792 anything_else(self);
1793 }
1794 }
1795 }
1796 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1797 State::ScriptDataEscapeStart => {
1798 // Consume the next input character:
1799 match self.consume_next_char() {
1800 // U+002D HYPHEN-MINUS (-)
1801 // Switch to the script data escape start dash state. Emit a U+002D
1802 // HYPHEN-MINUS character token.
1803 Some(c @ '-') => {
1804 self.state = State::ScriptDataEscapeStartDash;
1805 self.emit_character_token(c);
1806 }
1807 // Anything else
1808 // Reconsume in the script data state.
1809 _ => {
1810 self.reconsume_in_state(State::ScriptData);
1811 }
1812 }
1813 }
1814 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1815 State::ScriptDataEscapeStartDash => {
1816 // Consume the next input character:
1817 match self.consume_next_char() {
1818 // U+002D HYPHEN-MINUS (-)
1819 // Switch to the script data escaped dash dash state. Emit a U+002D
1820 // HYPHEN-MINUS character token.
1821 Some(c @ '-') => {
1822 self.state = State::ScriptDataEscapedDashDash;
1823 self.emit_character_token(c);
1824 }
1825 // Anything else
1826 // Reconsume in the script data state.
1827 _ => {
1828 self.reconsume_in_state(State::ScriptData);
1829 }
1830 }
1831 }
1832 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1833 State::ScriptDataEscaped => {
1834 // Consume the next input character:
1835 match self.consume_next_char() {
1836 // U+002D HYPHEN-MINUS (-)
1837 // Switch to the script data escaped dash state. Emit a U+002D HYPHEN-MINUS
1838 // character token.
1839 Some(c @ '-') => {
1840 self.state = State::ScriptDataEscapedDash;
1841 self.emit_character_token(c);
1842 }
1843 // U+003C LESS-THAN SIGN (<)
1844 // Switch to the script data escaped less-than sign state.
1845 Some('<') => {
1846 self.state = State::ScriptDataEscapedLessThanSign;
1847 }
1848 // U+0000 NULL
1849 // This is an unexpected-null-character parse error. Emit a U+FFFD
1850 // REPLACEMENT CHARACTER character token.
1851 Some(c @ '\x00') => {
1852 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1853 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1854 }
1855 // EOF
1856 // This is an eof-in-script-html-comment-like-text parse error. Emit an
1857 // end-of-file token.
1858 None => {
1859 self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
1860 self.emit_token(Token::Eof);
1861
1862 return Ok(());
1863 }
1864 // Anything else
1865 // Emit the current input character as a character token.
1866 Some(c) => {
1867 self.validate_input_stream_character(c);
1868 self.handle_raw_and_emit_character_token(c);
1869 }
1870 }
1871 }
1872 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1873 State::ScriptDataEscapedDash => {
1874 // Consume the next input character:
1875 match self.consume_next_char() {
1876 // U+002D HYPHEN-MINUS (-)
1877 // Switch to the script data escaped dash dash state. Emit a U+002D
1878 // HYPHEN-MINUS character token.
1879 Some(c @ '-') => {
1880 self.state = State::ScriptDataEscapedDashDash;
1881 self.emit_character_token(c);
1882 }
1883 // U+003C LESS-THAN SIGN (<)
1884 // Switch to the script data escaped less-than sign state.
1885 Some('<') => {
1886 self.state = State::ScriptDataEscapedLessThanSign;
1887 }
1888 // U+0000 NULL
1889 // This is an unexpected-null-character parse error. Switch to the script
1890 // data escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character token.
1891 Some(c @ '\x00') => {
1892 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1893 self.state = State::ScriptDataEscaped;
1894 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1895 }
1896 // EOF
1897 // This is an eof-in-script-html-comment-like-text parse error. Emit an
1898 // end-of-file token.
1899 None => {
1900 self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
1901 self.emit_token(Token::Eof);
1902
1903 return Ok(());
1904 }
1905 // Anything else
1906 // Switch to the script data escaped state. Emit the current input character
1907 // as a character token.
1908 Some(c) => {
1909 self.validate_input_stream_character(c);
1910 self.state = State::ScriptDataEscaped;
1911 self.handle_raw_and_emit_character_token(c);
1912 }
1913 }
1914 }
1915 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1916 State::ScriptDataEscapedDashDash => {
1917 // Consume the next input character:
1918 match self.consume_next_char() {
1919 // U+002D HYPHEN-MINUS (-)
1920 // Emit a U+002D HYPHEN-MINUS character token.
1921 Some(c @ '-') => {
1922 self.emit_character_token(c);
1923 }
1924 // U+003C LESS-THAN SIGN (<)
1925 // Switch to the script data escaped less-than sign state.
1926 Some('<') => {
1927 self.state = State::ScriptDataEscapedLessThanSign;
1928 }
1929 // U+003E GREATER-THAN SIGN (>)
1930 // Switch to the script data state. Emit a U+003E GREATER-THAN SIGN
1931 // character token.
1932 Some(c @ '>') => {
1933 self.state = State::ScriptData;
1934 self.emit_character_token(c);
1935 }
1936 // U+0000 NULL
1937 // This is an unexpected-null-character parse error. Switch to the script
1938 // data escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character token.
1939 Some(c @ '\x00') => {
1940 self.emit_error(ErrorKind::UnexpectedNullCharacter);
1941 self.state = State::ScriptDataEscaped;
1942 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
1943 }
1944 // EOF
1945 // This is an eof-in-script-html-comment-like-text parse error. Emit an
1946 // end-of-file token.
1947 None => {
1948 self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
1949 self.emit_token(Token::Eof);
1950
1951 return Ok(());
1952 }
1953 // Anything else
1954 // Switch to the script data escaped state. Emit the current input character
1955 // as a character token.
1956 Some(c) => {
1957 self.validate_input_stream_character(c);
1958 self.state = State::ScriptDataEscaped;
1959 self.handle_raw_and_emit_character_token(c);
1960 }
1961 }
1962 }
1963 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1964 State::ScriptDataEscapedLessThanSign => {
1965 // Consume the next input character:
1966 match self.consume_next_char() {
1967 // U+002F SOLIDUS (/)
1968 // Set the temporary buffer to the empty string. Switch to the script data
1969 // escaped end tag open state.
1970 Some('/') => {
1971 self.temporary_buffer.clear();
1972 self.state = State::ScriptDataEscapedEndTagOpen;
1973 }
1974 // ASCII alpha
1975 // Set the temporary buffer to the empty string. Emit a U+003C LESS-THAN
1976 // SIGN character token. Reconsume in the script data double escape start
1977 // state.
1978 Some(c) if is_ascii_alpha(c) => {
1979 self.temporary_buffer.clear();
1980 self.emit_character_token('<');
1981 self.reconsume_in_state(State::ScriptDataDoubleEscapeStart);
1982 }
1983 // Anything else
1984 // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script
1985 // data escaped state.
1986 _ => {
1987 self.emit_character_token('<');
1988 self.reconsume_in_state(State::ScriptDataEscaped);
1989 }
1990 }
1991 }
1992 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1993 State::ScriptDataEscapedEndTagOpen => {
1994 // Consume the next input character:
1995 match self.consume_next_char() {
1996 // ASCII alpha
1997 // Create a new end tag token, set its tag name to the empty string.
1998 // Reconsume in the script data escaped end tag name state.
1999 Some(c) if is_ascii_alpha(c) => {
2000 self.create_end_tag_token();
2001 self.reconsume_in_state(State::ScriptDataEscapedEndTagName);
2002 }
2003 // Anything else
2004 // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
2005 // character token. Reconsume in the script data escaped state.
2006 _ => {
2007 self.emit_character_token('<');
2008 self.emit_character_token('/');
2009 self.reconsume_in_state(State::ScriptDataEscaped);
2010 }
2011 }
2012 }
2013 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
2014 State::ScriptDataEscapedEndTagName => {
2015 let anything_else = |lexer: &mut Lexer<'a, I>| {
2016 lexer.finish_tag_token_name();
2017 lexer.emit_character_token('<');
2018 lexer.emit_character_token('/');
2019 lexer.emit_temporary_buffer_as_character_tokens();
2020 lexer.reconsume_in_state(State::ScriptDataEscaped);
2021 };
2022
2023 // Consume the next input character:
2024 match self.consume_next_char() {
2025 // U+0009 CHARACTER TABULATION (tab)
2026 // U+000A LINE FEED (LF)
2027 // U+000C FORM FEED (FF)
2028 // U+0020 SPACE
2029 // If the current end tag token is an appropriate end tag token, then switch
2030 // to the before attribute name state. Otherwise, treat it as per the
2031 // "anything else" entry below.
2032 Some(c) if is_spacy(c) => {
2033 self.skip_whitespaces(c);
2034
2035 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
2036 self.finish_tag_token_name();
2037 self.state = State::BeforeAttributeName;
2038 } else {
2039 anything_else(self);
2040 }
2041 }
2042 // U+002F SOLIDUS (/)
2043 // If the current end tag token is an appropriate end tag token, then switch
2044 // to the self-closing start tag state. Otherwise, treat it as per the
2045 // "anything else" entry below.
2046 Some('/') => {
2047 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
2048 self.finish_tag_token_name();
2049 self.state = State::SelfClosingStartTag;
2050 } else {
2051 anything_else(self);
2052 }
2053 }
2054 // U+003E GREATER-THAN SIGN (>)
2055 // If the current end tag token is an appropriate end tag token, then switch
2056 // to the data state and emit the current tag token. Otherwise, treat it as
2057 // per the "anything else" entry below.
2058 Some('>') => {
2059 if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
2060 self.finish_tag_token_name();
2061 self.state = State::Data;
2062 self.emit_tag_token();
2063 } else {
2064 anything_else(self);
2065 }
2066 }
2067 // ASCII upper alpha
2068 // Append the lowercase version of the current input character (add 0x0020
2069 // to the character's code point) to the current tag token's tag name.
2070 // Append the current input character to the temporary buffer.
2071 Some(c) if is_ascii_upper_alpha(c) => {
2072 self.consume_and_append_to_attribute_token_name_and_temp_buf(
2073 c,
2074 is_ascii_upper_alpha,
2075 );
2076 }
2077 // ASCII lower alpha
2078 // Append the current input character to the current tag token's tag name.
2079 // Append the current input character to the temporary buffer.
2080 Some(c) if is_ascii_lower_alpha(c) => {
2081 self.consume_and_append_to_attribute_token_name_and_temp_buf(
2082 c,
2083 is_ascii_lower_alpha,
2084 );
2085 }
2086 // Anything else
2087 // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
2088 // token, and a character token for each of the characters in the temporary
2089 // buffer (in the order they were added to the buffer). Reconsume in the
2090 // script data escaped state.
2091 _ => {
2092 anything_else(self);
2093 }
2094 }
2095 }
2096 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
2097 State::ScriptDataDoubleEscapeStart => {
2098 // Consume the next input character:
2099 match self.consume_next_char() {
2100 // U+0009 CHARACTER TABULATION (tab)
2101 // U+000A LINE FEED (LF)
2102 // U+000C FORM FEED (FF)
2103 // U+0020 SPACE
2104 // U+002F SOLIDUS (/)
2105 // U+003E GREATER-THAN SIGN (>)
2106 // If the temporary buffer is the string "script", then switch to the script
2107 // data double escaped state. Otherwise, switch to the script data escaped
2108 // state. Emit the current input character as a character token.
2109 Some(c) if is_spacy(c) => {
2110 let is_script = self.temporary_buffer == "script";
2111
2112 if is_script {
2113 self.state = State::ScriptDataDoubleEscaped;
2114 } else {
2115 self.state = State::ScriptDataEscaped;
2116 }
2117
2118 self.handle_raw_and_emit_character_token(c);
2119 }
2120 Some(c @ '/' | c @ '>') => {
2121 let is_script = self.temporary_buffer == "script";
2122
2123 if is_script {
2124 self.state = State::ScriptDataDoubleEscaped;
2125 } else {
2126 self.state = State::ScriptDataEscaped;
2127 }
2128
2129 self.emit_character_token(c);
2130 }
2131 // ASCII upper alpha
2132 // Append the lowercase version of the current input character (add 0x0020
2133 // to the character's code point) to the temporary buffer. Emit the current
2134 // input character as a character token.
2135 Some(c) if is_ascii_upper_alpha(c) => {
2136 self.temporary_buffer.push(c.to_ascii_lowercase());
2137 self.emit_character_token(c);
2138 }
2139 // ASCII lower alpha
2140 // Append the current input character to the temporary buffer. Emit the
2141 // current input character as a character token.
2142 Some(c) if is_ascii_lower_alpha(c) => {
2143 self.temporary_buffer.push(c);
2144 self.emit_character_token(c);
2145 }
2146 // Anything else
2147 // Reconsume in the script data escaped state.
2148 _ => {
2149 self.reconsume_in_state(State::ScriptDataEscaped);
2150 }
2151 }
2152 }
2153 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
2154 State::ScriptDataDoubleEscaped => {
2155 // Consume the next input character:
2156 match self.consume_next_char() {
2157 // U+002D HYPHEN-MINUS (-)
2158 // Switch to the script data double escaped dash state. Emit a U+002D
2159 // HYPHEN-MINUS character token.
2160 Some(c @ '-') => {
2161 self.state = State::ScriptDataDoubleEscapedDash;
2162 self.emit_character_token(c);
2163 }
2164 // U+003C LESS-THAN SIGN (<)
2165 // Switch to the script data double escaped less-than sign state. Emit a
2166 // U+003C LESS-THAN SIGN character token.
2167 Some(c @ '<') => {
2168 self.state = State::ScriptDataDoubleEscapedLessThanSign;
2169 self.emit_character_token(c);
2170 }
2171 // U+0000 NULL
2172 // This is an unexpected-null-character parse error. Emit a U+FFFD
2173 // REPLACEMENT CHARACTER character token.
2174 Some(c @ '\x00') => {
2175 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2176 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
2177 }
2178 // EOF
2179 // This is an eof-in-script-html-comment-like-text parse error. Emit an
2180 // end-of-file token.
2181 None => {
2182 self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
2183 self.emit_token(Token::Eof);
2184
2185 return Ok(());
2186 }
2187 // Anything else
2188 // Emit the current input character as a character token.
2189 Some(c) => {
2190 self.validate_input_stream_character(c);
2191 self.handle_raw_and_emit_character_token(c);
2192 }
2193 }
2194 }
2195 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
2196 State::ScriptDataDoubleEscapedDash => {
2197 // Consume the next input character:
2198 match self.consume_next_char() {
2199 // U+002D HYPHEN-MINUS (-)
2200 // Switch to the script data double escaped dash dash state. Emit a U+002D
2201 // HYPHEN-MINUS character token.
2202 Some(c @ '-') => {
2203 self.state = State::ScriptDataDoubleEscapedDashDash;
2204 self.emit_character_token(c);
2205 }
2206 // U+003C LESS-THAN SIGN (<)
2207 // Switch to the script data double escaped less-than sign state. Emit a
2208 // U+003C LESS-THAN SIGN character token.
2209 Some(c @ '<') => {
2210 self.state = State::ScriptDataDoubleEscapedLessThanSign;
2211 self.emit_character_token(c);
2212 }
2213 // U+0000 NULL
2214 // This is an unexpected-null-character parse error. Switch to the script
2215 // data double escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character
2216 // token.
2217 Some(c @ '\x00') => {
2218 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2219 self.state = State::ScriptDataDoubleEscaped;
2220 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
2221 }
2222 // EOF
2223 // This is an eof-in-script-html-comment-like-text parse error. Emit an
2224 // end-of-file token.
2225 None => {
2226 self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
2227 self.emit_token(Token::Eof);
2228
2229 return Ok(());
2230 }
2231 // Anything else
2232 // Switch to the script data double escaped state. Emit the current input
2233 // character as a character token.
2234 Some(c) => {
2235 self.validate_input_stream_character(c);
2236 self.state = State::ScriptDataDoubleEscaped;
2237 self.handle_raw_and_emit_character_token(c);
2238 }
2239 }
2240 }
2241 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
2242 State::ScriptDataDoubleEscapedDashDash => {
2243 // Consume the next input character:
2244 match self.consume_next_char() {
2245 // U+002D HYPHEN-MINUS (-)
2246 // Emit a U+002D HYPHEN-MINUS character token.
2247 Some(c @ '-') => {
2248 self.emit_character_token(c);
2249 }
2250 // U+003C LESS-THAN SIGN (<)
2251 // Switch to the script data double escaped less-than sign state. Emit a
2252 // U+003C LESS-THAN SIGN character token.
2253 Some(c @ '<') => {
2254 self.state = State::ScriptDataDoubleEscapedLessThanSign;
2255 self.emit_character_token(c);
2256 }
2257 // U+003E GREATER-THAN SIGN (>)
2258 // Switch to the script data state. Emit a U+003E GREATER-THAN SIGN
2259 // character token.
2260 Some(c @ '>') => {
2261 self.state = State::ScriptData;
2262 self.emit_character_token(c);
2263 }
2264 // U+0000 NULL
2265 // This is an unexpected-null-character parse error. Switch to the script
2266 // data double escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character
2267 // token.
2268 Some(c @ '\x00') => {
2269 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2270 self.state = State::ScriptDataDoubleEscaped;
2271 self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
2272 }
2273 // EOF
2274 // This is an eof-in-script-html-comment-like-text parse error. Emit an
2275 // end-of-file token.
2276 None => {
2277 self.emit_error(ErrorKind::EofInScriptHtmlCommentLikeText);
2278 self.emit_token(Token::Eof);
2279
2280 return Ok(());
2281 }
2282 // Anything else
2283 // Switch to the script data double escaped state. Emit the current input
2284 // character as a character token.
2285 Some(c) => {
2286 self.validate_input_stream_character(c);
2287 self.state = State::ScriptDataDoubleEscaped;
2288 self.handle_raw_and_emit_character_token(c);
2289 }
2290 }
2291 }
2292 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
2293 State::ScriptDataDoubleEscapedLessThanSign => {
2294 // Consume the next input character:
2295 match self.consume_next_char() {
2296 // U+002F SOLIDUS (/)
2297 // Set the temporary buffer to the empty string. Switch to the script data
2298 // double escape end state. Emit a U+002F SOLIDUS character token.
2299 Some(c @ '/') => {
2300 self.temporary_buffer.clear();
2301 self.state = State::ScriptDataDoubleEscapeEnd;
2302 self.emit_character_token(c);
2303 }
2304 // Anything else
2305 // Reconsume in the script data double escaped state.
2306 _ => {
2307 self.reconsume_in_state(State::ScriptDataDoubleEscaped);
2308 }
2309 }
2310 }
2311 // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
2312 State::ScriptDataDoubleEscapeEnd => {
2313 // Consume the next input character:
2314 match self.consume_next_char() {
2315 // U+0009 CHARACTER TABULATION (tab)
2316 // U+000A LINE FEED (LF)
2317 // U+000C FORM FEED (FF)
2318 // U+0020 SPACE
2319 // U+002F SOLIDUS (/)
2320 // U+003E GREATER-THAN SIGN (>)
2321 // If the temporary buffer is the string "script", then switch to the script
2322 // data escaped state. Otherwise, switch to the script data double escaped
2323 // state. Emit the current input character as a character token.
2324 Some(c) if is_spacy(c) => {
2325 let is_script = self.temporary_buffer == "script";
2326
2327 if is_script {
2328 self.state = State::ScriptDataEscaped;
2329 } else {
2330 self.state = State::ScriptDataDoubleEscaped;
2331 }
2332
2333 self.handle_raw_and_emit_character_token(c);
2334 }
2335 Some(c @ '/' | c @ '>') => {
2336 let is_script = self.temporary_buffer == "script";
2337
2338 if is_script {
2339 self.state = State::ScriptDataEscaped;
2340 } else {
2341 self.state = State::ScriptDataDoubleEscaped;
2342 }
2343
2344 self.emit_character_token(c);
2345 }
2346 // ASCII upper alpha
2347 // Append the lowercase version of the current input character (add 0x0020
2348 // to the character's code point) to the temporary buffer. Emit the current
2349 // input character as a character token.
2350 Some(c) if is_ascii_upper_alpha(c) => {
2351 self.temporary_buffer.push(c.to_ascii_lowercase());
2352 self.emit_character_token(c);
2353 }
2354 // ASCII lower alpha
2355 // Append the current input character to the temporary buffer. Emit the
2356 // current input character as a character token.
2357 Some(c) if is_ascii_lower_alpha(c) => {
2358 self.temporary_buffer.push(c);
2359
2360 self.emit_character_token(c);
2361 }
2362 // Anything else
2363 // Reconsume in the script data double escaped state.
2364 _ => {
2365 self.reconsume_in_state(State::ScriptDataDoubleEscaped);
2366 }
2367 }
2368 }
2369 // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
2370 State::BeforeAttributeName => {
2371 // Consume the next input character:
2372 match self.consume_next_char() {
2373 // U+0009 CHARACTER TABULATION (tab)
2374 // U+000A LINE FEED (LF)
2375 // U+000C FORM FEED (FF)
2376 // U+0020 SPACE
2377 // Ignore the character.
2378 Some(c) if is_spacy(c) => {
2379 self.skip_whitespaces(c);
2380 }
2381 // U+002F SOLIDUS (/)
2382 // U+003E GREATER-THAN SIGN (>)
2383 // EOF
2384 // Reconsume in the after attribute name state.
2385 Some('/') | Some('>') | None => {
2386 self.reconsume_in_state(State::AfterAttributeName);
2387 }
2388 // U+003D EQUALS SIGN (=)
2389 // This is an unexpected-equals-sign-before-attribute-name parse error.
2390 // Start a new attribute in the current tag token. Set that attribute's name
2391 // to the current input character, and its value to the empty string. Switch
2392 // to the attribute name state.
2393 // We set `None` for `value` to support boolean attributes in AST
2394 Some(c @ '=') => {
2395 self.emit_error(ErrorKind::UnexpectedEqualsSignBeforeAttributeName);
2396 self.start_new_attribute_token();
2397 self.append_to_attribute_token_name(c, c);
2398 self.state = State::AttributeName;
2399 }
2400 // Anything else
2401 // Start a new attribute in the current tag token. Set that attribute name
2402 // and value to the empty string. Reconsume in the attribute name state.
2403 // We set `None` for `value` to support boolean attributes in AST
2404 _ => {
2405 self.start_new_attribute_token();
2406 self.reconsume_in_state(State::AttributeName);
2407 }
2408 }
2409 }
2410 // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
2411 State::AttributeName => {
2412 let anything_else = |lexer: &mut Lexer<'a, I>, c: char| {
2413 lexer.append_to_attribute_token_name(c, c);
2414 };
2415
2416 // Consume the next input character:
2417 match self.consume_next_char() {
2418 // U+0009 CHARACTER TABULATION (tab)
2419 // U+000A LINE FEED (LF)
2420 // U+000C FORM FEED (FF)
2421 // U+0020 SPACE
2422 // U+002F SOLIDUS (/)
2423 // U+003E GREATER-THAN SIGN (>)
2424 // EOF
2425 // Reconsume in the after attribute name state.
2426 Some(c) if is_spacy(c) => {
2427 self.finish_attribute_token_name();
2428 self.skip_whitespaces(c);
2429 self.reconsume_in_state(State::AfterAttributeName);
2430 }
2431 Some('/' | '>') | None => {
2432 self.finish_attribute_token_name();
2433 self.reconsume_in_state(State::AfterAttributeName);
2434 }
2435 // U+003D EQUALS SIGN (=)
2436 // Switch to the before attribute value state.
2437 Some('=') => {
2438 self.finish_attribute_token_name();
2439 self.state = State::BeforeAttributeValue;
2440 }
2441 // ASCII upper alpha
2442 // Append the lowercase version of the current input character (add 0x0020
2443 // to the character's code point) to the current attribute's name.
2444 Some(c) if is_ascii_upper_alpha(c) => {
2445 self.consume_and_append_to_attribute_token_name(c, |c| {
2446 is_ascii_upper_alpha(c)
2447 });
2448 }
2449 // U+0000 NULL
2450 // This is an unexpected-null-character parse error. Append a U+FFFD
2451 // REPLACEMENT CHARACTER character to the current attribute's name.
2452 Some(c @ '\x00') => {
2453 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2454 self.append_to_attribute_token_name(REPLACEMENT_CHARACTER, c);
2455 }
2456 // U+0022 QUOTATION MARK (")
2457 // U+0027 APOSTROPHE (')
2458 // U+003C LESS-THAN SIGN (<)
2459 // This is an unexpected-character-in-attribute-name parse error. Treat it
2460 // as per the "anything else" entry below.
2461 Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') => {
2462 self.emit_error(ErrorKind::UnexpectedCharacterInAttributeName);
2463
2464 anything_else(self, c);
2465 }
2466 // Anything else
2467 // Append the current input character to the current attribute's name.
2468 Some(c) => {
2469 self.validate_input_stream_character(c);
2470 self.consume_and_append_to_attribute_token_name(c, |c| {
2471 if !is_allowed_character(c) {
2472 return false;
2473 }
2474
2475 // List of characters from above to stop consumption and a certain
2476 // branch took control
2477 !is_spacy(c)
2478 && !matches!(c, '/' | '>' | '=' | '\x00' | '"' | '\'' | '<')
2479 && !is_ascii_upper_alpha(c)
2480 });
2481 }
2482 }
2483
2484 // When the user agent leaves the attribute name state (and
2485 // before emitting the tag token, if appropriate), the
2486 // complete attribute's name must be compared to the other
2487 // attributes on the same token; if there is already an
2488 // attribute on the token with the exact same name, then
2489 // this is a duplicate-attribute parse error and the new
2490 // attribute must be removed from the token.
2491 //
2492 // We postpone it when we will emit current tag token
2493 }
2494 // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
2495 State::AfterAttributeName => {
2496 // Consume the next input character:
2497 match self.consume_next_char() {
2498 // U+0009 CHARACTER TABULATION (tab)
2499 // U+000A LINE FEED (LF)
2500 // U+000C FORM FEED (FF)
2501 // U+0020 SPACE
2502 // Ignore the character.
2503 Some(c) if is_spacy(c) => {
2504 self.skip_whitespaces(c);
2505 }
2506 // U+002F SOLIDUS (/)
2507 // Switch to the self-closing start tag state.
2508 Some('/') => {
2509 self.state = State::SelfClosingStartTag;
2510 }
2511 // U+003D EQUALS SIGN (=)
2512 // Switch to the before attribute value state.
2513 Some('=') => {
2514 self.state = State::BeforeAttributeValue;
2515 }
2516 // U+003E GREATER-THAN SIGN (>)
2517 // Switch to the data state. Emit the current tag token.
2518 Some('>') => {
2519 self.state = State::Data;
2520 self.emit_tag_token();
2521 }
2522 // EOF
2523 // This is an eof-in-tag parse error. Emit an end-of-file token.
2524 None => {
2525 self.emit_error(ErrorKind::EofInTag);
2526 self.emit_token(Token::Eof);
2527
2528 return Ok(());
2529 }
2530 // Anything else
2531 // Start a new attribute in the current tag token. Set that attribute name
2532 // and value to the empty string. Reconsume in the attribute name state.
2533 // We set `None` for `value` to support boolean attributes in AST
2534 _ => {
2535 self.start_new_attribute_token();
2536 self.reconsume_in_state(State::AttributeName);
2537 }
2538 }
2539 }
2540 // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
2541 State::BeforeAttributeValue => {
2542 // Consume the next input character:
2543 match self.consume_next_char() {
2544 // U+0009 CHARACTER TABULATION (tab)
2545 // U+000A LINE FEED (LF)
2546 // U+000C FORM FEED (FF)
2547 // U+0020 SPACE
2548 // Ignore the character.
2549 Some(c) if is_spacy(c) => {
2550 self.skip_whitespaces(c);
2551 }
2552 // U+0022 QUOTATION MARK (")
2553 // Switch to the attribute value (double-quoted) state.
2554 Some(c @ '"') => {
2555 self.append_to_attribute_token_value(None, Some(c));
2556 self.state = State::AttributeValueDoubleQuoted;
2557 }
2558 // U+0027 APOSTROPHE (')
2559 // Switch to the attribute value (single-quoted) state.
2560 Some(c @ '\'') => {
2561 self.append_to_attribute_token_value(None, Some(c));
2562 self.state = State::AttributeValueSingleQuoted;
2563 }
2564 // U+003E GREATER-THAN SIGN (>)
2565 // This is a missing-attribute-value parse error. Switch to the data state.
2566 // Emit the current tag token.
2567 Some('>') => {
2568 self.emit_error(ErrorKind::MissingAttributeValue);
2569 self.state = State::Data;
2570 self.emit_tag_token();
2571 }
2572 // Anything else
2573 // Reconsume in the attribute value (unquoted) state.
2574 _ => {
2575 self.reconsume_in_state(State::AttributeValueUnquoted);
2576 }
2577 }
2578 }
2579 // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
2580 State::AttributeValueDoubleQuoted => {
2581 // Consume the next input character:
2582 match self.consume_next_char() {
2583 // U+0022 QUOTATION MARK (")
2584 // Switch to the after attribute value (quoted) state.
2585 // We set value to support empty attributes (i.e. `attr=""`)
2586 Some(c @ '"') => {
2587 self.append_to_attribute_token_value(None, Some(c));
2588 self.state = State::AfterAttributeValueQuoted;
2589 }
2590 // U+0026 AMPERSAND (&)
2591 // Set the return state to the attribute value (double-quoted) state. Switch
2592 // to the character reference state.
2593 Some('&') => {
2594 self.return_state = State::AttributeValueDoubleQuoted;
2595 self.state = State::CharacterReference;
2596 }
2597 // U+0000 NULL
2598 // This is an unexpected-null-character parse error. Append a U+FFFD
2599 // REPLACEMENT CHARACTER character to the current attribute's value.
2600 Some(c @ '\x00') => {
2601 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2602 self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
2603 }
2604 // EOF
2605 // This is an eof-in-tag parse error. Emit an end-of-file token.
2606 None => {
2607 self.emit_error(ErrorKind::EofInTag);
2608 self.emit_token(Token::Eof);
2609
2610 return Ok(());
2611 }
2612 // Anything else
2613 // Append the current input character to the current attribute's value.
2614 Some(c) => {
2615 self.validate_input_stream_character(c);
2616 self.consume_and_append_to_attribute_token_value(c, |c| {
2617 if !is_allowed_character(c) {
2618 return false;
2619 }
2620
2621 // List of characters from above to stop consumption and a certain
2622 // branch took control, `\r` is in list because of newline normalization
2623 !matches!(c, '"' | '&' | '\x00' | '\r')
2624 });
2625 }
2626 }
2627 }
2628 // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
2629 State::AttributeValueSingleQuoted => {
2630 // Consume the next input character:
2631 match self.consume_next_char() {
2632 // U+0027 APOSTROPHE (')
2633 // Switch to the after attribute value (quoted) state.
2634 // We set value to support empty attributes (i.e. `attr=''`)
2635 Some(c @ '\'') => {
2636 self.append_to_attribute_token_value(None, Some(c));
2637 self.state = State::AfterAttributeValueQuoted;
2638 }
2639 // U+0026 AMPERSAND (&)
2640 // Set the return state to the attribute value (single-quoted) state. Switch
2641 // to the character reference state.
2642 Some('&') => {
2643 self.return_state = State::AttributeValueSingleQuoted;
2644 self.state = State::CharacterReference;
2645 }
2646 // U+0000 NULL
2647 // This is an unexpected-null-character parse error. Append a U+FFFD
2648 // REPLACEMENT CHARACTER character to the current attribute's value.
2649 Some(c @ '\x00') => {
2650 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2651 self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
2652 }
2653 // EOF
2654 // This is an eof-in-tag parse error. Emit an end-of-file token.
2655 None => {
2656 self.emit_error(ErrorKind::EofInTag);
2657 self.emit_token(Token::Eof);
2658
2659 return Ok(());
2660 }
2661 // Anything else
2662 // Append the current input character to the current attribute's value.
2663 Some(c) => {
2664 self.validate_input_stream_character(c);
2665 self.consume_and_append_to_attribute_token_value(c, |c| {
2666 if !is_allowed_character(c) {
2667 return false;
2668 }
2669
2670 // List of characters from above to stop consumption and a certain
2671 // branch took control, `\r` is in list because of newline normalization
2672 !matches!(c, '\'' | '&' | '\x00' | '\r')
2673 });
2674 }
2675 }
2676 }
2677 // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
2678 State::AttributeValueUnquoted => {
2679 let anything_else = |lexer: &mut Lexer<'a, I>, c: char| {
2680 lexer.append_to_attribute_token_value(Some(c), Some(c));
2681 };
2682
2683 // Consume the next input character:
2684 match self.consume_next_char() {
2685 // U+0009 CHARACTER TABULATION (tab)
2686 // U+000A LINE FEED (LF)
2687 // U+000C FORM FEED (FF)
2688 // U+0020 SPACE
2689 // Switch to the before attribute name state.
2690 Some(c) if is_spacy(c) => {
2691 self.finish_attribute_token_value();
2692 self.skip_whitespaces(c);
2693 self.state = State::BeforeAttributeName;
2694 }
2695 // U+0026 AMPERSAND (&)
2696 // Set the return state to the attribute value (unquoted) state. Switch to
2697 // the character reference state.
2698 Some('&') => {
2699 self.return_state = State::AttributeValueUnquoted;
2700 self.state = State::CharacterReference;
2701 }
2702 // U+003E GREATER-THAN SIGN (>)
2703 // Switch to the data state. Emit the current tag token.
2704 Some('>') => {
2705 self.finish_attribute_token_value();
2706 self.state = State::Data;
2707 self.emit_tag_token();
2708 }
2709 // U+0000 NULL
2710 // This is an unexpected-null-character parse error. Append a U+FFFD
2711 // REPLACEMENT CHARACTER character to the current attribute's value.
2712 Some(c @ '\x00') => {
2713 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2714 self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
2715 }
2716 // U+0022 QUOTATION MARK (")
2717 // U+0027 APOSTROPHE (')
2718 // U+003C LESS-THAN SIGN (<)
2719 // U+003D EQUALS SIGN (=)
2720 // U+0060 GRAVE ACCENT (`)
2721 // This is an unexpected-character-in-unquoted-attribute-value parse error.
2722 // Treat it as per the "anything else" entry below.
2723 Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') | Some(c @ '=')
2724 | Some(c @ '`') => {
2725 self.emit_error(ErrorKind::UnexpectedCharacterInUnquotedAttributeValue);
2726
2727 anything_else(self, c);
2728 }
2729 // EOF
2730 // This is an eof-in-tag parse error. Emit an end-of-file token.
2731 None => {
2732 self.finish_attribute_token_value();
2733 self.emit_error(ErrorKind::EofInTag);
2734 self.emit_token(Token::Eof);
2735
2736 return Ok(());
2737 }
2738 // Anything else
2739 // Append the current input character to the current attribute's value.
2740 Some(c) => {
2741 self.validate_input_stream_character(c);
2742 self.consume_and_append_to_attribute_token_value(c, |c| {
2743 if !is_allowed_character(c) {
2744 return false;
2745 }
2746
2747 // List of characters from above to stop consumption and a certain
2748 // branch took control, `\r` is in list because of newline normalization
2749 !is_spacy(c)
2750 && !matches!(
2751 c,
2752 '&' | '>' | '\x00' | '"' | '\'' | '<' | '=' | '`' | '\r'
2753 )
2754 });
2755 }
2756 }
2757 }
2758 // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
2759 State::AfterAttributeValueQuoted => {
2760 // Consume the next input character:
2761 match self.consume_next_char() {
2762 // U+0009 CHARACTER TABULATION (tab)
2763 // U+000A LINE FEED (LF)
2764 // U+000C FORM FEED (FF)
2765 // U+0020 SPACE
2766 // Switch to the before attribute name state.
2767 Some(c) if is_spacy(c) => {
2768 self.finish_attribute_token_value();
2769 self.skip_whitespaces(c);
2770 self.state = State::BeforeAttributeName;
2771 }
2772 // U+002F SOLIDUS (/)
2773 // Switch to the self-closing start tag state.
2774 Some('/') => {
2775 self.finish_attribute_token_value();
2776 self.state = State::SelfClosingStartTag;
2777 }
2778 // U+003E GREATER-THAN SIGN (>)
2779 // Switch to the data state. Emit the current tag token.
2780 Some('>') => {
2781 self.finish_attribute_token_value();
2782 self.state = State::Data;
2783 self.emit_tag_token();
2784 }
2785 // EOF
2786 // This is an eof-in-tag parse error. Emit an end-of-file token.
2787 None => {
2788 self.finish_attribute_token_value();
2789 self.emit_error(ErrorKind::EofInTag);
2790 self.emit_token(Token::Eof);
2791
2792 return Ok(());
2793 }
2794 // Anything else
2795 // This is a missing-whitespace-between-attributes parse error. Reconsume in
2796 // the before attribute name state.
2797 _ => {
2798 self.finish_attribute_token_value();
2799 self.emit_error(ErrorKind::MissingWhitespaceBetweenAttributes);
2800 self.reconsume_in_state(State::BeforeAttributeName);
2801 }
2802 }
2803 }
2804 // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
2805 State::SelfClosingStartTag => {
2806 // Consume the next input character:
2807 match self.consume_next_char() {
2808 // U+003E GREATER-THAN SIGN (>)
2809 // Set the self-closing flag of the current tag token. Switch to the data
2810 // state. Emit the current tag token.
2811 Some('>') => {
2812 if let Some(
2813 Token::StartTag {
2814 is_self_closing, ..
2815 }
2816 | Token::EndTag {
2817 is_self_closing, ..
2818 },
2819 ) = &mut self.current_token
2820 {
2821 *is_self_closing = true;
2822 }
2823
2824 self.state = State::Data;
2825 self.emit_tag_token();
2826 }
2827 // EOF
2828 // This is an eof-in-tag parse error. Emit an end-of-file token.
2829 None => {
2830 self.emit_error(ErrorKind::EofInTag);
2831 self.emit_token(Token::Eof);
2832
2833 return Ok(());
2834 }
2835 // Anything else
2836 // This is an unexpected-solidus-in-tag parse error. Reconsume in the before
2837 // attribute name state.
2838 _ => {
2839 self.emit_error(ErrorKind::UnexpectedSolidusInTag);
2840 self.reconsume_in_state(State::BeforeAttributeName);
2841 }
2842 }
2843 }
2844 // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
2845 State::BogusComment => {
2846 // Consume the next input character:
2847 match self.consume_next_char() {
2848 // U+003E GREATER-THAN SIGN (>)
2849 // Switch to the data state. Emit the current comment token.
2850 Some('>') => {
2851 self.state = State::Data;
2852 self.emit_comment_token(Some(">"));
2853 }
2854 // EOF
2855 // Emit the comment. Emit an end-of-file token.
2856 None => {
2857 self.emit_comment_token(None);
2858 self.emit_token(Token::Eof);
2859
2860 return Ok(());
2861 }
2862 // U+0000 NULL
2863 // This is an unexpected-null-character parse error. Append a U+FFFD
2864 // REPLACEMENT CHARACTER character to the comment token's data.
2865 Some(c @ '\x00') => {
2866 self.emit_error(ErrorKind::UnexpectedNullCharacter);
2867 self.append_to_comment_token(REPLACEMENT_CHARACTER, c);
2868 }
2869 // Anything else
2870 // Append the current input character to the comment token's data.
2871 Some(c) => {
2872 self.validate_input_stream_character(c);
2873 self.consume_and_append_to_comment_token(c, |c| {
2874 if !is_allowed_character(c) {
2875 return false;
2876 }
2877
2878 // List of characters from above to stop consumption and a certain
2879 // branch took control, `\r` is in list because of newline normalization
2880 !matches!(c, '>' | '\x00' | '\r')
2881 });
2882 }
2883 }
2884 }
2885 // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2886 State::MarkupDeclarationOpen => {
2887 let cur_pos = self.input.cur_pos();
2888 let anything_else = |lexer: &mut Lexer<'a, I>| {
2889 lexer.emit_error(ErrorKind::IncorrectlyOpenedComment);
2890 lexer.create_comment_token("<!");
2891 lexer.state = State::BogusComment;
2892 lexer.cur_pos = cur_pos;
2893 // We don't validate input here because we reset position
2894 unsafe {
2895 // Safety: We reset position to the previous one
2896 lexer.input.reset_to(cur_pos);
2897 }
2898 };
2899
2900 // If the next few characters are:
2901 match self.consume_next_char() {
2902 // Two U+002D HYPHEN-MINUS characters (-)
2903 // Consume those two characters, create a comment token whose data
2904 // is the empty string, and switch to the comment start state.
2905 Some('-') => match self.consume_next_char() {
2906 Some('-') => {
2907 self.create_comment_token("<!--");
2908 self.state = State::CommentStart;
2909 }
2910 _ => {
2911 anything_else(self);
2912 }
2913 },
2914 // ASCII case-insensitive match for the word "DOCTYPE"
2915 // Consume those characters and switch to the DOCTYPE state.
2916 Some(d @ 'd' | d @ 'D') => match self.consume_next_char() {
2917 Some(o @ 'o' | o @ 'O') => match self.consume_next_char() {
2918 Some(c @ 'c' | c @ 'C') => match self.consume_next_char() {
2919 Some(t @ 't' | t @ 'T') => match self.consume_next_char() {
2920 Some(y @ 'y' | y @ 'Y') => match self.consume_next_char() {
2921 Some(p @ 'p' | p @ 'P') => match self.consume_next_char() {
2922 Some(e @ 'e' | e @ 'E') => {
2923 self.state = State::Doctype;
2924
2925 let b = self.sub_buf.clone();
2926 let mut sub_buf = b.borrow_mut();
2927
2928 sub_buf.push('<');
2929 sub_buf.push('!');
2930 sub_buf.push(d);
2931 sub_buf.push(o);
2932 sub_buf.push(c);
2933 sub_buf.push(t);
2934 sub_buf.push(y);
2935 sub_buf.push(p);
2936 sub_buf.push(e);
2937 }
2938 _ => {
2939 anything_else(self);
2940 }
2941 },
2942 _ => {
2943 anything_else(self);
2944 }
2945 },
2946 _ => {
2947 anything_else(self);
2948 }
2949 },
2950 _ => {
2951 anything_else(self);
2952 }
2953 },
2954 _ => {
2955 anything_else(self);
2956 }
2957 },
2958 _ => {
2959 anything_else(self);
2960 }
2961 },
2962 // The string "[CDATA[" (the five uppercase letters "CDATA" with a
2963 // U+005B LEFT SQUARE BRACKET character before and after)
2964 // Consume those characters. If there is an adjusted current node and it
2965 // is not an element in the HTML namespace, then switch to the CDATA
2966 // section state. Otherwise, this is a cdata-in-html-content parse
2967 // error. Create a comment token whose data is the "[CDATA[" string.
2968 // Switch to the bogus comment state.
2969 Some('[') => match self.consume_next_char() {
2970 Some('C') => match self.consume_next_char() {
2971 Some('D') => match self.consume_next_char() {
2972 Some('A') => match self.consume_next_char() {
2973 Some('T') => match self.consume_next_char() {
2974 Some('A') => match self.consume_next_char() {
2975 Some('[') => {
2976 if let Some(false) = self.is_adjusted_current_node_is_element_in_html_namespace {
2977 self.state = State::CdataSection;
2978 } else {
2979 self.emit_error(
2980 ErrorKind::CdataInHtmlContent,
2981 );
2982 self.create_comment_token_with_cdata();
2983
2984 self.state = State::BogusComment;
2985 }
2986 }
2987 _ => {
2988 anything_else(self);
2989 }
2990 }
2991 _ => {
2992 anything_else(self);
2993 }
2994 },
2995 _ => {
2996 anything_else(self);
2997 }
2998 },
2999 _ => {
3000 anything_else(self);
3001 }
3002 }
3003 _ => {
3004 anything_else(self);
3005 }
3006 }
3007 _ => {
3008 anything_else(self);
3009 }
3010 }
3011 // Anything else
3012 // This is an incorrectly-opened-comment parse error. Create a comment token
3013 // whose data is the empty string. Switch to the bogus comment state (don't
3014 // consume anything in the current state).
3015 _ => {
3016 anything_else(self);
3017 }
3018 }
3019 }
3020 // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
3021 State::CommentStart => {
3022 // Consume the next input character:
3023 match self.consume_next_char() {
3024 // U+002D HYPHEN-MINUS (-)
3025 // Switch to the comment start dash state.
3026 Some('-') => {
3027 self.state = State::CommentStartDash;
3028 }
3029 // U+003E GREATER-THAN SIGN (>)
3030 // This is an abrupt-closing-of-empty-comment parse error. Switch to the
3031 // data state. Emit the current comment token.
3032 Some('>') => {
3033 self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
3034 self.state = State::Data;
3035 self.emit_comment_token(Some(">"));
3036 }
3037 // Anything else
3038 // Reconsume in the comment state.
3039 _ => {
3040 self.reconsume_in_state(State::Comment);
3041 }
3042 }
3043 }
3044 // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
3045 State::CommentStartDash => {
3046 // Consume the next input character:
3047 match self.consume_next_char() {
3048 // U+002D HYPHEN-MINUS (-)
3049 // Switch to the comment end state.
3050 Some('-') => {
3051 self.state = State::CommentEnd;
3052 }
3053 // U+003E GREATER-THAN SIGN (>)
3054 // This is an abrupt-closing-of-empty-comment parse error. Switch to the
3055 // data state. Emit the current comment token.
3056 Some('>') => {
3057 self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
3058 self.state = State::Data;
3059 self.emit_comment_token(Some("->"));
3060 }
3061 // EOF
3062 // This is an eof-in-comment parse error. Emit the current comment token.
3063 // Emit an end-of-file token.
3064 None => {
3065 self.emit_error(ErrorKind::EofInComment);
3066 self.emit_comment_token(None);
3067 self.emit_token(Token::Eof);
3068
3069 return Ok(());
3070 }
3071 // Anything else
3072 // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
3073 // Reconsume in the comment state.
3074 _ => {
3075 self.append_to_comment_token('-', '-');
3076 self.reconsume_in_state(State::Comment);
3077 }
3078 }
3079 }
3080 // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
3081 State::Comment => {
3082 // Consume the next input character:
3083 match self.consume_next_char() {
3084 // U+003C LESS-THAN SIGN (<)
3085 // Append the current input character to the comment token's data. Switch to
3086 // the comment less-than sign state.
3087 Some(c @ '<') => {
3088 self.append_to_comment_token(c, c);
3089 self.state = State::CommentLessThanSign;
3090 }
3091 // U+002D HYPHEN-MINUS (-)
3092 // Switch to the comment end dash state.
3093 Some('-') => {
3094 self.state = State::CommentEndDash;
3095 }
3096 // U+0000 NULL
3097 // This is an unexpected-null-character parse error. Append a U+FFFD
3098 // REPLACEMENT CHARACTER character to the comment token's data.
3099 Some(c @ '\x00') => {
3100 self.emit_error(ErrorKind::UnexpectedNullCharacter);
3101 self.append_to_comment_token(REPLACEMENT_CHARACTER, c);
3102 }
3103 // EOF
3104 // This is an eof-in-comment parse error. Emit the current comment token.
3105 // Emit an end-of-file token.
3106 None => {
3107 self.emit_error(ErrorKind::EofInComment);
3108 self.emit_comment_token(None);
3109 self.emit_token(Token::Eof);
3110
3111 return Ok(());
3112 }
3113 // Anything else
3114 // Append the current input character to the comment token's data.
3115 Some(c) => {
3116 self.validate_input_stream_character(c);
3117 self.consume_and_append_to_comment_token(c, |c| {
3118 if !is_allowed_character(c) {
3119 return false;
3120 }
3121
3122 // List of characters from above to stop consumption and a certain
3123 // branch took control, `\r` is in list because of newline normalization
3124 !matches!(c, '<' | '-' | '\x00' | '\r')
3125 });
3126 }
3127 }
3128 }
3129 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
3130 State::CommentLessThanSign => {
3131 // Consume the next input character:
3132 match self.consume_next_char() {
3133 // U+0021 EXCLAMATION MARK (!)
3134 // Append the current input character to the comment token's data. Switch to
3135 // the comment less-than sign bang state.
3136 Some(c @ '!') => {
3137 self.append_to_comment_token(c, c);
3138 self.state = State::CommentLessThanSignBang;
3139 }
3140 // U+003C LESS-THAN SIGN (<)
3141 // Append the current input character to the comment token's data.
3142 Some(c @ '<') => {
3143 self.append_to_comment_token(c, c);
3144 }
3145 // Anything else
3146 // Reconsume in the comment state.
3147 _ => {
3148 self.reconsume_in_state(State::Comment);
3149 }
3150 }
3151 }
3152 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
3153 State::CommentLessThanSignBang => {
3154 // Consume the next input character:
3155 match self.consume_next_char() {
3156 // U+002D HYPHEN-MINUS (-)
3157 // Switch to the comment less-than sign bang dash state.
3158 Some('-') => {
3159 self.state = State::CommentLessThanSignBangDash;
3160 }
3161 // Anything else
3162 // Reconsume in the comment state.
3163 _ => {
3164 self.reconsume_in_state(State::Comment);
3165 }
3166 }
3167 }
3168 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
3169 State::CommentLessThanSignBangDash => {
3170 // Consume the next input character:
3171 match self.consume_next_char() {
3172 // U+002D HYPHEN-MINUS (-)
3173 // Switch to the comment less-than sign bang dash dash state.
3174 Some('-') => {
3175 self.state = State::CommentLessThanSignBangDashDash;
3176 }
3177 // Anything else
3178 // Reconsume in the comment end dash state.
3179 _ => {
3180 self.reconsume_in_state(State::CommentEndDash);
3181 }
3182 }
3183 }
3184 // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
3185 State::CommentLessThanSignBangDashDash => {
3186 // Consume the next input character:
3187 match self.consume_next_char() {
3188 // U+003E GREATER-THAN SIGN (>)
3189 // EOF
3190 // Reconsume in the comment end state.
3191 Some('>') | None => {
3192 self.reconsume_in_state(State::CommentEnd);
3193 }
3194 // Anything else
3195 // This is a nested-comment parse error. Reconsume in the comment end state.
3196 _ => {
3197 self.emit_error(ErrorKind::NestedComment);
3198 self.reconsume_in_state(State::CommentEnd);
3199 }
3200 }
3201 }
3202 // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
3203 State::CommentEndDash => {
3204 // Consume the next input character:
3205 match self.consume_next_char() {
3206 // U+002D HYPHEN-MINUS (-)
3207 // Switch to the comment end state.
3208 Some('-') => {
3209 self.state = State::CommentEnd;
3210 }
3211 // EOF
3212 // This is an eof-in-comment parse error. Emit the current comment token.
3213 // Emit an end-of-file token.
3214 None => {
3215 self.emit_error(ErrorKind::EofInComment);
3216 self.emit_comment_token(None);
3217 self.emit_token(Token::Eof);
3218
3219 return Ok(());
3220 }
3221 // Anything else
3222 // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
3223 // Reconsume in the comment state.
3224 _ => {
3225 self.append_to_comment_token('-', '-');
3226 self.reconsume_in_state(State::Comment);
3227 }
3228 }
3229 }
3230 // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
3231 State::CommentEnd => {
3232 // Consume the next input character:
3233 match self.consume_next_char() {
3234 // U+003E GREATER-THAN SIGN (>)
3235 // Switch to the data state. Emit the current comment token.
3236 Some('>') => {
3237 self.state = State::Data;
3238 self.emit_comment_token(Some("-->"));
3239 }
3240 // U+0021 EXCLAMATION MARK (!)
3241 // Switch to the comment end bang state.
3242 Some('!') => {
3243 self.state = State::CommentEndBang;
3244 }
3245 // U+002D HYPHEN-MINUS (-)
3246 // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
3247 Some(c @ '-') => {
3248 self.append_to_comment_token(c, c);
3249 }
3250 // EOF
3251 // This is an eof-in-comment parse error. Emit the current comment token.
3252 // Emit an end-of-file token.
3253 None => {
3254 self.emit_error(ErrorKind::EofInComment);
3255 self.emit_comment_token(None);
3256 self.emit_token(Token::Eof);
3257
3258 return Ok(());
3259 }
3260 // Anything else
3261 // Append two U+002D HYPHEN-MINUS characters (-) to the comment token's
3262 // data. Reconsume in the comment state.
3263 _ => {
3264 self.append_to_comment_token('-', '-');
3265 self.append_to_comment_token('-', '-');
3266 self.reconsume_in_state(State::Comment);
3267 }
3268 }
3269 }
3270 // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
3271 State::CommentEndBang => {
3272 // Consume the next input character:
3273 match self.consume_next_char() {
3274 // U+002D HYPHEN-MINUS (-)
3275 // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION
3276 // MARK character (!) to the comment token's data. Switch to the comment end
3277 // dash state.
3278 Some(c @ '-') => {
3279 self.append_to_comment_token(c, c);
3280 self.append_to_comment_token('-', '-');
3281 self.append_to_comment_token('!', '!');
3282 self.state = State::CommentEndDash;
3283 }
3284 // U+003E GREATER-THAN SIGN (>)
3285 // This is an incorrectly-closed-comment parse error. Switch to the data
3286 // state. Emit the current comment token.
3287 Some('>') => {
3288 self.emit_error(ErrorKind::IncorrectlyClosedComment);
3289 self.state = State::Data;
3290 self.emit_comment_token(Some(">"));
3291 }
3292 // EOF
3293 // This is an eof-in-comment parse error. Emit the current comment token.
3294 // Emit an end-of-file token.
3295 None => {
3296 self.emit_error(ErrorKind::EofInComment);
3297 self.emit_comment_token(None);
3298 self.emit_token(Token::Eof);
3299
3300 return Ok(());
3301 }
3302 // Anything else
3303 // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION
3304 // MARK character (!) to the comment token's data. Reconsume in the comment
3305 // state.
3306 _ => {
3307 self.append_to_comment_token('-', '-');
3308 self.append_to_comment_token('-', '-');
3309 self.append_to_comment_token('!', '!');
3310 self.reconsume_in_state(State::Comment);
3311 }
3312 }
3313 }
3314 // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
3315 State::Doctype => {
3316 // Consume the next input character:
3317 match self.consume_next_char() {
3318 // U+0009 CHARACTER TABULATION (tab)
3319 // U+000A LINE FEED (LF)
3320 // U+000C FORM FEED (FF)
3321 // U+0020 SPACE
3322 // Switch to the before DOCTYPE name state.
3323 Some(c) if is_spacy(c) => {
3324 self.append_raw_to_doctype_token(c);
3325 self.state = State::BeforeDoctypeName;
3326 }
3327 // U+003E GREATER-THAN SIGN (>)
3328 // Reconsume in the before DOCTYPE name state.
3329 Some('>') => {
3330 self.reconsume_in_state(State::BeforeDoctypeName);
3331 }
3332 // EOF
3333 // This is an eof-in-doctype parse error. Create a new DOCTYPE token. Set
3334 // its force-quirks flag to on. Emit the current token. Emit an end-of-file
3335 // token.
3336 None => {
3337 self.emit_error(ErrorKind::EofInDoctype);
3338 self.create_doctype_token();
3339 self.set_doctype_token_force_quirks();
3340 self.emit_doctype_token();
3341 self.emit_token(Token::Eof);
3342
3343 return Ok(());
3344 }
3345 // Anything else
3346 // This is a missing-whitespace-before-doctype-name parse error. Reconsume
3347 // in the before DOCTYPE name state.
3348 _ => {
3349 self.emit_error(ErrorKind::MissingWhitespaceBeforeDoctypeName);
3350 self.reconsume_in_state(State::BeforeDoctypeName);
3351 }
3352 }
3353 }
3354 // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
3355 State::BeforeDoctypeName => {
3356 // Consume the next input character:
3357 match self.consume_next_char() {
3358 // U+0009 CHARACTER TABULATION (tab)
3359 // U+000A LINE FEED (LF)
3360 // U+000C FORM FEED (FF)
3361 // U+0020 SPACE
3362 // Ignore the character.
3363 Some(c) if is_spacy(c) => {
3364 self.append_raw_to_doctype_token(c);
3365 }
3366 // ASCII upper alpha
3367 // Create a new DOCTYPE token. Set the token's name to the lowercase version
3368 // of the current input character (add 0x0020 to the character's code
3369 // point). Switch to the DOCTYPE name state.
3370 Some(c) if is_ascii_upper_alpha(c) => {
3371 self.append_raw_to_doctype_token(c);
3372 self.create_doctype_token();
3373 self.set_doctype_token_name(c.to_ascii_lowercase());
3374 self.state = State::DoctypeName;
3375 }
3376 // U+0000 NULL
3377 // This is an unexpected-null-character parse error. Create a new DOCTYPE
3378 // token. Set the token's name to a U+FFFD REPLACEMENT CHARACTER character.
3379 // Switch to the DOCTYPE name state.
3380 Some(c @ '\x00') => {
3381 self.append_raw_to_doctype_token(c);
3382 self.emit_error(ErrorKind::UnexpectedNullCharacter);
3383 self.create_doctype_token();
3384 self.set_doctype_token_name(REPLACEMENT_CHARACTER);
3385 self.state = State::DoctypeName;
3386 }
3387 // U+003E GREATER-THAN SIGN (>)
3388 // This is a missing-doctype-name parse error. Create a new DOCTYPE token.
3389 // Set its force-quirks flag to on. Switch to the data state. Emit the
3390 // current token.
3391 Some(c @ '>') => {
3392 self.append_raw_to_doctype_token(c);
3393 self.emit_error(ErrorKind::MissingDoctypeName);
3394 self.create_doctype_token();
3395 self.set_doctype_token_force_quirks();
3396 self.state = State::Data;
3397 self.emit_doctype_token();
3398 }
3399 // EOF
3400 // This is an eof-in-doctype parse error. Create a new DOCTYPE token. Set
3401 // its force-quirks flag to on. Emit the current token. Emit an end-of-file
3402 // token.
3403 None => {
3404 self.emit_error(ErrorKind::EofInDoctype);
3405 self.create_doctype_token();
3406 self.set_doctype_token_force_quirks();
3407 self.emit_doctype_token();
3408 self.emit_token(Token::Eof);
3409
3410 return Ok(());
3411 }
3412 // Anything else
3413 // Create a new DOCTYPE token. Set the token's name to the current input
3414 // character. Switch to the DOCTYPE name state.
3415 Some(c) => {
3416 self.validate_input_stream_character(c);
3417 self.append_raw_to_doctype_token(c);
3418 self.create_doctype_token();
3419 self.set_doctype_token_name(c);
3420 self.state = State::DoctypeName;
3421 }
3422 }
3423 }
3424 // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
3425 State::DoctypeName => {
3426 // Consume the next input character:
3427 match self.consume_next_char() {
3428 // U+0009 CHARACTER TABULATION (tab)
3429 // U+000A LINE FEED (LF)
3430 // U+000C FORM FEED (FF)
3431 // U+0020 SPACE
3432 // Switch to the after DOCTYPE name state.
3433 Some(c) if is_spacy(c) => {
3434 self.append_raw_to_doctype_token(c);
3435 self.finish_doctype_token_name();
3436 self.state = State::AfterDoctypeName;
3437 }
3438 // U+003E GREATER-THAN SIGN (>)
3439 // Switch to the data state. Emit the current DOCTYPE token.
3440 Some(c @ '>') => {
3441 self.append_raw_to_doctype_token(c);
3442 self.finish_doctype_token_name();
3443 self.state = State::Data;
3444 self.emit_doctype_token();
3445 }
3446 // ASCII upper alpha
3447 // Append the lowercase version of the current input character (add 0x0020
3448 // to the character's code point) to the current DOCTYPE token's name.
3449 Some(c) if is_ascii_upper_alpha(c) => {
3450 self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha);
3451 }
3452 // U+0000 NULL
3453 // This is an unexpected-null-character parse error. Append a U+FFFD
3454 // REPLACEMENT CHARACTER character to the current DOCTYPE token's name.
3455 Some(c @ '\x00') => {
3456 self.append_raw_to_doctype_token(c);
3457 self.emit_error(ErrorKind::UnexpectedNullCharacter);
3458 self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER), None, None);
3459 }
3460 // EOF
3461 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3462 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3463 // end-of-file token.
3464 None => {
3465 self.emit_error(ErrorKind::EofInDoctype);
3466 self.set_doctype_token_force_quirks();
3467 self.finish_doctype_token_name();
3468 self.emit_doctype_token();
3469 self.emit_token(Token::Eof);
3470
3471 return Ok(());
3472 }
3473 // Anything else
3474 // Append the current input character to the current DOCTYPE token's name.
3475 Some(c) => {
3476 self.validate_input_stream_character(c);
3477 self.consume_and_append_to_doctype_token_name(c, |c| {
3478 if !is_allowed_character(c) {
3479 return false;
3480 }
3481
3482 !is_spacy(c) && !matches!(c, '>' | '\x00') && !is_ascii_upper_alpha(c)
3483 });
3484 }
3485 }
3486 }
3487 // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
3488 State::AfterDoctypeName => {
3489 let cur_pos = self.input.cur_pos();
3490
3491 // Consume the next input character:
3492 match self.consume_next_char() {
3493 // U+0009 CHARACTER TABULATION (tab)
3494 // U+000A LINE FEED (LF)
3495 // U+000C FORM FEED (FF)
3496 // U+0020 SPACE
3497 // Ignore the character.
3498 Some(c) if is_spacy(c) => {
3499 self.append_raw_to_doctype_token(c);
3500 }
3501 // U+003E GREATER-THAN SIGN (>)
3502 // Switch to the data state. Emit the current DOCTYPE token.
3503 Some(c @ '>') => {
3504 self.append_raw_to_doctype_token(c);
3505 self.state = State::Data;
3506 self.emit_doctype_token();
3507 }
3508 // EOF
3509 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3510 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3511 // end-of-file token.
3512 None => {
3513 self.emit_error(ErrorKind::EofInDoctype);
3514 self.set_doctype_token_force_quirks();
3515 self.emit_doctype_token();
3516 self.emit_token(Token::Eof);
3517
3518 return Ok(());
3519 }
3520 // Anything else
3521 // If the six characters starting from the current input character are an
3522 // ASCII case-insensitive match for the word "PUBLIC", then consume those
3523 // characters and switch to the after DOCTYPE public keyword state.
3524 //
3525 // Otherwise, if the six characters starting from the current input
3526 // character are an ASCII case-insensitive match for the word "SYSTEM", then
3527 // consume those characters and switch to the after DOCTYPE system keyword
3528 // state.
3529 //
3530 // Otherwise, this is an invalid-character-sequence-after-doctype-name parse
3531 // error. Set the current DOCTYPE token's force-quirks flag to on. Reconsume
3532 // in the bogus DOCTYPE state.
3533 Some(c) => {
3534 let b = self.buf.clone();
3535 let mut buf = b.borrow_mut();
3536
3537 buf.push(c);
3538
3539 for _ in 0..5 {
3540 match self.consume_next_char() {
3541 Some(c) => {
3542 buf.push(c);
3543 }
3544 _ => {
3545 break;
3546 }
3547 }
3548 }
3549
3550 match &*buf.to_lowercase() {
3551 "public" => {
3552 self.state = State::AfterDoctypePublicKeyword;
3553
3554 let b = self.sub_buf.clone();
3555 let mut sub_buf = b.borrow_mut();
3556
3557 sub_buf.push_str(&buf);
3558 buf.clear();
3559 }
3560 "system" => {
3561 self.state = State::AfterDoctypeSystemKeyword;
3562
3563 let b = self.sub_buf.clone();
3564 let mut sub_buf = b.borrow_mut();
3565
3566 sub_buf.push_str(&buf);
3567 buf.clear();
3568 }
3569 _ => {
3570 buf.clear();
3571 self.cur_pos = cur_pos;
3572 unsafe {
3573 // Safety: We got cur_pos from self.input.cur_pos() above, so
3574 // it's a valid position.
3575 self.input.reset_to(cur_pos);
3576 }
3577 self.emit_error(
3578 ErrorKind::InvalidCharacterSequenceAfterDoctypeName,
3579 );
3580 self.set_doctype_token_force_quirks();
3581 self.reconsume_in_state(State::BogusDoctype);
3582 }
3583 }
3584 }
3585 }
3586 }
3587 // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
3588 State::AfterDoctypePublicKeyword => {
3589 // Consume the next input character:
3590 match self.consume_next_char() {
3591 // U+0009 CHARACTER TABULATION (tab)
3592 // U+000A LINE FEED (LF)
3593 // U+000C FORM FEED (FF)
3594 // U+0020 SPACE
3595 // Switch to the before DOCTYPE public identifier state.
3596 Some(c) if is_spacy(c) => {
3597 self.append_raw_to_doctype_token(c);
3598 self.state = State::BeforeDoctypePublicIdentifier;
3599 }
3600 // U+0022 QUOTATION MARK (")
3601 // This is a missing-whitespace-after-doctype-public-keyword parse error.
3602 // Set the current DOCTYPE token's public identifier to the empty string
3603 // (not missing), then switch to the DOCTYPE public identifier
3604 // (double-quoted) state.
3605 Some(c @ '"') => {
3606 self.append_raw_to_doctype_token(c);
3607 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
3608 self.set_doctype_token_public_id();
3609 self.state = State::DoctypePublicIdentifierDoubleQuoted;
3610 }
3611 // U+0027 APOSTROPHE (')
3612 // This is a missing-whitespace-after-doctype-public-keyword parse error.
3613 // Set the current DOCTYPE token's public identifier to the empty string
3614 // (not missing), then switch to the DOCTYPE public identifier
3615 // (single-quoted) state.
3616 Some(c @ '\'') => {
3617 self.append_raw_to_doctype_token(c);
3618 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
3619 self.set_doctype_token_public_id();
3620 self.state = State::DoctypePublicIdentifierSingleQuoted;
3621 }
3622 // U+003E GREATER-THAN SIGN (>)
3623 // This is a missing-doctype-public-identifier parse error. Set the current
3624 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3625 // the current DOCTYPE token.
3626 Some(c @ '>') => {
3627 self.append_raw_to_doctype_token(c);
3628 self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
3629 self.set_doctype_token_force_quirks();
3630 self.state = State::Data;
3631 self.emit_doctype_token();
3632 }
3633 // EOF
3634 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3635 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3636 // end-of-file token.
3637 None => {
3638 self.emit_error(ErrorKind::EofInDoctype);
3639 self.set_doctype_token_force_quirks();
3640 self.emit_doctype_token();
3641 self.emit_token(Token::Eof);
3642
3643 return Ok(());
3644 }
3645 // Anything else
3646 // This is a missing-quote-before-doctype-public-identifier parse error. Set
3647 // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3648 // bogus DOCTYPE state.
3649 _ => {
3650 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier);
3651 self.set_doctype_token_force_quirks();
3652 self.reconsume_in_state(State::BogusDoctype);
3653 }
3654 }
3655 }
3656 // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
3657 State::BeforeDoctypePublicIdentifier => {
3658 // Consume the next input character:
3659 match self.consume_next_char() {
3660 // U+0009 CHARACTER TABULATION (tab)
3661 // U+000A LINE FEED (LF)
3662 // U+000C FORM FEED (FF)
3663 // U+0020 SPACE
3664 // Ignore the character.
3665 Some(c) if is_spacy(c) => {
3666 self.append_raw_to_doctype_token(c);
3667 }
3668 // U+0022 QUOTATION MARK (")
3669 // Set the current DOCTYPE token's public identifier to the empty string
3670 // (not missing), then switch to the DOCTYPE public identifier
3671 // (double-quoted) state.
3672 Some(c @ '"') => {
3673 self.append_raw_to_doctype_token(c);
3674 self.set_doctype_token_public_id();
3675 self.state = State::DoctypePublicIdentifierDoubleQuoted;
3676 }
3677 // U+0027 APOSTROPHE (')
3678 // Set the current DOCTYPE token's public identifier to the empty string
3679 // (not missing), then switch to the DOCTYPE public identifier
3680 // (single-quoted) state.
3681 Some(c @ '\'') => {
3682 self.append_raw_to_doctype_token(c);
3683 self.set_doctype_token_public_id();
3684 self.state = State::DoctypePublicIdentifierSingleQuoted;
3685 }
3686 // U+003E GREATER-THAN SIGN (>)
3687 // This is a missing-doctype-public-identifier parse error. Set the current
3688 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3689 // the current DOCTYPE token.
3690 Some(c @ '>') => {
3691 self.append_raw_to_doctype_token(c);
3692 self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
3693 self.set_doctype_token_force_quirks();
3694 self.state = State::Data;
3695 self.emit_doctype_token();
3696 }
3697 // EOF
3698 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3699 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3700 // end-of-file token.
3701 None => {
3702 self.emit_error(ErrorKind::EofInDoctype);
3703 self.set_doctype_token_force_quirks();
3704 self.emit_doctype_token();
3705 self.emit_token(Token::Eof);
3706
3707 return Ok(());
3708 }
3709 // Anything else
3710 // This is a missing-quote-before-doctype-public-identifier parse error. Set
3711 // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3712 // bogus DOCTYPE state.
3713 _ => {
3714 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier);
3715 self.set_doctype_token_force_quirks();
3716 self.reconsume_in_state(State::BogusDoctype);
3717 }
3718 }
3719 }
3720 // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
3721 State::DoctypePublicIdentifierDoubleQuoted => {
3722 // Consume the next input character:
3723 match self.consume_next_char() {
3724 // U+0022 QUOTATION MARK (")
3725 // Switch to the after DOCTYPE public identifier state.
3726 Some(c @ '"') => {
3727 self.append_raw_to_doctype_token(c);
3728 self.finish_doctype_token_public_id();
3729 self.state = State::AfterDoctypePublicIdentifier;
3730 }
3731 // U+0000 NULL
3732 // This is an unexpected-null-character parse error. Append a U+FFFD
3733 // REPLACEMENT CHARACTER character to the current DOCTYPE token's public
3734 // identifier.
3735 Some(c @ '\x00') => {
3736 self.append_raw_to_doctype_token(c);
3737 self.emit_error(ErrorKind::UnexpectedNullCharacter);
3738 self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None);
3739 }
3740 // U+003E GREATER-THAN SIGN (>)
3741 // This is an abrupt-doctype-public-identifier parse error. Set the current
3742 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3743 // the current DOCTYPE token.
3744 Some(c @ '>') => {
3745 self.append_raw_to_doctype_token(c);
3746 self.finish_doctype_token_public_id();
3747 self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
3748 self.set_doctype_token_force_quirks();
3749 self.state = State::Data;
3750 self.emit_doctype_token();
3751 }
3752 // EOF
3753 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3754 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3755 // end-of-file token.
3756 None => {
3757 self.finish_doctype_token_public_id();
3758 self.emit_error(ErrorKind::EofInDoctype);
3759 self.set_doctype_token_force_quirks();
3760 self.emit_doctype_token();
3761 self.emit_token(Token::Eof);
3762
3763 return Ok(());
3764 }
3765 // Anything else
3766 // Append the current input character to the current DOCTYPE token's public
3767 // identifier.
3768 Some(c) => {
3769 self.validate_input_stream_character(c);
3770 self.consume_and_append_to_doctype_token_public_id(c, |c| {
3771 if !is_allowed_character(c) {
3772 return false;
3773 }
3774
3775 !matches!(c, '"' | '\x00' | '>' | '\r')
3776 });
3777 }
3778 }
3779 }
3780 // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
3781 State::DoctypePublicIdentifierSingleQuoted => {
3782 // Consume the next input character:
3783 match self.consume_next_char() {
3784 // U+0027 APOSTROPHE (')
3785 // Switch to the after DOCTYPE public identifier state.
3786 Some(c @ '\'') => {
3787 self.finish_doctype_token_public_id();
3788 self.append_raw_to_doctype_token(c);
3789 self.state = State::AfterDoctypePublicIdentifier;
3790 }
3791 // U+0000 NULL
3792 // This is an unexpected-null-character parse error. Append a U+FFFD
3793 // REPLACEMENT CHARACTER character to the current DOCTYPE token's public
3794 // identifier.
3795 Some(c @ '\x00') => {
3796 self.append_raw_to_doctype_token(c);
3797 self.emit_error(ErrorKind::UnexpectedNullCharacter);
3798 self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None);
3799 }
3800 // U+003E GREATER-THAN SIGN (>)
3801 // This is an abrupt-doctype-public-identifier parse error. Set the current
3802 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
3803 // the current DOCTYPE token.
3804 Some(c @ '>') => {
3805 self.finish_doctype_token_public_id();
3806 self.append_raw_to_doctype_token(c);
3807 self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
3808 self.set_doctype_token_force_quirks();
3809 self.state = State::Data;
3810 self.emit_doctype_token();
3811 }
3812 // EOF
3813 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3814 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3815 // end-of-file token.
3816 None => {
3817 self.finish_doctype_token_public_id();
3818 self.emit_error(ErrorKind::EofInDoctype);
3819 self.set_doctype_token_force_quirks();
3820 self.emit_doctype_token();
3821 self.emit_token(Token::Eof);
3822
3823 return Ok(());
3824 }
3825 // Anything else
3826 // Append the current input character to the current DOCTYPE token's public
3827 // identifier.
3828 Some(c) => {
3829 self.validate_input_stream_character(c);
3830 self.consume_and_append_to_doctype_token_public_id(c, |c| {
3831 if !is_allowed_character(c) {
3832 return false;
3833 }
3834
3835 !matches!(c, '\'' | '\x00' | '>' | '\r')
3836 });
3837 }
3838 }
3839 }
3840 // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
3841 State::AfterDoctypePublicIdentifier => {
3842 // Consume the next input character:
3843 match self.consume_next_char() {
3844 // U+0009 CHARACTER TABULATION (tab)
3845 // U+000A LINE FEED (LF)
3846 // U+000C FORM FEED (FF)
3847 // U+0020 SPACE
3848 // Switch to the between DOCTYPE public and system identifiers state.
3849 Some(c) if is_spacy(c) => {
3850 self.append_raw_to_doctype_token(c);
3851 self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
3852 }
3853 // U+003E GREATER-THAN SIGN (>)
3854 // Switch to the data state. Emit the current DOCTYPE token.
3855 Some(c @ '>') => {
3856 self.append_raw_to_doctype_token(c);
3857 self.state = State::Data;
3858 self.emit_doctype_token();
3859 }
3860 // U+0022 QUOTATION MARK (")
3861 // This is a missing-whitespace-between-doctype-public-and-system-identifiers
3862 // parse error. Set the current DOCTYPE token's system
3863 // identifier to the empty string (not missing), then switch
3864 // to the DOCTYPE system identifier (double-quoted) state.
3865 Some(c @ '"') => {
3866 self.append_raw_to_doctype_token(c);
3867 self.emit_error(
3868 ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
3869 );
3870 self.set_doctype_token_system_id();
3871 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
3872 }
3873 // U+0027 APOSTROPHE (')
3874 // This is a missing-whitespace-between-doctype-public-and-system-identifiers
3875 // parse error. Set the current DOCTYPE token's system
3876 // identifier to the empty string (not missing), then switch
3877 // to the DOCTYPE system identifier (single-quoted) state.
3878 Some(c @ '\'') => {
3879 self.append_raw_to_doctype_token(c);
3880 self.emit_error(
3881 ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
3882 );
3883 self.set_doctype_token_system_id();
3884 self.state = State::DoctypeSystemIdentifierSingleQuoted;
3885 }
3886 // EOF
3887 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3888 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3889 // end-of-file token.
3890 None => {
3891 self.emit_error(ErrorKind::EofInDoctype);
3892 self.set_doctype_token_force_quirks();
3893 self.emit_doctype_token();
3894 self.emit_token(Token::Eof);
3895
3896 return Ok(());
3897 }
3898 // Anything else
3899 // This is a missing-quote-before-doctype-system-identifier parse error. Set
3900 // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3901 // bogus DOCTYPE state.
3902 _ => {
3903 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
3904 self.set_doctype_token_force_quirks();
3905 self.reconsume_in_state(State::BogusDoctype);
3906 }
3907 }
3908 }
3909 // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
3910 State::BetweenDoctypePublicAndSystemIdentifiers => {
3911 // Consume the next input character:
3912 match self.consume_next_char() {
3913 // U+0009 CHARACTER TABULATION (tab)
3914 // U+000A LINE FEED (LF)
3915 // U+000C FORM FEED (FF)
3916 // U+0020 SPACE
3917 // Ignore the character.
3918 Some(c) if is_spacy(c) => {
3919 self.append_raw_to_doctype_token(c);
3920 }
3921 // U+003E GREATER-THAN SIGN (>)
3922 // Switch to the data state. Emit the current DOCTYPE token.
3923 Some(c @ '>') => {
3924 self.append_raw_to_doctype_token(c);
3925 self.state = State::Data;
3926 self.emit_doctype_token();
3927 }
3928 // U+0022 QUOTATION MARK (")
3929 // Set the current DOCTYPE token's system identifier to the empty string
3930 // (not missing), then switch to the DOCTYPE system identifier
3931 // (double-quoted) state.
3932 Some(c @ '"') => {
3933 self.append_raw_to_doctype_token(c);
3934 self.set_doctype_token_system_id();
3935 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
3936 }
3937 // U+0027 APOSTROPHE (')
3938 // Set the current DOCTYPE token's system identifier to the empty string
3939 // (not missing), then switch to the DOCTYPE system identifier
3940 // (single-quoted) state.
3941 Some(c @ '\'') => {
3942 self.append_raw_to_doctype_token(c);
3943 self.set_doctype_token_system_id();
3944 self.state = State::DoctypeSystemIdentifierSingleQuoted;
3945 }
3946 // EOF
3947 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
3948 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
3949 // end-of-file token.
3950 None => {
3951 self.emit_error(ErrorKind::EofInDoctype);
3952 self.set_doctype_token_force_quirks();
3953 self.emit_doctype_token();
3954 self.emit_token(Token::Eof);
3955
3956 return Ok(());
3957 }
3958 // Anything else
3959 // This is a missing-quote-before-doctype-system-identifier parse error. Set
3960 // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
3961 // bogus DOCTYPE state
3962 _ => {
3963 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
3964 self.set_doctype_token_force_quirks();
3965 self.reconsume_in_state(State::BogusDoctype);
3966 }
3967 }
3968 }
3969 // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
3970 State::AfterDoctypeSystemKeyword => {
3971 // Consume the next input character:
3972 match self.consume_next_char() {
3973 // U+0009 CHARACTER TABULATION (tab)
3974 // U+000A LINE FEED (LF)
3975 // U+000C FORM FEED (FF)
3976 // U+0020 SPACE
3977 // Switch to the before DOCTYPE system identifier state.
3978 Some(c) if is_spacy(c) => {
3979 self.append_raw_to_doctype_token(c);
3980 self.state = State::BeforeDoctypeSystemIdentifier;
3981 }
3982 // U+0022 QUOTATION MARK (")
3983 // This is a missing-whitespace-after-doctype-system-keyword parse error.
3984 // Set the current DOCTYPE token's system identifier to the empty string
3985 // (not missing), then switch to the DOCTYPE system identifier
3986 // (double-quoted) state.
3987 Some(c @ '"') => {
3988 self.append_raw_to_doctype_token(c);
3989 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
3990 self.set_doctype_token_system_id();
3991 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
3992 }
3993 // U+0027 APOSTROPHE (')
3994 // This is a missing-whitespace-after-doctype-system-keyword parse error.
3995 // Set the current DOCTYPE token's system identifier to the empty string
3996 // (not missing), then switch to the DOCTYPE system identifier
3997 // (single-quoted) state.
3998 Some(c @ '\'') => {
3999 self.append_raw_to_doctype_token(c);
4000 self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
4001 self.set_doctype_token_system_id();
4002 self.state = State::DoctypeSystemIdentifierSingleQuoted;
4003 }
4004 // U+003E GREATER-THAN SIGN (>)
4005 // This is a missing-doctype-system-identifier parse error. Set the current
4006 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4007 // the current DOCTYPE token.
4008 Some(c @ '>') => {
4009 self.append_raw_to_doctype_token(c);
4010 self.emit_error(ErrorKind::MissingDoctypeSystemIdentifier);
4011 self.set_doctype_token_force_quirks();
4012 self.state = State::Data;
4013 self.emit_doctype_token();
4014 }
4015 // EOF
4016 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4017 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4018 // end-of-file token.
4019 None => {
4020 self.emit_error(ErrorKind::EofInDoctype);
4021 self.set_doctype_token_force_quirks();
4022 self.emit_doctype_token();
4023 self.emit_token(Token::Eof);
4024
4025 return Ok(());
4026 }
4027 // Anything else
4028 // This is a missing-quote-before-doctype-system-identifier parse error. Set
4029 // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
4030 // bogus DOCTYPE state.
4031 _ => {
4032 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
4033 self.set_doctype_token_force_quirks();
4034 self.reconsume_in_state(State::BogusDoctype);
4035 }
4036 }
4037 }
4038 // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
4039 State::BeforeDoctypeSystemIdentifier => {
4040 // Consume the next input character:
4041 match self.consume_next_char() {
4042 // U+0009 CHARACTER TABULATION (tab)
4043 // U+000A LINE FEED (LF)
4044 // U+000C FORM FEED (FF)
4045 // U+0020 SPACE
4046 // Ignore the character.
4047 Some(c) if is_spacy(c) => {
4048 self.append_raw_to_doctype_token(c);
4049 }
4050 // U+0022 QUOTATION MARK (")
4051 // Set the current DOCTYPE token's system identifier to the empty string
4052 // (not missing), then switch to the DOCTYPE system identifier
4053 // (double-quoted) state.
4054 Some(c @ '"') => {
4055 self.append_raw_to_doctype_token(c);
4056 self.set_doctype_token_system_id();
4057 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
4058 }
4059 // U+0027 APOSTROPHE (')
4060 // Set the current DOCTYPE token's system identifier to the empty string
4061 // (not missing), then switch to the DOCTYPE system identifier
4062 // (single-quoted) state.
4063 Some(c @ '\'') => {
4064 self.append_raw_to_doctype_token(c);
4065 self.set_doctype_token_system_id();
4066 self.state = State::DoctypeSystemIdentifierSingleQuoted;
4067 }
4068 // U+003E GREATER-THAN SIGN (>)
4069 // This is a missing-doctype-system-identifier parse error. Set the current
4070 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4071 // the current DOCTYPE token.
4072 Some(c @ '>') => {
4073 self.append_raw_to_doctype_token(c);
4074 self.emit_error(ErrorKind::EofInDoctype);
4075 self.set_doctype_token_force_quirks();
4076 self.state = State::Data;
4077 self.emit_doctype_token();
4078 }
4079 // EOF
4080 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4081 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4082 // end-of-file token.
4083 None => {
4084 self.emit_error(ErrorKind::EofInDoctype);
4085 self.set_doctype_token_force_quirks();
4086 self.emit_doctype_token();
4087 self.emit_token(Token::Eof);
4088
4089 return Ok(());
4090 }
4091 // Anything else
4092 // This is a missing-quote-before-doctype-system-identifier parse error. Set
4093 // the current DOCTYPE token's force-quirks flag to on. Reconsume in the
4094 // bogus DOCTYPE state.
4095 _ => {
4096 self.emit_error(ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier);
4097 self.set_doctype_token_force_quirks();
4098 self.reconsume_in_state(State::BogusDoctype);
4099 }
4100 }
4101 }
4102 // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
4103 State::DoctypeSystemIdentifierDoubleQuoted => {
4104 // Consume the next input character:
4105 match self.consume_next_char() {
// U+0022 QUOTATION MARK (")
4107 // Switch to the after DOCTYPE system identifier state.
4108 Some(c @ '"') => {
4109 self.finish_doctype_token_system_id();
4110 self.append_raw_to_doctype_token(c);
4111 self.state = State::AfterDoctypeSystemIdentifier;
4112 }
4113 // U+0000 NULL
4114 // This is an unexpected-null-character parse error. Append a U+FFFD
4115 // REPLACEMENT CHARACTER character to the current DOCTYPE token's system
4116 // identifier.
4117 Some(c @ '\x00') => {
4118 self.append_raw_to_doctype_token(c);
4119 self.emit_error(ErrorKind::UnexpectedNullCharacter);
4120 self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER));
4121 }
4122 // U+003E GREATER-THAN SIGN (>)
4123 // This is an abrupt-doctype-system-identifier parse error. Set the current
4124 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4125 // the current DOCTYPE token.
4126 Some(c @ '>') => {
4127 self.finish_doctype_token_system_id();
4128 self.append_raw_to_doctype_token(c);
4129 self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
4130 self.set_doctype_token_force_quirks();
4131 self.state = State::Data;
4132 self.emit_doctype_token();
4133 }
4134 // EOF
4135 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4136 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4137 // end-of-file token.
4138 None => {
4139 self.finish_doctype_token_system_id();
4140 self.emit_error(ErrorKind::EofInDoctype);
4141 self.set_doctype_token_force_quirks();
4142 self.emit_doctype_token();
4143 self.emit_token(Token::Eof);
4144
4145 return Ok(());
4146 }
4147 // Anything else
4148 // Append the current input character to the current DOCTYPE token's system
4149 // identifier.
4150 Some(c) => {
4151 self.validate_input_stream_character(c);
4152 self.consume_and_append_to_doctype_token_system_id(c, |c| {
4153 if !is_allowed_character(c) {
4154 return false;
4155 }
4156
4157 !matches!(c, '"' | '\x00' | '>' | '\r')
4158 });
4159 }
4160 }
4161 }
4162 // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
4163 State::DoctypeSystemIdentifierSingleQuoted => {
4164 // Consume the next input character:
4165 match self.consume_next_char() {
4166 // U+0027 APOSTROPHE (')
4167 // Switch to the after DOCTYPE system identifier state.
4168 Some(c @ '\'') => {
4169 self.finish_doctype_token_system_id();
4170 self.append_raw_to_doctype_token(c);
4171 self.state = State::AfterDoctypeSystemIdentifier;
4172 }
4173 // U+0000 NULL
4174 // This is an unexpected-null-character parse error. Append a U+FFFD
4175 // REPLACEMENT CHARACTER character to the current DOCTYPE token's system
4176 // identifier.
4177 Some(c @ '\x00') => {
4178 self.append_raw_to_doctype_token(c);
4179 self.emit_error(ErrorKind::UnexpectedNullCharacter);
4180 self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER));
4181 }
4182 // U+003E GREATER-THAN SIGN (>)
4183 // This is an abrupt-doctype-system-identifier parse error. Set the current
4184 // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
4185 // the current DOCTYPE token.
4186 Some(c @ '>') => {
4187 self.finish_doctype_token_system_id();
4188 self.append_raw_to_doctype_token(c);
4189 self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
4190 self.set_doctype_token_force_quirks();
4191 self.state = State::Data;
4192 self.emit_doctype_token();
4193 }
4194 // EOF
4195 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4196 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4197 // end-of-file token.
4198 None => {
4199 self.finish_doctype_token_system_id();
4200 self.emit_error(ErrorKind::EofInDoctype);
4201 self.set_doctype_token_force_quirks();
4202 self.emit_doctype_token();
4203 self.emit_token(Token::Eof);
4204
4205 return Ok(());
4206 }
4207 // Anything else
4208 // Append the current input character to the current DOCTYPE token's system
4209 // identifier.
4210 Some(c) => {
4211 self.validate_input_stream_character(c);
4212 self.consume_and_append_to_doctype_token_system_id(c, |c| {
4213 if !is_allowed_character(c) {
4214 return false;
4215 }
4216
4217 !matches!(c, '\'' | '\x00' | '>' | '\r')
4218 });
4219 }
4220 }
4221 }
4222 // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
4223 State::AfterDoctypeSystemIdentifier => {
4224 // Consume the next input character:
4225 match self.consume_next_char() {
4226 // U+0009 CHARACTER TABULATION (tab)
4227 // U+000A LINE FEED (LF)
4228 // U+000C FORM FEED (FF)
4229 // U+0020 SPACE
4230 // Ignore the character.
4231 Some(c) if is_spacy(c) => {
4232 self.append_raw_to_doctype_token(c);
4233 }
4234 // U+003E GREATER-THAN SIGN (>)
4235 // Switch to the data state. Emit the current DOCTYPE token.
4236 Some(c @ '>') => {
4237 self.append_raw_to_doctype_token(c);
4238 self.state = State::Data;
4239 self.emit_doctype_token();
4240 }
4241 // EOF
4242 // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
4243 // force-quirks flag to on. Emit the current DOCTYPE token. Emit an
4244 // end-of-file token.
4245 None => {
4246 self.emit_error(ErrorKind::EofInDoctype);
4247 self.set_doctype_token_force_quirks();
4248 self.emit_doctype_token();
4249 self.emit_token(Token::Eof);
4250
4251 return Ok(());
4252 }
4253 // Anything else
4254 // This is an unexpected-character-after-doctype-system-identifier parse
4255 // error. Reconsume in the bogus DOCTYPE state. (This does not set the
4256 // current DOCTYPE token's force-quirks flag to on.)
4257 _ => {
4258 self.emit_error(ErrorKind::UnexpectedCharacterAfterDoctypeSystemIdentifier);
4259 self.reconsume_in_state(State::BogusDoctype);
4260 }
4261 }
4262 }
4263 // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
4264 State::BogusDoctype => {
4265 // Consume the next input character:
4266 match self.consume_next_char() {
4267 // U+003E GREATER-THAN SIGN (>)
4268 // Switch to the data state. Emit the DOCTYPE token.
4269 Some(c @ '>') => {
4270 self.append_raw_to_doctype_token(c);
4271 self.state = State::Data;
4272 self.emit_doctype_token();
4273 }
4274 // U+0000 NULL
4275 // This is an unexpected-null-character parse error. Ignore the character.
4276 Some(c @ '\x00') => {
4277 self.append_raw_to_doctype_token(c);
4278 self.emit_error(ErrorKind::UnexpectedNullCharacter);
4279 }
4280 // EOF
4281 // Emit the DOCTYPE token. Emit an end-of-file token.
4282 None => {
4283 self.emit_doctype_token();
4284 self.emit_token(Token::Eof);
4285
4286 return Ok(());
4287 }
4288 // Anything else
4289 // Ignore the character.
4290 Some(c) => {
4291 self.validate_input_stream_character(c);
4292 self.append_raw_to_doctype_token(c);
4293 }
4294 }
4295 }
4296 // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
4297 State::CdataSection => {
4298 // Consume the next input character:
4299 match self.consume_next_char() {
4300 // U+005D RIGHT SQUARE BRACKET (])
4301 // Switch to the CDATA section bracket state.
4302 Some(']') => {
4303 self.state = State::CdataSectionBracket;
4304 }
4305 // EOF
4306 // This is an eof-in-cdata parse error. Emit an end-of-file token.
4307 None => {
4308 self.emit_error(ErrorKind::EofInCdata);
4309 self.emit_token(Token::Eof);
4310
4311 return Ok(());
4312 }
4313 // Anything else
4314 // Emit the current input character as a character token.
4315 Some(c) => {
4316 self.validate_input_stream_character(c);
4317 self.handle_raw_and_emit_character_token(c);
4318 }
4319 }
4320 }
4321 // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
4322 State::CdataSectionBracket => {
4323 // Consume the next input character:
4324 match self.consume_next_char() {
4325 // U+005D RIGHT SQUARE BRACKET (])
4326 // Switch to the CDATA section end state.
4327 Some(']') => {
4328 self.state = State::CdataSectionEnd;
4329 }
4330 // Anything else
4331 // Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
4332 // CDATA section state.
4333 _ => {
4334 self.emit_character_token(']');
4335 self.reconsume_in_state(State::CdataSection);
4336 }
4337 }
4338 }
4339 // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
4340 State::CdataSectionEnd => {
4341 // Consume the next input character:
4342 match self.consume_next_char() {
4343 // U+005D RIGHT SQUARE BRACKET (])
4344 // Emit a U+005D RIGHT SQUARE BRACKET character token.
4345 Some(c @ ']') => {
4346 self.emit_character_token_with_raw(']', c);
4347 }
4348 // U+003E GREATER-THAN SIGN character
4349 // Switch to the data state.
4350 Some('>') => {
4351 self.state = State::Data;
4352 }
4353 // Anything else
4354 // Emit two U+005D RIGHT SQUARE BRACKET character tokens. Reconsume in the
4355 // CDATA section state.
4356 _ => {
4357 self.emit_character_token(']');
4358 self.emit_character_token(']');
4359 self.reconsume_in_state(State::CdataSection);
4360 }
4361 }
4362 }
4363 // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
4364 State::CharacterReference => {
4365 // Set the temporary buffer to the empty string. Append a U+0026 AMPERSAND (&)
4366 // character to the temporary buffer.
4367 self.temporary_buffer.clear();
4368 self.temporary_buffer.push('&');
4369
4370 // Consume the next input character:
4371 match self.consume_next_char() {
4372 // ASCII alphanumeric
4373 // Reconsume in the named character reference state.
4374 Some(c) if c.is_ascii_alphanumeric() => {
4375 self.reconsume_in_state(State::NamedCharacterReference);
4376 }
4377 // U+0023 NUMBER SIGN (#)
4378 // Append the current input character to the temporary buffer. Switch to the
4379 // numeric character reference state.
4380 Some(c @ '#') => {
4381 self.temporary_buffer.push(c);
4382 self.state = State::NumericCharacterReference;
4383 }
4384 // Anything else
4385 // Flush code points consumed as a character reference. Reconsume in the
4386 // return state.
4387 _ => {
4388 self.flush_code_points_consumed_as_character_reference(None);
4389 self.reconsume_in_state(self.return_state.clone());
4390 }
4391 }
4392 }
4393 // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
4394 State::NamedCharacterReference => {
4395 // Consume the maximum number of characters possible, where the consumed
4396 // characters are one of the identifiers in the first column of the named
4397 // character references table. Append each character to the temporary buffer
4398 // when it's consumed.
// The shortest entity - `&gt`
// The longest entity - `&CounterClockwiseContourIntegral;`
4401 let initial_cur_pos = self.input.cur_pos();
4402
4403 let mut entity: Option<&Entity> = None;
4404 let mut entity_cur_pos: Option<BytePos> = None;
4405 let mut entity_temporary_buffer =
4406 String::with_capacity(self.temporary_buffer.capacity());
4407
4408 entity_temporary_buffer.push_str(&self.temporary_buffer);
4409
4410 // No need to validate input, because we reset position if nothing was found
4411 while let Some(c) = &self.consume_next_char() {
4412 entity_temporary_buffer.push(*c);
4413
4414 if let Some(found_entity) = HTML_ENTITIES.get(&entity_temporary_buffer) {
4415 entity = Some(found_entity);
4416 entity_cur_pos = Some(self.input.cur_pos());
4417
4418 self.temporary_buffer
4419 .replace_range(1.., &entity_temporary_buffer[1..]);
4420 } else {
4421 // We stop when:
4422 //
4423 // - not ascii alphanumeric
4424 // - we consume more characters than the longest entity
4425 if !c.is_ascii_alphanumeric() || entity_temporary_buffer.len() > 32 {
4426 break;
4427 }
4428 }
4429 }
4430
4431 if entity.is_some() {
4432 self.cur_pos = entity_cur_pos.unwrap();
4433 unsafe {
4434 // Safety: We got entity_cur_pos from the input, so it's valid
4435 self.input.reset_to(entity_cur_pos.unwrap());
4436 }
4437 } else {
4438 self.cur_pos = initial_cur_pos;
4439 unsafe {
4440 // Safety: We got initial_cur_pos from the input, so it's valid
4441 self.input.reset_to(initial_cur_pos);
4442 }
4443 }
4444
4445 let is_last_semicolon = self.temporary_buffer.ends_with(';');
4446
4447 // If there is a match
4448 match entity {
4449 Some(entity) => {
4450 let is_next_equals_sign_or_ascii_alphanumeric = match self.next() {
4451 Some('=') => true,
4452 Some(c) if c.is_ascii_alphanumeric() => true,
4453 _ => false,
4454 };
4455
4456 // If the character reference was consumed as part of an attribute, and
4457 // the last character matched is not a
4458 // U+003B SEMICOLON character (;), and the next input
4459 // character is either a U+003D EQUALS SIGN character (=) or an ASCII
4460 // alphanumeric, then, for historical reasons, flush code points
4461 // consumed as a character reference and
4462 // switch to the return state.
4463 if self.is_consumed_as_part_of_an_attribute()
4464 && !is_last_semicolon
4465 && is_next_equals_sign_or_ascii_alphanumeric
4466 {
4467 self.flush_code_points_consumed_as_character_reference(None);
4468 self.state = self.return_state.clone();
4469 }
4470 // Otherwise:
4471 //
4472 // If the last character matched is not a U+003B SEMICOLON character
4473 // (;), then this is a missing-semicolon-after-character-reference parse
4474 // error.
4475 //
4476 // Set the temporary buffer to the empty string. Append one or two
4477 // characters corresponding to the character reference name (as given by
4478 // the second column of the named character references table) to the
4479 // temporary buffer.
4480 //
4481 // Flush code points consumed as a character reference. Switch to the
4482 // return state.
4483 else {
4484 if !is_last_semicolon {
4485 self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
4486 }
4487
4488 let old_temporary_buffer = self.temporary_buffer.clone();
4489
4490 self.temporary_buffer.clear();
4491 self.temporary_buffer.push_str(&entity.characters);
4492 self.flush_code_points_consumed_as_character_reference(Some(
4493 old_temporary_buffer,
4494 ));
4495 self.state = self.return_state.clone();
4496 }
4497 }
4498 // Otherwise
4499 // Flush code points consumed as a character reference. Switch to the
4500 // ambiguous ampersand state.
4501 _ => {
4502 self.flush_code_points_consumed_as_character_reference(None);
4503 self.state = State::AmbiguousAmpersand;
4504 }
4505 }
4506 }
4507 // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
4508 State::AmbiguousAmpersand => {
4509 // Consume the next input character:
4510 match self.consume_next_char() {
4511 // ASCII alphanumeric
4512 // If the character reference was consumed as part of an attribute, then
4513 // append the current input character to the current attribute's value.
4514 // Otherwise, emit the current input character as a character token.
4515 Some(c) if c.is_ascii_alphanumeric() => {
4516 if self.is_consumed_as_part_of_an_attribute() {
4517 self.append_to_attribute_token_value(Some(c), Some(c));
4518 } else {
4519 self.emit_character_token(c);
4520 }
4521 }
4522 // U+003B SEMICOLON (;)
4523 // This is an unknown-named-character-reference parse error. Reconsume in
4524 // the return state.
4525 Some(';') => {
4526 self.emit_error(ErrorKind::UnknownNamedCharacterReference);
4527 self.reconsume_in_state(self.return_state.clone());
4528 }
4529 // Anything else
4530 // Reconsume in the return state.
4531 _ => {
4532 self.reconsume_in_state(self.return_state.clone());
4533 }
4534 }
4535 }
4536 // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
4537 State::NumericCharacterReference => {
4538 self.character_reference_code = Some(vec![(0, 0, None)]);
4539
4540 // Consume the next input character:
4541 match self.consume_next_char() {
4542 // U+0078 LATIN SMALL LETTER X
4543 // U+0058 LATIN CAPITAL LETTER X
4544 // Append the current input character to the temporary buffer. Switch to the
4545 // hexadecimal character reference start state.
4546 Some(c @ 'x' | c @ 'X') => {
4547 self.temporary_buffer.push(c);
4548 self.state = State::HexademicalCharacterReferenceStart;
4549 }
4550 // Anything else
4551 // Reconsume in the decimal character reference start state.
4552 _ => {
4553 self.reconsume_in_state(State::DecimalCharacterReferenceStart);
4554 }
4555 }
4556 }
4557 // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
4558 State::HexademicalCharacterReferenceStart => {
4559 // Consume the next input character:
4560 match self.consume_next_char() {
4561 // ASCII hex digit
4562 // Reconsume in the hexadecimal character reference state.
4563 Some(c) if is_ascii_hex_digit(c) => {
4564 self.reconsume_in_state(State::HexademicalCharacterReference);
4565 }
4566 // Anything else
4567 // This is an absence-of-digits-in-numeric-character-reference parse error.
4568 // Flush code points consumed as a character reference. Reconsume in the
4569 // return state.
4570 _ => {
4571 self.emit_error(ErrorKind::AbsenceOfDigitsInNumericCharacterReference);
4572 self.flush_code_points_consumed_as_character_reference(None);
4573 self.reconsume_in_state(self.return_state.clone());
4574 }
4575 }
4576 }
4577 // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
4578 State::DecimalCharacterReferenceStart => {
4579 // Consume the next input character:
4580 match self.consume_next_char() {
4581 // ASCII digit
4582 // Reconsume in the decimal character reference state.
4583 Some(c) if c.is_ascii_digit() => {
4584 self.reconsume_in_state(State::DecimalCharacterReference);
4585 }
4586 // Anything else
4587 // This is an absence-of-digits-in-numeric-character-reference parse error.
4588 // Flush code points consumed as a character reference. Reconsume in the
4589 // return state.
4590 _ => {
4591 self.emit_error(ErrorKind::AbsenceOfDigitsInNumericCharacterReference);
4592 self.flush_code_points_consumed_as_character_reference(None);
4593 self.reconsume_in_state(self.return_state.clone());
4594 }
4595 }
4596 }
4597 // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
4598 State::HexademicalCharacterReference => {
4599 // Consume the next input character:
4600 match self.consume_next_char() {
4601 // ASCII digit
4602 // Multiply the character reference code by 16. Add a numeric version of the
4603 // current input character (subtract 0x0030 from the character's code point)
4604 // to the character reference code.
4605 Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code {
4606 Some(character_reference_code) => {
4607 character_reference_code.push((16, c as u32 - 0x30, Some(c)));
4608 }
4609 _ => {
4610 unreachable!();
4611 }
4612 },
4613 // ASCII upper hex digit
4614 // Multiply the character reference code by 16. Add a numeric version of the
4615 // current input character as a hexadecimal digit (subtract 0x0037 from the
4616 // character's code point) to the character reference code.
4617 Some(c) if is_upper_hex_digit(c) => match &mut self.character_reference_code {
4618 Some(character_reference_code) => {
4619 character_reference_code.push((16, c as u32 - 0x37, Some(c)));
4620 }
4621 _ => {
4622 unreachable!();
4623 }
4624 },
4625 // ASCII lower hex digit
4626 // Multiply the character reference code by 16. Add a numeric version of the
4627 // current input character as a hexadecimal digit (subtract 0x0057 from the
4628 // character's code point) to the character reference code.
4629 Some(c) if is_lower_hex_digit(c) => match &mut self.character_reference_code {
4630 Some(character_reference_code) => {
4631 character_reference_code.push((16, c as u32 - 0x57, Some(c)));
4632 }
4633 _ => {
4634 unreachable!();
4635 }
4636 },
4637 // U+003B SEMICOLON
4638 // Switch to the numeric character reference end state.
4639 Some(';') => {
4640 self.state = State::NumericCharacterReferenceEnd;
4641 }
4642 // Anything else
4643 // This is a missing-semicolon-after-character-reference parse error.
4644 // Reconsume in the numeric character reference end state.
4645 _ => {
4646 self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
4647 self.reconsume_in_state(State::NumericCharacterReferenceEnd);
4648 }
4649 }
4650 }
4651 // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
4652 State::DecimalCharacterReference => {
4653 // Consume the next input character:
4654 match self.consume_next_char() {
4655 // ASCII digit
4656 // Multiply the character reference code by 10. Add a numeric version of the
4657 // current input character (subtract 0x0030 from the character's code point)
4658 // to the character reference code.
4659 Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code {
4660 Some(character_reference_code) => {
4661 character_reference_code.push((10, c as u32 - 0x30, Some(c)));
4662 }
4663 _ => {
4664 unreachable!();
4665 }
4666 },
4667 // U+003B SEMICOLON
4668 // Switch to the numeric character reference end state.
4669 Some(';') => self.state = State::NumericCharacterReferenceEnd,
4670 // Anything else
4671 // This is a missing-semicolon-after-character-reference parse error.
4672 // Reconsume in the numeric character reference end state.
4673 _ => {
4674 self.emit_error(ErrorKind::MissingSemicolonAfterCharacterReference);
4675 self.reconsume_in_state(State::NumericCharacterReferenceEnd);
4676 }
4677 }
4678 }
4679 // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
4680 State::NumericCharacterReferenceEnd => {
4681 let (value, raw_char_ref) =
4682 if let Some(chars) = self.character_reference_code.take() {
4683 let mut raw = String::with_capacity(8);
4684 let mut i: u32 = 0;
4685 let mut overflowed = false;
4686
4687 for (base, value, c) in chars.iter() {
4688 if let Some(c) = c {
4689 raw.push(*c);
4690 }
4691
4692 if !overflowed {
4693 if let Some(result) = i.checked_mul(*base as u32) {
4694 i = result;
4695
4696 if let Some(result) = i.checked_add(*value) {
4697 i = result;
4698 } else {
4699 i = 0x110000;
4700
4701 overflowed = true;
4702 }
4703 } else {
4704 i = 0x110000;
4705
4706 overflowed = true;
4707 }
4708 }
4709 }
4710
4711 (i, raw)
4712 } else {
4713 unreachable!();
4714 };
4715
4716 // Check the character reference code:
4717 let cr = match value {
4718 // If the number is 0x00, then this is a null-character-reference
4719 // parse error. Set the character
4720 // reference code to 0xFFFD.
4721 0 => {
4722 self.emit_error(ErrorKind::NullCharacterReference);
4723
4724 0xfffd
4725 }
4726 // If the number is greater than 0x10FFFF, then this is a
4727 // character-reference-outside-unicode-range parse error. Set the
4728 // character reference code to
4729 // 0xFFFD.
4730 cr if cr > 0x10ffff => {
4731 self.emit_error(ErrorKind::CharacterReferenceOutsideUnicodeRange);
4732
4733 0xfffd
4734 }
4735 // If the number is a surrogate, then this is a
4736 // surrogate-character-reference parse error. Set the character
4737 // reference code to 0xFFFD.
4738 cr if is_surrogate(cr) => {
4739 self.emit_error(ErrorKind::SurrogateCharacterReference);
4740
4741 0xfffd
4742 }
4743 // If the number is a noncharacter, then this is a
4744 // noncharacter-character-reference parse error.
4745 cr if is_noncharacter(cr) => {
4746 self.emit_error(ErrorKind::NoncharacterCharacterReference);
4747
4748 cr
4749 }
4750 // If the number is 0x0D, or a control that's not ASCII whitespace,
4751 // then
4752 // this is a control-character-reference parse error. If the number
4753 // is one of the numbers in the
4754 // first column of the following table, then find the
4755 // row with that number in the first column, and set the character
4756 // reference code to the number in
4757 // the second column of that row.
4758 cr if cr == 0x0d || is_control(cr) => {
4759 self.emit_error(ErrorKind::ControlCharacterReference);
4760
4761 match cr {
4762 // 0x80 0x20AC EURO SIGN (€)
4763 0x80 => 0x20ac,
4764 // 0x82 0x201A SINGLE LOW-9 QUOTATION MARK (‚)
4765 0x82 => 0x201a,
4766 // 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK (ƒ)
4767 0x83 => 0x0192,
4768 // 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK („)
4769 0x84 => 0x201e,
4770 // 0x85 0x2026 HORIZONTAL ELLIPSIS (…)
4771 0x85 => 0x2026,
4772 // 0x86 0x2020 DAGGER (†)
4773 0x86 => 0x2020,
4774 // 0x87 0x2021 DOUBLE DAGGER (‡)
4775 0x87 => 0x2021,
4776 // 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
4777 0x88 => 0x02c6,
4778 // 0x89 0x2030 PER MILLE SIGN (‰)
4779 0x89 => 0x2030,
4780 // 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON (Š)
4781 0x8a => 0x0160,
4782 // 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
4783 0x8b => 0x2039,
4784 // 0x8C 0x0152 LATIN CAPITAL LIGATURE OE (Œ)
4785 0x8c => 0x0152,
4786 // 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON (Ž)
4787 0x8e => 0x017d,
4788 // 0x91 0x2018 LEFT SINGLE QUOTATION MARK (‘)
4789 0x91 => 0x2018,
// 0x92 0x2019 RIGHT SINGLE QUOTATION MARK (’)
4791 0x92 => 0x2019,
4792 // 0x93 0x201C LEFT DOUBLE QUOTATION MARK (“)
4793 0x93 => 0x201c,
4794 // 0x94 0x201D RIGHT DOUBLE QUOTATION MARK (”)
4795 0x94 => 0x201d,
4796 // 0x95 0x2022 BULLET (•)
4797 0x95 => 0x2022,
4798 // 0x96 0x2013 EN DASH (–)
4799 0x96 => 0x2013,
4800 // 0x97 0x2014 EM DASH (—)
4801 0x97 => 0x2014,
4802 // 0x98 0x02DC SMALL TILDE (˜)
4803 0x98 => 0x02dc,
4804 // 0x99 0x2122 TRADE MARK SIGN (™)
4805 0x99 => 0x2122,
4806 // 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON (š)
4807 0x9a => 0x0161,
4808 // 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
4809 0x9b => 0x203a,
4810 // 0x9C 0x0153 LATIN SMALL LIGATURE OE (œ)
4811 0x9c => 0x0153,
4812 // 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON (ž)
4813 0x9e => 0x017e,
4814 // 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
4815 0x9f => 0x0178,
4816 _ => cr,
4817 }
4818 }
4819 _ => value,
4820 };
4821
4822 // Set the temporary buffer to the empty string.
4823 // Append a code point equal to the character reference code to the temporary
4824 // buffer.
4825 // Flush code points consumed as a character reference.
4826 // Switch to the return state.
4827 let old_temporary_buffer = self.temporary_buffer.clone();
4828
4829 let mut raw =
4830 String::with_capacity(old_temporary_buffer.len() + raw_char_ref.len() + 1);
4831
4832 raw.push_str(&old_temporary_buffer);
4833 raw.push_str(&raw_char_ref);
4834
4835 if self.cur == Some(';') {
4836 raw.push(';');
4837 }
4838
4839 self.temporary_buffer.clear();
4840
4841 let c = match char::from_u32(cr) {
4842 Some(c) => c,
4843 _ => {
4844 unreachable!();
4845 }
4846 };
4847
4848 self.temporary_buffer.push(c);
4849 self.flush_code_points_consumed_as_character_reference(Some(raw));
4850 self.state = self.return_state.clone();
4851 }
4852 }
4853
4854 Ok(())
4855 }
4856
4857 #[inline(always)]
4858 fn skip_whitespaces(&mut self, c: char) {
4859 if c == '\r' && self.input.cur() == Some('\n') {
4860 unsafe {
4861 // Safety: cur() is Some
4862 self.input.bump();
4863 }
4864 }
4865 }
4866}
4867
/// Returns `true` for tab, line feed, form feed, carriage return and space.
///
/// By spec `\r` is removed before the tokenizer runs, but it is kept here to
/// have a better AST and to avoid breaking the logic that ignores characters.
#[inline(always)]
fn is_spacy(c: char) -> bool {
    const SPACY: [char; 5] = ['\t', '\n', '\x0c', '\r', ' '];

    SPACY.contains(&c)
}
4874
/// Returns `true` when `c` lies in `0x00..=0x1f` or `0x7f..=0x9f` and is not
/// one of the excluded whitespace code points (tab, line feed, form feed,
/// carriage return, space).
#[inline(always)]
fn is_control(c: u32) -> bool {
    let in_control_range = c <= 0x1f || (0x7f..=0x9f).contains(&c);
    let is_whitespace = matches!(c, 0x09 | 0x0a | 0x0c | 0x0d | 0x20);

    in_control_range && !is_whitespace
}
4879
/// Returns `true` when `c` is in the surrogate range `0xD800..=0xDFFF`.
#[inline(always)]
fn is_surrogate(c: u32) -> bool {
    (0xd800..=0xdfff).contains(&c)
}
4884
/// Returns `true` when `c` is a noncharacter.
///
/// A noncharacter is a code point in the range U+FDD0 to U+FDEF, inclusive, or
/// one of the last two code points of any of the 17 planes: U+FFFE, U+FFFF,
/// U+1FFFE, U+1FFFF, ..., U+10FFFE, U+10FFFF.
#[inline(always)]
fn is_noncharacter(c: u32) -> bool {
    if (0xfdd0..=0xfdef).contains(&c) {
        return true;
    }

    // Within the valid code point range, a plane's final two code points are
    // exactly those whose low 16 bits are 0xFFFE or 0xFFFF.
    c <= 0x10ffff && (c & 0xffff) >= 0xfffe
}
4933
/// Returns `true` for `0`-`9` and the uppercase hex letters `A`-`F`.
#[inline(always)]
fn is_upper_hex_digit(c: char) -> bool {
    c.is_ascii_digit() || ('A'..='F').contains(&c)
}
4938
/// Returns `true` for `0`-`9` and the lowercase hex letters `a`-`f`.
#[inline(always)]
fn is_lower_hex_digit(c: char) -> bool {
    c.is_ascii_digit() || ('a'..='f').contains(&c)
}
4943
/// Returns `true` for an ASCII hex digit: `0`-`9`, `A`-`F` or `a`-`f`.
///
/// Uses the standard library's `char::is_ascii_hexdigit`, which accepts exactly
/// that set.
#[inline(always)]
fn is_ascii_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
4948
/// Returns `true` for the ASCII uppercase letters `A`-`Z`.
#[inline(always)]
fn is_ascii_upper_alpha(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
4953
/// Returns `true` for the ASCII lowercase letters `a`-`z`.
#[inline(always)]
fn is_ascii_lower_alpha(c: char) -> bool {
    matches!(c, 'a'..='z')
}
4958
/// Returns `true` for an ASCII letter: `A`-`Z` or `a`-`z`.
///
/// Uses the standard library's `char::is_ascii_alphabetic`, which accepts
/// exactly that set.
#[inline(always)]
fn is_ascii_alpha(c: char) -> bool {
    c.is_ascii_alphabetic()
}
4963
4964#[inline(always)]
4965fn is_allowed_control_character(c: u32) -> bool {
4966 c != 0x00 && is_control(c)
4967}
4968
4969#[inline(always)]
4970fn is_allowed_character(c: char) -> bool {
4971 let c = c as u32;
4972
4973 if is_surrogate(c) || is_allowed_control_character(c) || is_noncharacter(c) {
4974 return false;
4975 }
4976
4977 return true;
4978}