swc_xml_parser/parser/
mod.rs

1use std::{cell::RefCell, mem, rc::Rc};
2
3use node::*;
4use open_elements_stack::*;
5use swc_common::{Span, DUMMY_SP};
6use swc_xml_ast::*;
7
8use self::input::{Buffer, ParserInput};
9use crate::error::{Error, ErrorKind};
10
11#[macro_use]
12mod macros;
13pub mod input;
14mod node;
15mod open_elements_stack;
16
/// Result alias used throughout the parser; the error type is fixed to [`Error`].
pub type PResult<T> = Result<T, Error>;
18
/// Configuration handed to [`Parser::new`].
///
/// The struct currently declares no fields.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ParserConfig {}
21
/// Tree-construction phase of the parser.
///
/// A freshly created parser starts in [`Phase::StartPhase`], which is the
/// `#[default]` variant.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Phase {
    /// No root element has been opened yet. A start tag moves the parser to
    /// [`Phase::MainPhase`]; an empty (self-closing) tag moves it straight to
    /// [`Phase::EndPhase`].
    #[default]
    StartPhase,
    /// At least one element is open. The parser leaves this phase once the
    /// stack of open elements becomes empty.
    MainPhase,
    /// The root element has been closed; an EOF token here stops the parser.
    EndPhase,
}
29
30pub struct Parser<I>
31where
32    I: ParserInput,
33{
34    #[allow(dead_code)]
35    config: ParserConfig,
36    input: Buffer<I>,
37    stopped: bool,
38    document: Option<RcNode>,
39    open_elements_stack: OpenElementsStack,
40    errors: Vec<Error>,
41    phase: Phase,
42}
43
44impl<I> Parser<I>
45where
46    I: ParserInput,
47{
48    pub fn new(input: I, config: ParserConfig) -> Self {
49        Parser {
50            config,
51            input: Buffer::new(input),
52            stopped: false,
53            document: None,
54            open_elements_stack: OpenElementsStack::new(),
55            errors: Default::default(),
56            phase: Phase::default(),
57        }
58    }
59
60    pub fn dump_cur(&mut self) -> String {
61        format!("{:?}", self.input.cur())
62    }
63
64    pub fn take_errors(&mut self) -> Vec<Error> {
65        mem::take(&mut self.errors)
66    }
67
68    pub fn parse_document(&mut self) -> PResult<Document> {
69        let start = self.input.cur_span()?;
70
71        self.document = Some(self.create_document());
72
73        self.run()?;
74
75        let document = &mut self.document.take().unwrap();
76        let nodes = document.children.take();
77        let mut children = Vec::with_capacity(nodes.len());
78
79        for node in nodes {
80            children.push(self.node_to_child(node));
81        }
82
83        let last = self.input.last_pos()?;
84
85        Ok(Document {
86            span: Span::new(start.lo(), last),
87            children,
88        })
89    }
90
91    fn create_document(&self) -> RcNode {
92        Node::new(Data::Document, DUMMY_SP)
93    }
94
95    #[allow(clippy::only_used_in_recursion)]
96    fn get_deep_end_span(&mut self, children: &[Child]) -> Option<Span> {
97        match children.last() {
98            Some(Child::DocumentType(DocumentType { span, .. })) => Some(*span),
99            Some(Child::Element(Element { span, children, .. })) => {
100                if span.is_dummy() {
101                    return self.get_deep_end_span(children);
102                }
103
104                Some(*span)
105            }
106            Some(Child::Comment(Comment { span, .. })) => Some(*span),
107            Some(Child::Text(Text { span, .. })) => Some(*span),
108            _ => None,
109        }
110    }
111
112    fn node_to_child(&mut self, node: RcNode) -> Child {
113        let start_span = node.start_span.take();
114
115        match node.data.clone() {
116            Data::DocumentType {
117                name,
118                public_id,
119                system_id,
120                raw,
121            } => Child::DocumentType(DocumentType {
122                span: start_span,
123                name,
124                public_id,
125                system_id,
126                raw,
127            }),
128            Data::Element {
129                tag_name,
130                attributes,
131            } => {
132                let nodes = node.children.take();
133                let mut new_children = Vec::with_capacity(nodes.len());
134
135                for node in nodes {
136                    new_children.push(self.node_to_child(node));
137                }
138
139                let attributes = attributes.take();
140
141                let span = if start_span.is_dummy() {
142                    start_span
143                } else {
144                    let end_span = match node.end_span.take() {
145                        Some(end_span) if !end_span.is_dummy() => end_span,
146                        _ => match self.get_deep_end_span(&new_children) {
147                            Some(end_span) => end_span,
148                            _ => start_span,
149                        },
150                    };
151
152                    Span::new(start_span.lo(), end_span.hi())
153                };
154
155                Child::Element(Element {
156                    span,
157                    tag_name,
158                    attributes,
159                    children: new_children,
160                })
161            }
162            Data::Text { data, raw } => {
163                let span = if let Some(end_span) = node.end_span.take() {
164                    swc_common::Span::new(start_span.lo(), end_span.hi())
165                } else {
166                    start_span
167                };
168
169                Child::Text(Text {
170                    span,
171                    data: data.take().into(),
172                    raw: Some(raw.take().into()),
173                })
174            }
175            Data::Comment { data, raw } => Child::Comment(Comment {
176                span: start_span,
177                data,
178                raw,
179            }),
180            Data::ProcessingInstruction { target, data } => {
181                Child::ProcessingInstruction(ProcessingInstruction {
182                    span: start_span,
183                    target,
184                    data,
185                })
186            }
187            Data::CdataSection { data, raw } => Child::CdataSection(CdataSection {
188                span: start_span,
189                data,
190                raw,
191            }),
192            _ => {
193                unreachable!();
194            }
195        }
196    }
197
198    fn run(&mut self) -> PResult<()> {
199        while !self.stopped {
200            let mut token_and_info = match self.input.cur()? {
201                Some(_) => {
202                    let span = self.input.cur_span()?;
203                    let token = bump!(self);
204
205                    TokenAndInfo {
206                        span: span!(self, span.lo()),
207                        acknowledged: false,
208                        token,
209                    }
210                }
211                None => {
212                    let start_pos = self.input.start_pos()?;
213                    let last_pos = self.input.last_pos()?;
214
215                    TokenAndInfo {
216                        span: Span::new(start_pos, last_pos),
217                        acknowledged: false,
218                        token: Token::Eof,
219                    }
220                }
221            };
222
223            // Re-emit errors from tokenizer
224            for error in self.input.take_errors() {
225                let (span, kind) = *error.into_inner();
226
227                self.errors.push(Error::new(span, kind));
228            }
229
230            self.tree_construction_dispatcher(&mut token_and_info)?;
231        }
232
233        Ok(())
234    }
235
236    fn tree_construction_dispatcher(&mut self, token_and_info: &mut TokenAndInfo) -> PResult<()> {
237        self.process_token(token_and_info, None)
238    }
239
240    fn process_token(
241        &mut self,
242        token_and_info: &mut TokenAndInfo,
243        phase: Option<Phase>,
244    ) -> PResult<()> {
245        let phase = match &phase {
246            Some(phase) => phase,
247            _ => &self.phase,
248        };
249
250        match phase {
251            Phase::StartPhase => match &token_and_info.token {
252                Token::StartTag { .. } => {
253                    let element = self.create_element_for_token(token_and_info.clone());
254
255                    self.append_node(self.document.as_ref().unwrap(), element.clone());
256                    self.open_elements_stack.items.push(element);
257                    self.phase = Phase::MainPhase;
258                }
259                Token::EmptyTag { .. } => {
260                    let element = self.create_element_for_token(token_and_info.clone());
261
262                    self.append_node(self.document.as_ref().unwrap(), element);
263                    self.phase = Phase::EndPhase;
264                }
265                Token::Comment { .. } => {
266                    self.append_comment_to_doc(token_and_info)?;
267                }
268                Token::ProcessingInstruction { .. } => {
269                    self.append_processing_instruction_to_doc(token_and_info)?;
270                }
271                Token::Cdata { .. } => {
272                    self.errors.push(Error::new(
273                        token_and_info.span,
274                        ErrorKind::UnexpectedTokenInStartPhase,
275                    ));
276
277                    self.append_cdata_to_doc(token_and_info)?;
278                }
279                Token::Character { value, .. } => {
280                    if !is_whitespace(*value) {
281                        self.errors.push(Error::new(
282                            token_and_info.span,
283                            ErrorKind::UnexpectedCharacter,
284                        ));
285                    }
286                }
287                Token::Eof => {
288                    self.errors.push(Error::new(
289                        token_and_info.span,
290                        ErrorKind::UnexpectedEofInStartPhase,
291                    ));
292
293                    self.process_token(token_and_info, Some(Phase::EndPhase))?;
294                }
295                Token::Doctype { .. } => {
296                    let document_type = self.create_document_type_for_token(token_and_info);
297
298                    self.append_node(self.document.as_ref().unwrap(), document_type);
299                }
300                _ => {
301                    self.errors.push(Error::new(
302                        token_and_info.span,
303                        ErrorKind::UnexpectedTokenInStartPhase,
304                    ));
305                }
306            },
307            Phase::MainPhase => match &token_and_info.token {
308                Token::Character { .. } => {
309                    self.append_character_to_current_element(token_and_info)?;
310                }
311                Token::StartTag { .. } => {
312                    let element = self.create_element_for_token(token_and_info.clone());
313
314                    self.append_node(self.get_current_element(), element.clone());
315                    self.open_elements_stack.items.push(element);
316                }
317                Token::EmptyTag { .. } => {
318                    let element = self.create_element_for_token(token_and_info.clone());
319
320                    self.append_node(self.get_current_element(), element);
321                }
322                Token::EndTag { tag_name, .. } => {
323                    if get_tag_name!(self.get_current_element()) != tag_name {
324                        self.errors.push(Error::new(
325                            token_and_info.span,
326                            ErrorKind::OpeningAndEndingTagMismatch,
327                        ));
328                    }
329
330                    let is_closed = self
331                        .open_elements_stack
332                        .items
333                        .iter()
334                        .rev()
335                        .any(|node| get_tag_name!(node) == tag_name);
336
337                    if is_closed {
338                        let popped = self
339                            .open_elements_stack
340                            .pop_until_tag_name_popped(&[tag_name]);
341
342                        self.update_end_tag_span(popped.as_ref(), token_and_info.span);
343                    }
344
345                    if self.open_elements_stack.items.is_empty() {
346                        self.phase = Phase::EndPhase;
347                    }
348                }
349                Token::Comment { .. } => {
350                    let comment = self.create_comment(token_and_info);
351
352                    self.append_node(self.get_current_element(), comment);
353                }
354                Token::ProcessingInstruction { .. } => {
355                    let processing_instruction = self.create_processing_instruction(token_and_info);
356
357                    self.append_node(self.get_current_element(), processing_instruction);
358                }
359                Token::Cdata { .. } => {
360                    let cdata = self.create_cdata_section(token_and_info);
361
362                    self.append_node(self.get_current_element(), cdata);
363                }
364                Token::Eof => {
365                    self.errors.push(Error::new(
366                        token_and_info.span,
367                        ErrorKind::UnexpectedEofInMainPhase,
368                    ));
369
370                    self.process_token(token_and_info, Some(Phase::EndPhase))?;
371                }
372                _ => {
373                    self.errors.push(Error::new(
374                        token_and_info.span,
375                        ErrorKind::UnexpectedTokenInMainPhase,
376                    ));
377                }
378            },
379            Phase::EndPhase => match &token_and_info.token {
380                Token::Comment { .. } => {
381                    self.append_comment_to_doc(token_and_info)?;
382                }
383                Token::ProcessingInstruction { .. } => {
384                    self.append_processing_instruction_to_doc(token_and_info)?;
385                }
386                Token::Cdata { .. } => {
387                    self.errors.push(Error::new(
388                        token_and_info.span,
389                        ErrorKind::UnexpectedTokenInEndPhase,
390                    ));
391
392                    self.append_cdata_to_doc(token_and_info)?;
393                }
394                Token::Character { value, .. } => {
395                    if !is_whitespace(*value) {
396                        self.errors.push(Error::new(
397                            token_and_info.span,
398                            ErrorKind::UnexpectedCharacter,
399                        ));
400                    }
401                }
402                Token::Eof => {
403                    self.stopped = true;
404                }
405                _ => {
406                    self.errors.push(Error::new(
407                        token_and_info.span,
408                        ErrorKind::UnexpectedTokenInEndPhase,
409                    ));
410                }
411            },
412        }
413
414        Ok(())
415    }
416
417    fn append_node(&self, parent: &RcNode, child: RcNode) {
418        let previous_parent = child.parent.replace(Some(Rc::downgrade(parent)));
419
420        // Invariant: child cannot have existing parent
421        assert!(previous_parent.is_none());
422
423        parent.children.borrow_mut().push(child);
424    }
425
426    fn get_current_element(&self) -> &RcNode {
427        self.open_elements_stack
428            .items
429            .last()
430            .expect("no current element")
431    }
432
433    fn create_document_type_for_token(&self, token_and_info: &mut TokenAndInfo) -> RcNode {
434        let (name, public_id, system_id, raw) = match &token_and_info.token {
435            Token::Doctype {
436                name,
437                public_id,
438                system_id,
439                raw,
440            } => (
441                name.clone(),
442                public_id.clone(),
443                system_id.clone(),
444                raw.clone(),
445            ),
446            _ => {
447                unreachable!()
448            }
449        };
450
451        Node::new(
452            Data::DocumentType {
453                name,
454                public_id,
455                system_id,
456                raw,
457            },
458            token_and_info.span,
459        )
460    }
461
462    fn create_element_for_token(&self, token_and_info: TokenAndInfo) -> RcNode {
463        let element = match token_and_info.token {
464            Token::StartTag {
465                tag_name,
466                attributes,
467                ..
468            }
469            | Token::EndTag {
470                tag_name,
471                attributes,
472                ..
473            }
474            | Token::EmptyTag {
475                tag_name,
476                attributes,
477                ..
478            } => {
479                let attributes = attributes
480                    .into_iter()
481                    .map(|attribute_token| Attribute {
482                        span: attribute_token.span,
483                        namespace: None,
484                        prefix: None,
485                        name: attribute_token.name,
486                        raw_name: attribute_token.raw_name,
487                        value: attribute_token.value,
488                        raw_value: attribute_token.raw_value,
489                    })
490                    .collect();
491
492                Data::Element {
493                    tag_name,
494                    attributes: RefCell::new(attributes),
495                }
496            }
497            _ => {
498                unreachable!();
499            }
500        };
501
502        Node::new(element, token_and_info.span)
503    }
504
505    fn append_character_to_current_element(
506        &mut self,
507        token_and_info: &mut TokenAndInfo,
508    ) -> PResult<()> {
509        if let Some(last) = self.open_elements_stack.items.last() {
510            let children = last.children.borrow();
511
512            if let Some(last) = children.last() {
513                if let Data::Text {
514                    data,
515                    raw: raw_data,
516                } = &last.data
517                {
518                    match &token_and_info.token {
519                        Token::Character {
520                            value: c,
521                            raw: raw_c,
522                        } => {
523                            data.borrow_mut().push(*c);
524
525                            if let Some(raw_c) = raw_c {
526                                raw_data.borrow_mut().push_str(raw_c);
527                            }
528                        }
529                        _ => {
530                            unreachable!();
531                        }
532                    }
533
534                    let mut span = last.end_span.borrow_mut();
535
536                    *span = Some(token_and_info.span);
537
538                    return Ok(());
539                }
540            }
541        }
542
543        let (data, raw) = match &token_and_info.token {
544            Token::Character {
545                value: c,
546                raw: raw_c,
547            } => {
548                let mut data = String::with_capacity(255);
549
550                data.push(*c);
551
552                let mut raw = String::with_capacity(255);
553
554                if let Some(raw_c) = raw_c {
555                    raw.push_str(raw_c);
556                }
557
558                (RefCell::new(data), RefCell::new(raw))
559            }
560            _ => {
561                unreachable!()
562            }
563        };
564
565        let text = Node::new(Data::Text { data, raw }, token_and_info.span);
566
567        self.append_node(self.get_current_element(), text);
568
569        Ok(())
570    }
571
572    fn create_comment(&self, token_and_info: &mut TokenAndInfo) -> RcNode {
573        let (data, raw) = match &token_and_info.token {
574            Token::Comment { data, raw } => (data.clone(), Some(raw.clone())),
575            _ => {
576                unreachable!()
577            }
578        };
579
580        Node::new(Data::Comment { data, raw }, token_and_info.span)
581    }
582
583    fn append_comment_to_doc(&mut self, token_and_info: &mut TokenAndInfo) -> PResult<()> {
584        let comment = self.create_comment(token_and_info);
585
586        self.append_node(self.document.as_ref().unwrap(), comment);
587
588        Ok(())
589    }
590
591    fn create_processing_instruction(&self, token_and_info: &mut TokenAndInfo) -> RcNode {
592        let (target, data) = match &token_and_info.token {
593            Token::ProcessingInstruction { target, data } => (target.clone(), data.clone()),
594            _ => {
595                unreachable!()
596            }
597        };
598
599        Node::new(
600            Data::ProcessingInstruction { target, data },
601            token_and_info.span,
602        )
603    }
604
605    fn append_processing_instruction_to_doc(
606        &mut self,
607        token_and_info: &mut TokenAndInfo,
608    ) -> PResult<()> {
609        let child = self.create_processing_instruction(token_and_info);
610
611        self.append_node(self.document.as_ref().unwrap(), child);
612
613        Ok(())
614    }
615
616    fn create_cdata_section(&self, token_and_info: &mut TokenAndInfo) -> RcNode {
617        let (data, raw) = match &token_and_info.token {
618            Token::Cdata { data, raw } => (data.clone(), Some(raw.clone())),
619            _ => {
620                unreachable!()
621            }
622        };
623
624        Node::new(Data::CdataSection { data, raw }, token_and_info.span)
625    }
626
627    fn append_cdata_to_doc(&mut self, token_and_info: &mut TokenAndInfo) -> PResult<()> {
628        let child = self.create_cdata_section(token_and_info);
629
630        self.append_node(self.document.as_ref().unwrap(), child);
631
632        Ok(())
633    }
634
635    fn update_end_tag_span(&self, node: Option<&RcNode>, span: Span) {
636        if let Some(node) = node {
637            if node.start_span.borrow().is_dummy() {
638                return;
639            }
640
641            let mut end_tag_span = node.end_span.borrow_mut();
642
643            *end_tag_span = Some(span);
644        }
645    }
646}
647
/// Returns `true` for the characters the parser treats as whitespace: tab,
/// carriage return, line feed, form feed and space.
fn is_whitespace(c: char) -> bool {
    const WHITESPACE: [char; 5] = ['\t', '\r', '\n', '\x0C', ' '];

    WHITESPACE.contains(&c)
}