swc_ecma_lexer/common/lexer/
char.rs

1use std::iter::FusedIterator;
2
3use arrayvec::ArrayVec;
4
/// A character code stored as a raw `u32`.
///
/// Unlike `char`, the wrapped number is not validated, so it may hold a value
/// that is not a Unicode scalar value (see `From<u32>`).
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Char(u32);
7
8impl From<char> for Char {
9    fn from(c: char) -> Self {
10        Char(c as u32)
11    }
12}
13
14impl From<u32> for Char {
15    fn from(c: u32) -> Self {
16        Char(c)
17    }
18}
19
20pub struct CharIter(ArrayVec<char, 12>);
21
22/// Ported from https://github.com/web-infra-dev/oxc/blob/99a4816ce7b6132b2667257984f9d92ae3768f03/crates/oxc_parser/src/lexer/mod.rs#L1349-L1374
23impl IntoIterator for Char {
24    type IntoIter = CharIter;
25    type Item = char;
26
27    #[allow(unsafe_code)]
28    fn into_iter(self) -> Self::IntoIter {
29        //        // TODO: Check if this is correct
30        //        fn to_char(v: u8) -> char {
31        //            char::from_digit(v as _, 16).unwrap_or('0')
32        //        }
33
34        CharIter(match char::from_u32(self.0) {
35            Some(c) => {
36                let mut buf = ArrayVec::new();
37                // Safety: we can make sure that `buf` has enough capacity
38                unsafe {
39                    buf.push_unchecked(c);
40                }
41                buf
42            }
43            None => {
44                let mut buf = ArrayVec::new();
45
46                let high = self.0 & 0xffff0000 >> 16;
47
48                let low = self.0 & 0x0000ffff;
49
50                // The second code unit of a surrogate pair is always in the range from 0xDC00
51                // to 0xDFFF, and is called a low surrogate or a trail surrogate.
52                if !(0xdc00..=0xdfff).contains(&low) {
53                    // Safety: we can make sure that `buf` has enough capacity
54                    unsafe {
55                        buf.push_unchecked('\\');
56                        buf.push_unchecked('u');
57                        for c in format!("{high:x}").chars() {
58                            buf.push_unchecked(c);
59                        }
60                        buf.push_unchecked('\\');
61                        buf.push_unchecked('u');
62                        for c in format!("{low:x}").chars() {
63                            buf.push_unchecked(c);
64                        }
65                    }
66                } else {
67                    // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair`
68                    let astral_code_point = (high - 0xd800) * 0x400 + low - 0xdc00 + 0x10000;
69
70                    // Safety: we can make sure that `buf` has enough capacity
71                    unsafe {
72                        buf.push_unchecked('\\');
73                        buf.push_unchecked('u');
74                        for c in format!("{astral_code_point:x}").chars() {
75                            buf.push_unchecked(c);
76                        }
77                    }
78                }
79
80                buf
81            }
82        })
83    }
84}
85
86impl Iterator for CharIter {
87    type Item = char;
88
89    fn next(&mut self) -> Option<Self::Item> {
90        if self.0.is_empty() {
91            None
92        } else {
93            Some(self.0.remove(0))
94        }
95    }
96}
97
// `next` only ever removes characters from the buffer, so once the buffer is
// empty and `None` has been returned, every later call returns `None` too.
impl FusedIterator for CharIter {}
99
100/// Implemented for `char`.
101pub trait CharExt: Copy {
102    fn to_char(self) -> Option<char>;
103
104    /// Test whether a given character code starts an identifier.
105    ///
106    /// https://tc39.github.io/ecma262/#prod-IdentifierStart
107    #[inline]
108    fn is_ident_start(self) -> bool {
109        let c = match self.to_char() {
110            Some(c) => c,
111            None => return false,
112        };
113        swc_ecma_ast::Ident::is_valid_start(c)
114    }
115
116    /// Test whether a given character is part of an identifier.
117    #[inline]
118    fn is_ident_part(self) -> bool {
119        let c = match self.to_char() {
120            Some(c) => c,
121            None => return false,
122        };
123        swc_ecma_ast::Ident::is_valid_continue(c)
124    }
125
126    /// See https://tc39.github.io/ecma262/#sec-line-terminators
127    #[inline]
128    fn is_line_terminator(self) -> bool {
129        let c = match self.to_char() {
130            Some(c) => c,
131            None => return false,
132        };
133        matches!(c, '\r' | '\n' | '\u{2028}' | '\u{2029}')
134    }
135
136    /// See https://tc39.github.io/ecma262/#sec-literals-string-literals
137    #[inline]
138    fn is_line_break(self) -> bool {
139        let c = match self.to_char() {
140            Some(c) => c,
141            None => return false,
142        };
143        matches!(c, '\r' | '\n')
144    }
145
146    /// See https://tc39.github.io/ecma262/#sec-white-space
147    #[inline]
148    fn is_ws(self) -> bool {
149        let c = match self.to_char() {
150            Some(c) => c,
151            None => return false,
152        };
153        match c {
154            '\u{0009}' | '\u{000b}' | '\u{000c}' | '\u{0020}' | '\u{00a0}' | '\u{feff}' => true,
155            _ => {
156                if self.is_line_terminator() {
157                    // NOTE: Line terminator is not whitespace.
158                    false
159                } else {
160                    c.is_whitespace()
161                }
162            }
163        }
164    }
165}
166
/// Converts the stored `u32` with `char::from_u32`, so a value that is not a
/// valid `char` gives `None`.
impl CharExt for Char {
    #[inline(always)]
    fn to_char(self) -> Option<char> {
        char::from_u32(self.0)
    }
}
173
/// A `char` needs no conversion, so `to_char` always returns `Some`.
impl CharExt for char {
    #[inline(always)]
    fn to_char(self) -> Option<char> {
        Some(self)
    }
}