swc_ecma_lexer/common/lexer/
whitespace.rs

1/// Returns true if it's done
2type ByteHandler = Option<for<'aa> fn(&mut SkipWhitespace<'aa>) -> u32>;
3
4/// Lookup table for whitespace
5static BYTE_HANDLERS: [ByteHandler; 256] = [
6    //   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F   //
7    ___, ___, ___, ___, ___, ___, ___, ___, ___, SPC, NLN, SPC, SPC, NLN, ___, ___, // 0
8    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1
9    SPC, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 2
10    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 3
11    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 4
12    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 5
13    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 6
14    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 7
15    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
16    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
17    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
18    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B
19    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C
20    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D
21    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E
22    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F
23];
24
25/// Stop
26const ___: ByteHandler = None;
27
28/// Newline
29const NLN: ByteHandler = Some(|skip| {
30    skip.newline = true;
31
32    1
33});
34
35/// Space
36const SPC: ByteHandler = Some(|_| 1);
37
38/// Unicode
39const UNI: ByteHandler = Some(|skip| {
40    // Check byte patterns directly for more efficient Unicode character processing
41    let bytes = skip.input.as_bytes();
42    let i = skip.offset as usize;
43
44    // Check available bytes
45    let remaining_bytes = bytes.len() - i;
46    if remaining_bytes < 1 {
47        return 0;
48    }
49
50    // Predict UTF-8 character length from the first byte
51    let first_byte = unsafe { *bytes.get_unchecked(i) };
52    let char_len = if first_byte < 128 {
53        1
54    } else if first_byte < 224 {
55        if remaining_bytes < 2 {
56            return 0;
57        }
58        2
59    } else if first_byte < 240 {
60        if remaining_bytes < 3 {
61            return 0;
62        }
63        3
64    } else {
65        if remaining_bytes < 4 {
66            return 0;
67        }
68        4
69    };
70
71    // Fast path for common Unicode whitespace characters
72    // Check UTF-8 byte patterns directly
73    if char_len == 3 {
74        // LSEP (U+2028) - Line Separator: E2 80 A8
75        if first_byte == 0xe2
76            && unsafe { *bytes.get_unchecked(i + 1) } == 0x80
77            && unsafe { *bytes.get_unchecked(i + 2) } == 0xa8
78        {
79            skip.newline = true;
80            return 3;
81        }
82
83        // PSEP (U+2029) - Paragraph Separator: E2 80 A9
84        if first_byte == 0xe2
85            && unsafe { *bytes.get_unchecked(i + 1) } == 0x80
86            && unsafe { *bytes.get_unchecked(i + 2) } == 0xa9
87        {
88            skip.newline = true;
89            return 3;
90        }
91    }
92
93    // Process with general method if not handled by fast path
94    let s = unsafe {
95        // Safety: `skip.offset` is always valid
96        skip.input.get_unchecked(skip.offset as usize..)
97    };
98
99    let c = unsafe {
100        // Safety: byte handlers are only called when `skip.input` is not empty
101        s.chars().next().unwrap_unchecked()
102    };
103
104    match c {
105        // Byte Order Mark (BOM)
106        '\u{feff}' => {}
107        // Line break characters already handled above
108        '\u{2028}' | '\u{2029}' => {
109            skip.newline = true;
110        }
111        // Other whitespace characters
112        _ if c.is_whitespace() => {}
113        // Not a whitespace character
114        _ => return 0,
115    }
116
117    c.len_utf8() as u32
118});
119
120/// API is taked from oxc by Boshen (https://github.com/Boshen/oxc/pull/26)
121pub(super) struct SkipWhitespace<'a> {
122    pub input: &'a str,
123
124    /// Total offset
125    pub offset: u32,
126
127    /// Found newline
128    pub newline: bool,
129}
130
131impl SkipWhitespace<'_> {
132    #[inline(always)]
133    pub fn scan(&mut self) {
134        let bytes = self.input.as_bytes();
135        let len = bytes.len();
136        let mut pos = self.offset as usize;
137        debug_assert!(pos == 0);
138        debug_assert!(pos <= len);
139
140        // Optimization: return immediately if input is empty
141        if pos == len {
142            return;
143        }
144
145        loop {
146            // Optimization 1: Process consecutive spaces (most common case) at once
147            let mut byte = unsafe { *bytes.get_unchecked(pos) };
148
149            // Handle consecutive space characters (very common case)
150            if byte == b' ' {
151                pos += 1;
152                // Skip spaces repeatedly (process multiple spaces at once)
153                while pos < len && unsafe { *bytes.get_unchecked(pos) } == b' ' {
154                    pos += 1;
155                }
156
157                // Check if we've reached the end of input
158                if pos >= len {
159                    break;
160                }
161
162                // Get current byte again
163                byte = unsafe { *bytes.get_unchecked(pos) };
164            }
165
166            // Optimization 2: Handle other common whitespace characters
167            match byte {
168                b'\n' => {
169                    pos += 1;
170                    self.newline = true;
171
172                    if pos >= len {
173                        break;
174                    }
175                    continue;
176                }
177                b'\r' => {
178                    pos += 1;
179
180                    // Handle CR+LF sequence (Windows line break)
181                    if pos < len && unsafe { *bytes.get_unchecked(pos) } == b'\n' {
182                        pos += 1;
183                        self.newline = true;
184                    } else {
185                        self.newline = true; // Treat standalone CR as line
186                                             // break too
187                    }
188
189                    if pos >= len {
190                        break;
191                    }
192                    continue;
193                }
194                // Case where handler is needed
195                _ => {
196                    debug_assert!(byte != b' ' && byte != b'\n' && byte != b'\r');
197                    // Temporarily update offset
198                    self.offset = pos as u32;
199
200                    // Use handler table
201                    let handler = unsafe { BYTE_HANDLERS.get_unchecked(byte as usize) };
202
203                    match handler {
204                        Some(handler) => {
205                            let delta = handler(self);
206                            if delta == 0 {
207                                // Non-whitespace character found
208                                // offset is already updated
209                                return;
210                            }
211                            pos = (self.offset + delta) as usize;
212
213                            if pos >= len {
214                                break;
215                            }
216                        }
217                        None => {
218                            // Non-whitespace character found
219                            // offset is already updated
220                            return;
221                        }
222                    }
223                }
224            }
225        }
226
227        // Update offset to final position
228        self.offset = pos as u32;
229    }
230}
swc_ecma_lexer/common/lexer/whitespace.rs

swc_ecma_lexer/common/lexer/
whitespace.rs