swc_ecma_parser/lexer/
whitespace.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/// Returns true if it's done
pub(super) type ByteHandler = Option<for<'aa> fn(&mut SkipWhitespace<'aa>) -> u32>;

/// Lookup table for whitespace
static BYTE_HANDLERS: [ByteHandler; 256] = [
    //   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F   //
    ___, ___, ___, ___, ___, ___, ___, ___, ___, SPC, NLN, SPC, SPC, NLN, ___, ___, // 0
    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1
    SPC, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 2
    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 3
    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 4
    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 5
    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 6
    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 7
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F
];

/// Stop
const ___: ByteHandler = None;

/// Newline
const NLN: ByteHandler = Some(|skip| {
    skip.newline = true;

    1
});

/// Space
const SPC: ByteHandler = Some(|_| 1);

/// Unicode
const UNI: ByteHandler = Some(|skip| {
    // Check byte patterns directly for more efficient Unicode character processing
    let bytes = skip.input.as_bytes();
    let i = skip.offset as usize;

    // Check available bytes
    let remaining_bytes = bytes.len() - i;
    if remaining_bytes < 1 {
        return 0;
    }

    // Predict UTF-8 character length from the first byte
    let first_byte = unsafe { *bytes.get_unchecked(i) };
    let char_len = if first_byte < 128 {
        1
    } else if first_byte < 224 {
        if remaining_bytes < 2 {
            return 0;
        }
        2
    } else if first_byte < 240 {
        if remaining_bytes < 3 {
            return 0;
        }
        3
    } else {
        if remaining_bytes < 4 {
            return 0;
        }
        4
    };

    // Fast path for common Unicode whitespace characters
    // Check UTF-8 byte patterns directly
    if char_len == 3 {
        // LSEP (U+2028) - Line Separator: E2 80 A8
        if first_byte == 0xe2
            && unsafe { *bytes.get_unchecked(i + 1) } == 0x80
            && unsafe { *bytes.get_unchecked(i + 2) } == 0xa8
        {
            skip.newline = true;
            return 3;
        }

        // PSEP (U+2029) - Paragraph Separator: E2 80 A9
        if first_byte == 0xe2
            && unsafe { *bytes.get_unchecked(i + 1) } == 0x80
            && unsafe { *bytes.get_unchecked(i + 2) } == 0xa9
        {
            skip.newline = true;
            return 3;
        }
    }

    // Process with general method if not handled by fast path
    let s = unsafe {
        // Safety: `skip.offset` is always valid
        skip.input.get_unchecked(skip.offset as usize..)
    };

    let c = unsafe {
        // Safety: byte handlers are only called when `skip.input` is not empty
        s.chars().next().unwrap_unchecked()
    };

    match c {
        // Byte Order Mark (BOM)
        '\u{feff}' => {}
        // Line break characters already handled above
        '\u{2028}' | '\u{2029}' => {
            skip.newline = true;
        }
        // Other whitespace characters
        _ if c.is_whitespace() => {}
        // Not a whitespace character
        _ => return 0,
    }

    c.len_utf8() as u32
});

/// API is taked from oxc by Boshen (https://github.com/Boshen/oxc/pull/26)
pub(super) struct SkipWhitespace<'a> {
    pub input: &'a str,

    /// Total offset
    pub offset: u32,

    /// Found newline
    pub newline: bool,
}

impl SkipWhitespace<'_> {
    #[inline(always)]
    pub fn scan(&mut self) {
        let bytes = self.input.as_bytes();
        let len = bytes.len();
        let mut pos = self.offset as usize;

        // Optimization: return immediately if input is empty
        if pos >= len {
            return;
        }

        loop {
            // Optimization 1: Process consecutive spaces (most common case) at once
            let mut byte = unsafe { *bytes.get_unchecked(pos) };

            // Handle consecutive space characters (very common case)
            if byte == b' ' {
                pos += 1;
                // Skip spaces repeatedly (process multiple spaces at once)
                while pos < len && unsafe { *bytes.get_unchecked(pos) } == b' ' {
                    pos += 1;
                }

                // Check if we've reached the end of input
                if pos >= len {
                    break;
                }

                // Get current byte again
                byte = unsafe { *bytes.get_unchecked(pos) };
            }

            // Optimization 2: Handle other common whitespace characters
            match byte {
                b'\n' => {
                    pos += 1;
                    self.newline = true;

                    if pos >= len {
                        break;
                    }
                    continue;
                }
                b'\t' => {
                    pos += 1;

                    if pos >= len {
                        break;
                    }
                    continue;
                }
                b'\r' => {
                    pos += 1;

                    // Handle CR+LF sequence (Windows line break)
                    if pos < len && unsafe { *bytes.get_unchecked(pos) } == b'\n' {
                        pos += 1;
                        self.newline = true;
                    } else {
                        self.newline = true; // Treat standalone CR as line
                                             // break too
                    }

                    if pos >= len {
                        break;
                    }
                    continue;
                }
                // Case where handler is needed
                _ => {
                    // Temporarily update offset
                    self.offset = pos as u32;

                    // Use handler table
                    let handler = unsafe { BYTE_HANDLERS.get_unchecked(byte as usize) };

                    match handler {
                        Some(handler) => {
                            let delta = handler(self);
                            if delta == 0 {
                                // Non-whitespace character found
                                // offset is already updated
                                return;
                            }
                            pos = self.offset as usize + delta as usize;

                            if pos >= len {
                                break;
                            }
                        }
                        None => {
                            // Non-whitespace character found
                            // offset is already updated
                            return;
                        }
                    }
                }
            }
        }

        // Update offset to final position
        self.offset = pos as u32;
    }
}