swc_ecma_fast_parser/lexer/
string.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
//! String literals processing for the lexer
//!
//! This module handles the parsing of string literals in ECMAScript/TypeScript.

use swc_atoms::Atom;
use swc_common::Span;
use wide::u8x16;

use super::Lexer;
use crate::{
    error::{Error, ErrorKind, Result},
    token::{Token, TokenType, TokenValue},
};

// Lookup table for single-character escape sequences, indexed by the ASCII
// byte that follows the backslash. An entry of 0 means the table has no
// replacement for that byte (the `'0'` entry is therefore indistinguishable
// from "no entry" and is listed only for completeness).
static ESCAPE_LOOKUP: [u8; 128] = {
    // (byte after the backslash, byte it stands for)
    const PAIRS: [(u8, u8); 11] = [
        (b'\\', b'\\'),
        (b'n', b'\n'),
        (b'r', b'\r'),
        (b't', b'\t'),
        (b'b', b'\x08'),
        (b'f', b'\x0C'),
        (b'v', b'\x0B'),
        (b'\'', b'\''),
        (b'"', b'"'),
        (b'`', b'`'),
        (b'0', b'\0'),
    ];

    let mut lut = [0u8; 128];
    let mut i = 0;
    // `for` loops are not available in const evaluation, hence the manual index.
    while i < PAIRS.len() {
        let (escape, replacement) = PAIRS[i];
        lut[escape as usize] = replacement;
        i += 1;
    }
    lut
};

// SIMD vectors for the bytes that stop the fast string scan.
//
// Each static is a 16-lane vector with every lane set to the same byte
// (backslash, line feed, carriage return). `find_string_end_simd` compares a
// 16-byte chunk of input against each of them with `cmp_eq`.
//
// NOTE(review): "SMID" in these names looks like a typo for "SIMD"; the names
// are referenced from `find_string_end_simd`, so rename both sides together.
static BACKSLASH_SMID_VEC: u8x16 = u8x16::new([b'\\'; 16]);
static NEWLINE_SMID_VEC: u8x16 = u8x16::new([b'\n'; 16]);
static CARRIAGE_SMID_VEC: u8x16 = u8x16::new([b'\r'; 16]);

// Scratch buffer for the decoded value of a string literal that needs escape
// processing. `thread_local!` gives every thread its own instance, created
// with room for 1024 bytes; `read_string` clears and refills it for each such
// literal and builds the resulting `Atom` from its contents.
thread_local! {
    static STRING_BUFFER: std::cell::RefCell<Vec<u8>> = std::cell::RefCell::new(Vec::with_capacity(1024));
}

impl Lexer<'_> {
    /// Read a string literal
    #[inline]
    pub(super) fn read_string(&mut self, quote: u8) -> Result<Token> {
        let start_pos = self.start_pos;

        // Skip the opening quote
        self.cursor.advance();

        // Fast path: if there are no escape sequences or line terminators, we can
        // directly extract the string without processing each character
        let mut has_escapes = false;

        // Try to find the closing quote
        match self.find_string_end(quote) {
            Some(end) => {
                // Fast path - no escapes
                let end_pos = self.cursor.position() + end;
                // Skip to the end quote
                self.cursor.advance_n(end);
                // Skip the closing quote
                self.cursor.advance();
                end_pos
            }
            None => {
                // Slower path - contains escapes, line terminators, or unterminated
                has_escapes = true;

                // Process each character using a local buffer
                let mut buffer = Vec::with_capacity(128);
                let mut found_closing_quote = false;

                while let Some(ch) = self.cursor.peek() {
                    // Check for unterminated string
                    if self.cursor.is_eof() {
                        let span = Span::new(start_pos, self.cursor.pos());
                        return Err(Error {
                            kind: ErrorKind::UnterminatedString,
                            span,
                        });
                    }

                    // Check for closing quote
                    if ch == quote {
                        self.cursor.advance();
                        found_closing_quote = true;
                        break;
                    }

                    // Check for line terminators (not allowed in strings)
                    if ch == b'\r' || ch == b'\n' {
                        let span = Span::new(start_pos, self.cursor.pos());
                        return Err(Error {
                            kind: ErrorKind::UnterminatedString,
                            span,
                        });
                    }

                    // Check for escape sequences
                    if ch == b'\\' {
                        self.cursor.advance();

                        // Get the next character
                        match self.cursor.peek() {
                            Some(b'x') => {
                                // Hexadecimal escape \xNN
                                self.cursor.advance();
                                let hex_value = self.read_hex_escape(2)? as u8;
                                buffer.push(hex_value);
                            }
                            Some(b'u') => {
                                // Unicode escape \uNNNN or \u{NNNNNN}
                                self.cursor.advance();
                                let code_point_char = self.read_unicode_escape()?;
                                let mut utf8_buf = [0u8; 4];
                                let utf8_str = code_point_char.encode_utf8(&mut utf8_buf);
                                buffer.extend_from_slice(utf8_str.as_bytes());
                            }
                            Some(escape_char @ 0..=127) => {
                                // Simple escape sequence
                                self.cursor.advance();
                                let replacement = ESCAPE_LOOKUP[escape_char as usize];
                                if replacement != 0 {
                                    buffer.push(replacement);
                                } else if (b'0'..=b'7').contains(&escape_char) {
                                    // Octal escape (legacy)
                                    buffer.push(self.read_octal_escape(escape_char)?);
                                } else {
                                    // Any other character is escaped as itself
                                    buffer.push(escape_char);
                                }
                            }
                            Some(ch) => {
                                // Any other escape sequence
                                self.cursor.advance();
                                buffer.push(ch);
                            }
                            None => {
                                // Unterminated escape sequence
                                let span = Span::new(start_pos, self.cursor.pos());
                                return Err(Error {
                                    kind: ErrorKind::UnterminatedString,
                                    span,
                                });
                            }
                        }
                    } else {
                        // Regular character
                        buffer.push(ch);
                        self.cursor.advance();
                    }
                }

                if !found_closing_quote {
                    let span = Span::new(start_pos, self.cursor.pos());
                    return Err(Error {
                        kind: ErrorKind::UnterminatedString,
                        span,
                    });
                }

                // Save the buffer in thread_local for reuse
                STRING_BUFFER.with(|tls_buffer| {
                    let mut tls = tls_buffer.borrow_mut();
                    tls.clear();
                    tls.extend_from_slice(&buffer);
                });

                self.cursor.position()
            }
        };

        // Extract the raw string (including quotes)
        let raw_start = start_pos.0;
        let raw_end = self.cursor.position();
        let raw_bytes = unsafe { self.cursor.slice_unchecked(raw_start, raw_end) };
        let raw_str = unsafe { std::str::from_utf8_unchecked(raw_bytes) };

        // Extract the string value if we used the fast path
        let string_value = if has_escapes {
            // Use the thread-local buffer for the string value
            STRING_BUFFER.with(|buffer| {
                let buffer = buffer.borrow();
                Atom::from(unsafe { std::str::from_utf8_unchecked(&buffer) })
            })
        } else {
            // Direct extraction (excluding quotes)
            let value_bytes = unsafe { self.cursor.slice_unchecked(raw_start + 1, raw_end - 1) };
            Atom::from(unsafe { std::str::from_utf8_unchecked(value_bytes) })
        };

        // Create token
        let span = Span::new(start_pos, self.cursor.pos());

        Ok(Token::new(
            TokenType::Str,
            span,
            bool::from(self.had_line_break),
            TokenValue::Str {
                value: string_value,
                raw: Atom::from(raw_str),
            },
        ))
    }

    /// Locate the closing quote of the string that starts at the cursor.
    ///
    /// Returns the offset of the first unescaped `quote` within
    /// `self.cursor.rest()`. Escape sequences are stepped over, not decoded.
    /// Returns `None` when neither scanner finds such a quote.
    #[inline]
    fn find_string_end(&self, quote: u8) -> Option<u32> {
        let remaining = self.cursor.rest();
        let scan_from = 0;

        // Prefer the SIMD scanner; it answers `None` both for inputs it
        // declines to handle and for "no closing quote", so retry with the
        // scalar scanner in that case.
        if let Some(offset) = self.find_string_end_simd(scan_from, remaining, quote) {
            return Some(offset);
        }
        self.find_string_end_standard(scan_from, remaining, quote)
    }

    /// Chunked scan for the end of a string, 16 bytes at a time.
    ///
    /// Skips over 16-byte chunks of `rest` (starting at `start_pos`) that
    /// contain no `quote`, backslash, LF or CR, then hands over to
    /// `find_string_end_standard` at the first such byte, or at the tail
    /// shorter than 16 bytes. Returns `None` without scanning when `rest` is
    /// shorter than 32 bytes or `start_pos` is out of range.
    #[inline]
    fn find_string_end_simd(&self, start_pos: u32, rest: &[u8], quote: u8) -> Option<u32> {
        let total = rest.len() as u32;

        // Small inputs are left to the scalar scanner
        if rest.len() < 32 || start_pos >= total {
            return None;
        }

        let quote_lanes = u8x16::splat(quote);
        let mut offset = start_pos;

        while offset + 16 <= total {
            // Load the next 16 bytes into a vector
            let mut lane_bytes = [0u8; 16];
            lane_bytes.copy_from_slice(&rest[offset as usize..(offset + 16) as usize]);
            let lanes = u8x16::new(lane_bytes);

            // A lane is non-zero where the byte is a quote, backslash, LF or CR
            let hits = lanes.cmp_eq(quote_lanes)
                | lanes.cmp_eq(BACKSLASH_SMID_VEC)
                | lanes.cmp_eq(NEWLINE_SMID_VEC)
                | lanes.cmp_eq(CARRIAGE_SMID_VEC);

            // First interesting byte in this chunk: continue byte-wise from it
            if let Some(lane) = hits.to_array().iter().position(|&m| m != 0) {
                return self.find_string_end_standard(offset + lane as u32, rest, quote);
            }

            // Nothing of interest in this chunk
            offset += 16;
        }

        // Fewer than 16 bytes left: finish with the scalar scanner
        if offset < total {
            self.find_string_end_standard(offset, rest, quote)
        } else {
            None
        }
    }

    /// Byte-wise scan for the end of a string.
    ///
    /// Walks `rest` from `start_pos` and returns the index of the first
    /// `quote` byte that is not preceded by a backslash escape. A backslash
    /// and the byte after it are skipped as a pair without being decoded.
    /// Returns `None` when an unescaped LF/CR is met, when `rest` ends
    /// (including right after a backslash), or when `start_pos` is out of
    /// range.
    #[inline]
    fn find_string_end_standard(&self, start_pos: u32, rest: &[u8], quote: u8) -> Option<u32> {
        let total = rest.len() as u32;
        let mut idx = start_pos;

        while idx < total {
            // SAFETY: the loop condition guarantees `idx < rest.len()`.
            let byte = unsafe { *rest.get_unchecked(idx as usize) };

            match byte {
                b'\\' => {
                    // A trailing backslash has nothing to escape: unterminated
                    if idx + 1 >= total {
                        return None;
                    }
                    // Step over the backslash and the escaped byte together
                    idx += 2;
                }
                // Found unescaped quote
                b if b == quote => return Some(idx),
                // Line terminator in string is an error
                b'\n' | b'\r' => return None,
                _ => idx += 1,
            }
        }

        // Ran out of input (or never had any): unterminated
        None
    }

    /// Read the rest of a legacy octal escape whose first digit, `first`, has
    /// already been consumed.
    ///
    /// Consumes at most two further octal digits, stopping before any digit
    /// that would push the value above 255, and returns the accumulated
    /// value. Never returns an error itself.
    #[inline]
    fn read_octal_escape(&mut self, first: u8) -> Result<u8> {
        // Accumulate in u16 so the range check below cannot wrap
        let mut acc = u16::from(first - b'0');
        let mut digits_left = 2;

        while digits_left > 0 {
            let Some(digit @ b'0'..=b'7') = self.cursor.peek() else {
                break;
            };

            let widened = acc * 8 + u16::from(digit - b'0');
            if widened > 255 {
                // This digit is not part of the escape; leave it unconsumed
                break;
            }

            acc = widened;
            self.cursor.advance();
            digits_left -= 1;
        }

        Ok(acc as u8)
    }
}