swc_ecma_regexp/parser/reader/string_literal_parser/
parser_impl.rs

1use super::{
2    ast,
3    characters::{
4        is_line_terminator, is_non_escape_character, is_single_escape_character, CR, LF, LS, PS,
5    },
6    diagnostics,
7    options::Options,
8};
9use crate::{diagnostics::Result, parser::span_factory::SpanFactory};
10
// One resolved unit of a string literal (escape sequences already decoded),
// laid out as `((start_offset, end_offset), code_point)`.
12type OffsetsAndCp = ((u32, u32), u32);
13
14/// Helper API for `RegExp` literal parsing.
15/// This time, we don't need to handle escape sequences.
16pub fn parse_regexp_literal(
17    source_text: &str,
18    span_offset: u32,
19    combine_surrogate_pair: bool,
20) -> Vec<ast::CodePoint> {
21    let mut body = vec![];
22
23    let mut offset = 0;
24    for ch in source_text.chars() {
25        let start = offset;
26        #[expect(clippy::cast_possible_truncation)]
27        let end = start + ch.len_utf8() as u32;
28
29        let offsets_and_cp: OffsetsAndCp = ((start, end), ch as u32);
30        Parser::handle_code_point(
31            &mut body,
32            offsets_and_cp,
33            span_offset,
34            combine_surrogate_pair,
35        );
36        offset = end;
37    }
38
39    body
40}
41
42pub struct Parser {
43    // NOTE: In JavaScript, string literals are UTF-16 encoded,
44    // so we need to be aware of surrogate pairs, while collecting offsets for `Span`.
45    // Rather than using `encode_utf16()`, split surrogate pairs manually is easier
46    // to detect the start and end of each code point.
47    chars: Vec<char>,
48    index: usize,
49    offset: u32,
50    options: Options,
51}
52
53impl Parser {
54    // This is public because it is used in `parse_regexp_literal()`.
55    pub fn handle_code_point(
56        body: &mut Vec<ast::CodePoint>,
57        (offsets, cp): OffsetsAndCp,
58        span_offset: u32,
59        combine_surrogate_pair: bool,
60    ) {
61        let span = SpanFactory::span_from_u32(span_offset + offsets.0, span_offset + offsets.1);
62
63        if combine_surrogate_pair || (0..=0xffff).contains(&cp) {
64            // If the code point is in the BMP or if forced, just push it
65            body.push(ast::CodePoint { span, value: cp });
66        } else {
67            // Otherwise, split the code point into a surrogate pair, sharing the same span
68            let (lead, trail) = (
69                0xd800 + ((cp - 0x10000) >> 10),
70                0xdc00 + ((cp - 0x10000) & 0x3ff),
71            );
72            body.push(ast::CodePoint { span, value: lead });
73            body.push(ast::CodePoint { span, value: trail });
74        }
75    }
76
77    // ---
78
79    pub fn new(source_text: &str, options: Options) -> Self {
80        Self {
81            chars: source_text.chars().collect::<Vec<_>>(),
82            index: 0,
83            offset: 0,
84            options,
85        }
86    }
87
88    // ```
89    // StringLiteral ::
90    //   " DoubleStringCharacters[opt] "
91    //   ' SingleStringCharacters[opt] '
92    // ```
93    pub fn parse(mut self) -> Result<ast::StringLiteral> {
94        let (quote_char, kind) = if self.eat('"') {
95            ('"', ast::StringLiteralKind::Double)
96        } else if self.eat('\'') {
97            ('\'', ast::StringLiteralKind::Single)
98        } else {
99            return Err(diagnostics::invalid_input(SpanFactory::span_from_u32(
100                self.options.span_offset,
101                self.options.span_offset,
102            )));
103        };
104
105        let body = self.parse_string_characters(quote_char)?;
106
107        if self.eat(quote_char) {
108            if self.peek().is_some() {
109                return Err(diagnostics::invalid_input(SpanFactory::span_from_u32(
110                    self.options.span_offset + self.offset(),
111                    self.options.span_offset + self.offset(),
112                )));
113            }
114
115            let span = SpanFactory::span_from_u32(
116                self.options.span_offset,
117                self.options.span_offset + self.offset(),
118            );
119            return Ok(ast::StringLiteral { span, kind, body });
120        }
121
122        Err(diagnostics::invalid_input(SpanFactory::span_from_u32(
123            self.options.span_offset + self.offset(),
124            self.options.span_offset + self.offset(),
125        )))
126    }
127
128    // ---
129
130    // ```
131    // DoubleStringCharacters ::
132    //   DoubleStringCharacter DoubleStringCharacters[opt]
133    //
134    // SingleStringCharacters ::
135    //   SingleStringCharacter SingleStringCharacters[opt]
136    // ```
137    fn parse_string_characters(
138        &mut self,
139        single_or_double_quote: char,
140    ) -> Result<Vec<ast::CodePoint>> {
141        let mut body = vec![];
142        while let Some(code_point) = self.parse_string_character(single_or_double_quote)? {
143            Parser::handle_code_point(
144                &mut body,
145                code_point,
146                self.options.span_offset,
147                self.options.combine_surrogate_pair,
148            );
149        }
150        Ok(body)
151    }
152
153    // ```
154    // DoubleStringCharacter ::
155    //   SourceCharacter but not one of " or \ or LineTerminator
156    //   <LS>
157    //   <PS>
158    //   \ EscapeSequence
159    //   LineContinuation
160    //
161    // SingleStringCharacter ::
162    //   SourceCharacter but not one of ' or \ or LineTerminator
163    //   <LS>
164    //   <PS>
165    //   \ EscapeSequence
166    //   LineContinuation
167    // ```
168    fn parse_string_character(
169        &mut self,
170        single_or_double_quote: char,
171    ) -> Result<Option<OffsetsAndCp>> {
172        let offset_start = self.offset();
173        let checkpoint = self.checkpoint();
174
175        if let Some(ch) = self
176            .peek()
177            .filter(|&ch| ch != single_or_double_quote && ch != '\\' && !is_line_terminator(ch))
178        {
179            self.advance();
180            return Ok(Some(((offset_start, self.offset()), ch as u32)));
181        }
182        if self.peek() == Some(LS) {
183            self.advance();
184            return Ok(Some(((offset_start, self.offset()), LS as u32)));
185        }
186        if self.peek() == Some(PS) {
187            self.advance();
188            return Ok(Some(((offset_start, self.offset()), PS as u32)));
189        }
190        if self.eat('\\') {
191            if let Some(cp) = self.parse_escape_sequence(offset_start)? {
192                return Ok(Some(((offset_start, self.offset()), cp)));
193            }
194            self.rewind(checkpoint);
195        }
196        if let Some(cp) = self.parse_line_terminator_sequence() {
197            return Ok(Some(((offset_start, self.offset()), cp)));
198        }
199
200        Ok(None)
201    }
202
203    // ```
204    // EscapeSequence ::
205    //   CharacterEscapeSequence
206    //   0 [lookahead ∉ DecimalDigit]
207    //   LegacyOctalEscapeSequence
208    //   NonOctalDecimalEscapeSequence
209    //   HexEscapeSequence
210    //   UnicodeEscapeSequence
211    // ```
212    fn parse_escape_sequence(&mut self, offset_start: u32) -> Result<Option<u32>> {
213        if let Some(cp) = self.parse_character_escape_sequence() {
214            return Ok(Some(cp));
215        }
216        if self.peek() == Some('0') && self.peek2().map_or(true, |ch| !ch.is_ascii_digit()) {
217            self.advance();
218            return Ok(Some(0x00));
219        }
220        if let Some(cp) = self.parse_legacy_octal_escape_sequence() {
221            // [SS:EE] EscapeSequence :: LegacyOctalEscapeSequence
222            // It is a Syntax Error if IsStrict(this production) is true.
223            if self.options.strict_mode {
224                return Err(diagnostics::legacy_in_strict_mode(
225                    "octal escape sequence",
226                    SpanFactory::span_from_u32(
227                        self.options.span_offset + offset_start,
228                        self.options.span_offset + self.offset(),
229                    ),
230                ));
231            }
232            return Ok(Some(cp));
233        }
234        if let Some(cp) = self.parse_non_octal_decimal_escape_sequence() {
235            // [SS:EE] EscapeSequence :: NonOctalDecimalEscapeSequence
236            // It is a Syntax Error if IsStrict(this production) is true.
237            if self.options.strict_mode {
238                return Err(diagnostics::legacy_in_strict_mode(
239                    "non octal decimal escape sequence",
240                    SpanFactory::span_from_u32(
241                        self.options.span_offset + offset_start,
242                        self.options.span_offset + self.offset(),
243                    ),
244                ));
245            }
246            return Ok(Some(cp));
247        }
248        if let Some(cp) = self.parse_hex_escape_sequence() {
249            return Ok(Some(cp));
250        }
251        if let Some(cp) = self.parse_unicode_escape_sequence(offset_start)? {
252            return Ok(Some(cp));
253        }
254
255        Ok(None)
256    }
257
258    // ```
259    // CharacterEscapeSequence ::
260    //   SingleEscapeCharacter
261    //   NonEscapeCharacter
262    // ```
263    fn parse_character_escape_sequence(&mut self) -> Option<u32> {
264        if let Some(ch) = self.peek().filter(|&ch| is_single_escape_character(ch)) {
265            self.advance();
266            return Some(ch as u32);
267        }
268        if let Some(ch) = self.peek().filter(|&ch| is_non_escape_character(ch)) {
269            self.advance();
270            return Some(ch as u32);
271        }
272
273        None
274    }
275
276    // ```
277    // LegacyOctalEscapeSequence ::
278    //   0 [lookahead ∈ { 8, 9 }]
279    //   NonZeroOctalDigit [lookahead ∉ OctalDigit]
280    //   ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
281    //   FourToSeven OctalDigit
282    //   ZeroToThree OctalDigit OctalDigit
283    //
284    // NonZeroOctalDigit ::
285    //   OctalDigit but not 0
286    //
287    // ZeroToThree :: one of
288    //   0 1 2 3
289    //
290    // FourToSeven :: one of
291    //   4 5 6 7
292    // ```
293    fn parse_legacy_octal_escape_sequence(&mut self) -> Option<u32> {
294        if let Some(first) = self.consume_octal_digit() {
295            // 0 [lookahead ∈ { 8, 9 }]
296            if first == 0 && self.peek().filter(|&ch| !matches!(ch, '8' | '9')).is_some() {
297                return Some(first);
298            }
299
300            if let Some(second) = self.consume_octal_digit() {
301                if let Some(third) = self.consume_octal_digit() {
302                    // ZeroToThree OctalDigit OctalDigit
303                    if first <= 3 {
304                        return Some(first * 64 + second * 8 + third);
305                    }
306                }
307
308                // ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
309                // FourToSeven OctalDigit
310                return Some(first * 8 + second);
311            }
312
313            // NonZeroOctalDigit [lookahead ∉ OctalDigit]
314            return Some(first);
315        }
316
317        None
318    }
319
320    // ```
321    // NonOctalDecimalEscapeSequence :: one of
322    //   8 9
323    // ```
324    fn parse_non_octal_decimal_escape_sequence(&mut self) -> Option<u32> {
325        if self.eat('8') {
326            return Some('8' as u32);
327        }
328        if self.eat('9') {
329            return Some('9' as u32);
330        }
331        None
332    }
333
334    // ```
335    // HexEscapeSequence ::
336    //   x HexDigit HexDigit
337    // ```
338    fn parse_hex_escape_sequence(&mut self) -> Option<u32> {
339        let checkpoint = self.checkpoint();
340
341        if self.eat('x') {
342            if let Some(first) = self.consume_hex_digit() {
343                if let Some(second) = self.consume_hex_digit() {
344                    return Some(first * 16 + second);
345                }
346            }
347
348            self.rewind(checkpoint);
349        }
350
351        None
352    }
353
354    // ```
355    // UnicodeEscapeSequence ::
356    //   u Hex4Digits
357    //   u{ CodePoint }
358    // ```
359    fn parse_unicode_escape_sequence(&mut self, offset_start: u32) -> Result<Option<u32>> {
360        let chckpoint = self.checkpoint();
361
362        if self.eat('u') {
363            if let Some(cp) = self.consume_hex4_digits() {
364                return Ok(Some(cp));
365            }
366            self.rewind(chckpoint);
367        }
368
369        if self.eat('u') {
370            if self.eat('{') {
371                if let Some(hex_digits) = self
372                    .consume_hex_digits(offset_start)?
373                    .filter(|&cp| cp <= 0x10_ffff)
374                {
375                    if self.eat('}') {
376                        return Ok(Some(hex_digits));
377                    }
378                }
379            }
380            self.rewind(chckpoint);
381        }
382
383        Ok(None)
384    }
385
386    // ```
387    // LineContinuation ::
388    //   \ LineTerminatorSequence
389    //
390    // LineTerminatorSequence ::
391    //   <LF>
392    //   <CR> [lookahead ≠ <LF>]
393    //   <LS>
394    //   <PS>
395    //   <CR> <LF>
396    // ```
397    fn parse_line_terminator_sequence(&mut self) -> Option<u32> {
398        let checkpoint = self.checkpoint();
399
400        if self.eat('\\') {
401            if self.peek() == Some(LF) {
402                self.advance();
403                return Some(LF as u32);
404            }
405            if self.peek() == Some(CR) && self.peek2() != Some(LF) {
406                self.advance();
407                return Some(CR as u32);
408            }
409            if self.peek() == Some(LS) {
410                self.advance();
411                return Some(LS as u32);
412            }
413            if self.peek() == Some(PS) {
414                self.advance();
415                return Some(PS as u32);
416            }
417            // NOTE: CR+LF can not represent as a single code point.
418            // I don't know the best way to handle this.
419            // To distinguish this from CR and LF, structural change is needed...
420            if self.peek() == Some(CR) && self.peek2() == Some(LF) {
421                self.advance();
422                self.advance();
423                return Some(LF as u32);
424            }
425        }
426
427        self.rewind(checkpoint);
428        None
429    }
430
431    // ---
432
433    fn consume_hex_digit(&mut self) -> Option<u32> {
434        if let Some(ch) = self.peek().filter(char::is_ascii_hexdigit) {
435            self.advance();
436            return ch.to_digit(16);
437        }
438
439        None
440    }
441
442    fn consume_octal_digit(&mut self) -> Option<u32> {
443        if let Some(ch) = self
444            .peek()
445            .filter(char::is_ascii_digit)
446            .filter(|&ch| ch < '8')
447        {
448            self.advance();
449            // `- '0' as u32`: convert code point to digit
450            return Some(ch as u32 - '0' as u32);
451        }
452
453        None
454    }
455
456    // ```
457    // Hex4Digits ::
458    //   HexDigit HexDigit HexDigit HexDigit
459    // ```
460    fn consume_hex4_digits(&mut self) -> Option<u32> {
461        let checkpoint = self.checkpoint();
462
463        let mut value = 0;
464        for _ in 0..4 {
465            let Some(hex) = self
466                .peek()
467                .filter(char::is_ascii_hexdigit)
468                .and_then(|ch| ch.to_digit(16))
469            else {
470                self.rewind(checkpoint);
471                return None;
472            };
473
474            value = (16 * value) + hex;
475            self.advance();
476        }
477
478        Some(value)
479    }
480
481    fn consume_hex_digits(&mut self, offset_start: u32) -> Result<Option<u32>> {
482        let checkpoint = self.checkpoint();
483
484        let mut value: u32 = 0;
485        while let Some(hex) = self
486            .peek()
487            .filter(char::is_ascii_hexdigit)
488            .and_then(|ch| ch.to_digit(16))
489        {
490            // To prevent panic on overflow cases like `\u{FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF}`
491            if let Some(v) = value.checked_mul(16).and_then(|v| v.checked_add(hex)) {
492                value = v;
493                self.advance();
494            } else {
495                return Err(diagnostics::too_large_unicode_escape_sequence(
496                    SpanFactory::span_from_u32(
497                        self.options.span_offset + offset_start,
498                        self.options.span_offset + self.offset(),
499                    ),
500                ));
501            }
502        }
503
504        if self.checkpoint() != checkpoint {
505            return Ok(Some(value));
506        }
507
508        Ok(None)
509    }
510
511    // ---
512
513    fn checkpoint(&self) -> (usize, u32) {
514        (self.index, self.offset)
515    }
516
517    fn rewind(&mut self, checkpoint: (usize, u32)) {
518        self.index = checkpoint.0;
519        self.offset = checkpoint.1;
520    }
521
522    fn advance(&mut self) {
523        if let Some(ch) = self.chars.get(self.index) {
524            #[expect(clippy::cast_possible_truncation)]
525            let len = ch.len_utf8() as u32;
526            self.offset += len;
527            self.index += 1;
528        }
529    }
530
531    fn eat(&mut self, ch: char) -> bool {
532        if self.peek() == Some(ch) {
533            self.advance();
534            return true;
535        }
536        false
537    }
538
539    fn offset(&self) -> u32 {
540        self.offset
541    }
542
543    fn peek_nth(&self, n: usize) -> Option<char> {
544        let nth = self.index + n;
545        self.chars.get(nth).copied()
546    }
547
548    fn peek(&self) -> Option<char> {
549        self.peek_nth(0)
550    }
551
552    fn peek2(&self) -> Option<char> {
553        self.peek_nth(1)
554    }
555}