swc_ecma_fast_parser/lexer/regex.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
//! Regular expression literals processing for the lexer
//!
//! This module handles the parsing of RegExp literals in ECMAScript/TypeScript.
use swc_atoms::Atom;
use super::Lexer;
use crate::{
error::{Error, ErrorKind, Result},
token::{Token, TokenType, TokenValue},
};
impl Lexer<'_> {
    /// Read a regular expression literal.
    ///
    /// Assumes the initial '/' has been consumed. The pattern is scanned one
    /// byte at a time up to the closing '/'; a '/' that sits inside a
    /// character class (`[...]`) or directly follows a backslash does not
    /// terminate the pattern. Every following byte accepted by
    /// `is_identifier_continue` is then consumed as a flag.
    ///
    /// # Errors
    ///
    /// Returns `ErrorKind::InvalidRegExp` when:
    /// - the input ends before the closing '/',
    /// - a `\n` or `\r` byte appears inside the pattern,
    /// - a flag occurs more than once, or
    /// - a flag byte is not ASCII.
    pub(super) fn read_regex(&mut self, had_line_break: bool) -> Result<Token> {
        let start_pos = self.start_pos;
        let start_idx = start_pos.0;

        // Whether we're inside a character class [...]
        let mut in_class = false;
        // Whether the previous byte was an unescaped backslash
        let mut escaped = false;

        // Scan the pattern up to (and including) the closing '/'
        loop {
            match self.cursor.peek() {
                // End of pattern
                Some(b'/') if !in_class && !escaped => {
                    self.cursor.advance();
                    break;
                }
                // End of file (unterminated regex)
                None => {
                    return Err(Error {
                        kind: ErrorKind::InvalidRegExp {
                            reason: "Unterminated regular expression literal",
                        },
                        span: self.span(),
                    });
                }
                // Line break (illegal in regex literals, even after a backslash)
                Some(b'\n') | Some(b'\r') => {
                    return Err(Error {
                        kind: ErrorKind::InvalidRegExp {
                            reason: "Line break in regular expression literal",
                        },
                        span: self.span(),
                    });
                }
                // Start of character class
                Some(b'[') if !escaped => {
                    in_class = true;
                    self.cursor.advance();
                }
                // End of character class
                Some(b']') if in_class && !escaped => {
                    in_class = false;
                    self.cursor.advance();
                }
                // Escape sequence: the next byte loses its special meaning
                Some(b'\\') if !escaped => {
                    self.cursor.advance();
                    escaped = true;
                }
                // Regular character (or an escaped special one)
                Some(_) => {
                    self.cursor.advance();
                    escaped = false;
                }
            }
        }

        // Consume the flags; they are validated below, once the whole literal
        // has been read, so error spans cover the complete literal.
        while let Some(ch) = self.cursor.peek() {
            if !Self::is_identifier_continue(ch) {
                break;
            }
            self.cursor.advance();
        }

        // Extract the raw bytes of the literal.
        let end_idx = self.cursor.position();
        // SAFETY: `start_idx` is where this token started and `end_idx` is the
        // cursor's current position, so the range is presumed to lie within
        // the cursor's input (NOTE(review): relies on `slice_unchecked`'s
        // contract, which is defined outside this file).
        let regex_bytes = unsafe { self.cursor.slice_unchecked(start_idx, end_idx) };

        // Split at the closing '/'. Flags are identifier bytes, so the closing
        // '/' is the last '/' of the literal; `rsplitn` yields the flags first
        // and then everything in front of that '/'.
        let mut pieces = regex_bytes.rsplitn(2, |&byte| byte == b'/');
        let flag_bytes = pieces.next().unwrap_or(&[]);
        let body_bytes = pieces.next().unwrap_or(&[]);

        // Validate flags (basic validation): each flag must be an ASCII byte
        // and may appear at most once.
        let mut seen_flags = [false; 128];
        for &flag in flag_bytes {
            let reason = match seen_flags.get_mut(usize::from(flag)) {
                Some(seen) if !*seen => {
                    *seen = true;
                    continue;
                }
                Some(_) => "Duplicate flag in regular expression",
                // Bytes >= 128 are outside the table and can never be a flag.
                None => "Invalid flag in regular expression",
            };
            return Err(Error {
                kind: ErrorKind::InvalidRegExp { reason },
                span: self.span(),
            });
        }

        // SAFETY: every flag byte was just checked to be below 128, so the
        // flags are pure ASCII and therefore valid UTF-8.
        let flags = unsafe { std::str::from_utf8_unchecked(flag_bytes) };
        // SAFETY: the body starts at the token start and ends right before an
        // ASCII '/', so it is cut on character boundaries provided the
        // cursor's input is valid UTF-8 (NOTE(review): same assumption the
        // lexer makes everywhere it slices the source).
        let body = unsafe { std::str::from_utf8_unchecked(body_bytes) };
        // Skip the leading '/'
        let pattern = body.strip_prefix('/').unwrap_or(body);

        let span = self.span();
        Ok(Token::new(
            TokenType::Regex,
            span,
            had_line_break,
            TokenValue::Regex {
                exp: Atom::from(pattern),
                flags: Atom::from(flags),
            },
        ))
    }

    /// Check if the slash is the start of a regex literal
    ///
    /// The decision is based on context: a slash starts a regex when the
    /// current token can precede an expression and is neither `++` nor `--`
    /// (after which the slash is a division operator).
    pub(super) fn is_regex_start(&self) -> bool {
        self.current.before_expr()
            && !matches!(
                self.current.token_type,
                TokenType::PlusPlus | TokenType::MinusMinus
            )
    }
}