swc_ecma_regexp/parser/pattern_parser/
state.rs1use rustc_hash::{FxHashMap, FxHashSet};
2use swc_atoms::Atom;
3
4use crate::parser::reader::Reader;
5
6#[derive(Debug)]
9pub struct State {
10 pub unicode_mode: bool,
12 pub unicode_sets_mode: bool,
13 pub named_capture_groups: bool,
14 pub num_of_capturing_groups: u32,
16 pub capturing_group_names: FxHashSet<Atom>,
17}
18
19type DuplicatedNamedCapturingGroupOffsets = Vec<(u32, u32)>;
20
21impl State {
22 pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self {
23 Self {
24 unicode_mode,
25 unicode_sets_mode,
26 named_capture_groups: false,
27 num_of_capturing_groups: 0,
28 capturing_group_names: FxHashSet::default(),
29 }
30 }
31
32 pub fn initialize_with_parsing(
33 &mut self,
34 reader: &mut Reader,
35 ) -> Result<(), DuplicatedNamedCapturingGroupOffsets> {
36 let (num_of_left_capturing_parens, capturing_group_names) = parse_capturing_groups(reader)?;
37
38 self.named_capture_groups =
43 self.unicode_mode || self.unicode_sets_mode || !capturing_group_names.is_empty();
44
45 self.num_of_capturing_groups = num_of_left_capturing_parens;
46 self.capturing_group_names = capturing_group_names;
47
48 Ok(())
49 }
50}
51
52enum SimpleUnit {
53 Open,
54 Close,
55 Pipe,
56 GroupName(Atom),
57}
58
59fn parse_capturing_groups<'a>(
62 reader: &mut Reader<'a>,
63) -> Result<(u32, FxHashSet<Atom>), DuplicatedNamedCapturingGroupOffsets> {
64 let mut num_of_left_capturing_parens = 0;
70
71 let mut group_names: FxHashMap<Atom, (u32, u32)> = FxHashMap::default();
73 let mut may_duplicates: FxHashMap<Atom, DuplicatedNamedCapturingGroupOffsets> =
82 FxHashMap::default();
83 let mut simplified: Vec<SimpleUnit> = vec![];
84
85 let mut in_escape = false;
86 let mut in_character_class = false;
87 while let Some(cp) = reader.peek() {
88 if in_escape {
89 in_escape = false;
90 } else if cp == '\\' as u32 {
91 in_escape = true;
92 } else if cp == '[' as u32 {
93 in_character_class = true;
94 } else if cp == ']' as u32 {
95 in_character_class = false;
96 } else if !in_character_class && cp == '|' as u32 {
97 simplified.push(SimpleUnit::Pipe);
98 } else if !in_character_class && cp == ')' as u32 {
99 simplified.push(SimpleUnit::Close);
100 } else if !in_character_class && cp == '(' as u32 {
101 reader.advance();
102
103 simplified.push(SimpleUnit::Open);
104
105 if reader.eat2('?', ':')
107 || reader.eat2('?', '=')
109 || reader.eat2('?', '!')
110 || reader.eat3('?', '<', '=')
111 || reader.eat3('?', '<', '!')
112 {
113 continue;
114 }
115
116 num_of_left_capturing_parens += 1;
118
119 if reader.eat2('?', '<') {
121 let span_start = reader.offset();
122 while let Some(ch) = reader.peek() {
123 if ch == '>' as u32 {
124 break;
125 }
126 reader.advance();
127 }
128 let span_end = reader.offset();
129
130 if reader.eat('>') {
131 let group_name = reader.atom(span_start, span_end);
132
133 simplified.push(SimpleUnit::GroupName(group_name.clone()));
134 if let Some(last_span) = group_names.get(&group_name) {
136 let entry = may_duplicates.entry(group_name).or_default();
137 entry.push(*last_span);
138 entry.push((span_start, span_end));
139 } else {
140 group_names.insert(group_name, (span_start, span_end));
141 }
142
143 continue;
144 }
145 }
146
147 continue;
149 }
150
151 reader.advance();
152 }
153
154 if !may_duplicates.is_empty() {
156 for (group_name, spans) in may_duplicates {
158 let iter = simplified.iter().clone();
159
160 let mut alternative_depth = FxHashSet::default();
161 let mut depth = 0_u32;
162 let mut is_first = true;
163
164 'outer: for token in iter {
165 match token {
166 SimpleUnit::Open => {
167 depth += 1;
168 }
169 SimpleUnit::Close => {
170 depth = depth.saturating_sub(1);
172 }
173 SimpleUnit::Pipe => {
174 if !is_first {
175 alternative_depth.insert(depth);
176 }
177 }
178 SimpleUnit::GroupName(name) => {
179 if *name != group_name {
181 continue;
182 }
183 if is_first {
185 is_first = false;
186 continue;
187 }
188
189 for i in (0..depth).rev() {
194 if alternative_depth.contains(&i) {
195 alternative_depth.remove(&i);
197 continue 'outer;
198 }
199 }
200
201 return Err(spans);
202 }
203 }
204 }
205 }
206 }
207
208 Ok((
209 num_of_left_capturing_parens,
210 group_names.keys().cloned().collect(),
211 ))
212}
213
214#[cfg(test)]
215mod tests {
216 use super::*;
217
218 #[test]
219 fn count_capturing_groups() {
220 for (source_text, expected) in [
221 ("()", (1, 0)),
222 (r"\1()", (1, 0)),
223 ("(foo)", (1, 0)),
224 ("(foo)(bar)", (2, 0)),
225 ("(foo(bar))", (2, 0)),
226 ("(foo)[(bar)]", (1, 0)),
227 (r"(foo)\(bar\)", (1, 0)),
228 ("(foo)(?<n>bar)", (2, 1)),
229 ("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0)),
230 ("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
231 ("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
232 ("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
233 ] {
234 let mut reader = Reader::initialize(source_text, true, false).unwrap();
235
236 let (num_of_left_capturing_parens, capturing_group_names) =
237 parse_capturing_groups(&mut reader).unwrap();
238
239 let actual = (num_of_left_capturing_parens, capturing_group_names.len());
240 assert_eq!(expected, actual, "{source_text}");
241 }
242 }
243
244 #[test]
245 fn duplicated_named_capturing_groups() {
246 for source_text in [
247 "(?<n>.)(?<n>..)",
248 "(?<n>.(?<n>..))",
249 "|(?<n>.(?<n>..))",
250 "(?<m>.)|(?<n>.(?<n>..))",
251 ] {
252 let mut reader = Reader::initialize(source_text, true, false).unwrap();
253
254 assert!(
255 parse_capturing_groups(&mut reader).is_err(),
256 "{source_text}"
257 );
258 }
259 }
260}