swc_ecma_regexp/parser/pattern_parser/
state.rs

1use rustc_hash::{FxHashMap, FxHashSet};
2use swc_atoms::Atom;
3
4use crate::parser::reader::Reader;
5
6/// Currently all of properties are read only from outside of this module.
7/// Even inside of this module, it is not changed after initialized.
8#[derive(Debug)]
9pub struct State {
10    // Mode flags
11    pub unicode_mode: bool,
12    pub unicode_sets_mode: bool,
13    pub named_capture_groups: bool,
14    // Other states
15    pub num_of_capturing_groups: u32,
16    pub capturing_group_names: FxHashSet<Atom>,
17}
18
19type DuplicatedNamedCapturingGroupOffsets = Vec<(u32, u32)>;
20
21impl State {
22    pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self {
23        Self {
24            unicode_mode,
25            unicode_sets_mode,
26            named_capture_groups: false,
27            num_of_capturing_groups: 0,
28            capturing_group_names: FxHashSet::default(),
29        }
30    }
31
32    pub fn initialize_with_parsing(
33        &mut self,
34        reader: &mut Reader,
35    ) -> Result<(), DuplicatedNamedCapturingGroupOffsets> {
36        let (num_of_left_capturing_parens, capturing_group_names) = parse_capturing_groups(reader)?;
37
38        // In Annex B, this is `false` by default.
39        // It is `true`
40        // - if `u` or `v` flag is set
41        // - or if `GroupName` is found in pattern
42        self.named_capture_groups =
43            self.unicode_mode || self.unicode_sets_mode || !capturing_group_names.is_empty();
44
45        self.num_of_capturing_groups = num_of_left_capturing_parens;
46        self.capturing_group_names = capturing_group_names;
47
48        Ok(())
49    }
50}
51
52enum SimpleUnit {
53    Open,
54    Close,
55    Pipe,
56    GroupName(Atom),
57}
58
59/// Returns: Result<(num_of_left_parens, capturing_group_names),
60/// duplicated_named_capturing_group_offsets>
61fn parse_capturing_groups<'a>(
62    reader: &mut Reader<'a>,
63) -> Result<(u32, FxHashSet<Atom>), DuplicatedNamedCapturingGroupOffsets> {
64    // Count only normal CapturingGroup(named, unnamed)
65    //   (?<name>...), (...)
66    // IgnoreGroup, and LookaroundAssertions are ignored
67    //   (?:...)
68    //   (?=...), (?!...), (?<=...), (?<!...)
69    let mut num_of_left_capturing_parens = 0;
70
71    // Collect capturing group names
72    let mut group_names: FxHashMap<Atom, (u32, u32)> = FxHashMap::default();
73    // At the same time, check duplicates
74    // If you want to process this most efficiently:
75    // - define a scope for each Disjunction
76    // - then check for duplicates for each `|` while inheriting the parent-child
77    //   relationship
78    // ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644
79    // However, duplicates are rare in the first place.
80    // And as long as it works simply, this may be enough.
81    let mut may_duplicates: FxHashMap<Atom, DuplicatedNamedCapturingGroupOffsets> =
82        FxHashMap::default();
83    let mut simplified: Vec<SimpleUnit> = vec![];
84
85    let mut in_escape = false;
86    let mut in_character_class = false;
87    while let Some(cp) = reader.peek() {
88        if in_escape {
89            in_escape = false;
90        } else if cp == '\\' as u32 {
91            in_escape = true;
92        } else if cp == '[' as u32 {
93            in_character_class = true;
94        } else if cp == ']' as u32 {
95            in_character_class = false;
96        } else if !in_character_class && cp == '|' as u32 {
97            simplified.push(SimpleUnit::Pipe);
98        } else if !in_character_class && cp == ')' as u32 {
99            simplified.push(SimpleUnit::Close);
100        } else if !in_character_class && cp == '(' as u32 {
101            reader.advance();
102
103            simplified.push(SimpleUnit::Open);
104
105            // Skip IgnoreGroup
106            if reader.eat2('?', ':')
107            // Skip LookAroundAssertion
108                || reader.eat2('?', '=')
109                || reader.eat2('?', '!')
110                || reader.eat3('?', '<', '=')
111                || reader.eat3('?', '<', '!')
112            {
113                continue;
114            }
115
116            // Count named or unnamed capturing groups
117            num_of_left_capturing_parens += 1;
118
119            // Collect capturing group names
120            if reader.eat2('?', '<') {
121                let span_start = reader.offset();
122                while let Some(ch) = reader.peek() {
123                    if ch == '>' as u32 {
124                        break;
125                    }
126                    reader.advance();
127                }
128                let span_end = reader.offset();
129
130                if reader.eat('>') {
131                    let group_name = reader.atom(span_start, span_end);
132
133                    simplified.push(SimpleUnit::GroupName(group_name.clone()));
134                    // Check duplicates later
135                    if let Some(last_span) = group_names.get(&group_name) {
136                        let entry = may_duplicates.entry(group_name).or_default();
137                        entry.push(*last_span);
138                        entry.push((span_start, span_end));
139                    } else {
140                        group_names.insert(group_name, (span_start, span_end));
141                    }
142
143                    continue;
144                }
145            }
146
147            // Unnamed
148            continue;
149        }
150
151        reader.advance();
152    }
153
154    // Check duplicates and emit error if exists
155    if !may_duplicates.is_empty() {
156        // Check must be done for each group name
157        for (group_name, spans) in may_duplicates {
158            let iter = simplified.iter().clone();
159
160            let mut alternative_depth = FxHashSet::default();
161            let mut depth = 0_u32;
162            let mut is_first = true;
163
164            'outer: for token in iter {
165                match token {
166                    SimpleUnit::Open => {
167                        depth += 1;
168                    }
169                    SimpleUnit::Close => {
170                        // May panic if the pattern has invalid, unbalanced parens
171                        depth = depth.saturating_sub(1);
172                    }
173                    SimpleUnit::Pipe => {
174                        if !is_first {
175                            alternative_depth.insert(depth);
176                        }
177                    }
178                    SimpleUnit::GroupName(name) => {
179                        // Check target group name only
180                        if *name != group_name {
181                            continue;
182                        }
183                        // Skip the first one, because it is not duplicated
184                        if is_first {
185                            is_first = false;
186                            continue;
187                        }
188
189                        // If left outer `|` is found, both can participate
190                        // `|(?<n>)`
191                        //  ^   ^ depth: 1
192                        //  ^ depth: 0
193                        for i in (0..depth).rev() {
194                            if alternative_depth.contains(&i) {
195                                // Remove it, next duplicates requires another `|`
196                                alternative_depth.remove(&i);
197                                continue 'outer;
198                            }
199                        }
200
201                        return Err(spans);
202                    }
203                }
204            }
205        }
206    }
207
208    Ok((
209        num_of_left_capturing_parens,
210        group_names.keys().cloned().collect(),
211    ))
212}
213
214#[cfg(test)]
215mod tests {
216    use super::*;
217
218    #[test]
219    fn count_capturing_groups() {
220        for (source_text, expected) in [
221            ("()", (1, 0)),
222            (r"\1()", (1, 0)),
223            ("(foo)", (1, 0)),
224            ("(foo)(bar)", (2, 0)),
225            ("(foo(bar))", (2, 0)),
226            ("(foo)[(bar)]", (1, 0)),
227            (r"(foo)\(bar\)", (1, 0)),
228            ("(foo)(?<n>bar)", (2, 1)),
229            ("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0)),
230            ("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
231            ("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
232            ("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
233        ] {
234            let mut reader = Reader::initialize(source_text, true, false).unwrap();
235
236            let (num_of_left_capturing_parens, capturing_group_names) =
237                parse_capturing_groups(&mut reader).unwrap();
238
239            let actual = (num_of_left_capturing_parens, capturing_group_names.len());
240            assert_eq!(expected, actual, "{source_text}");
241        }
242    }
243
244    #[test]
245    fn duplicated_named_capturing_groups() {
246        for source_text in [
247            "(?<n>.)(?<n>..)",
248            "(?<n>.(?<n>..))",
249            "|(?<n>.(?<n>..))",
250            "(?<m>.)|(?<n>.(?<n>..))",
251        ] {
252            let mut reader = Reader::initialize(source_text, true, false).unwrap();
253
254            assert!(
255                parse_capturing_groups(&mut reader).is_err(),
256                "{source_text}"
257            );
258        }
259    }
260}