swc_ecma_regexp_ast/
lib.rs

1use bitflags::bitflags;
2use is_macro::Is;
3use swc_atoms::Atom;
4use swc_common::{ast_node, EqIgnoreSpan, Span};
5
6mod display;
7
/// The root of the `PatternParser` result.
///
/// Holds the top-level [`Disjunction`] of a pattern together with its span.
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct Pattern {
    /// Source span of this node.
    pub span: Span,
    /// The top-level disjunction of the pattern.
    pub body: Disjunction,
}
15
/// Pile of [`Alternative`]s separated by `|`.
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct Disjunction {
    /// Source span of this node.
    pub span: Span,
    /// The alternatives that make up this disjunction.
    pub body: Vec<Alternative>,
}
23
/// Single unit of `|` separated alternatives.
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct Alternative {
    /// Source span of this node.
    pub span: Span,
    /// The [`Term`]s that make up this alternative.
    pub body: Vec<Term>,
}
31
32/// Single unit of [`Alternative`], containing various kinds.
33#[ast_node(no_unknown)]
34#[derive(Eq, Hash, EqIgnoreSpan, Is)]
35pub enum Term {
36    // Assertion
37    // QuantifiableAssertion
38    #[tag("BoundaryAssertion")]
39    BoundaryAssertion(Box<BoundaryAssertion>),
40
41    #[tag("LookAroundAssertion")]
42    LookAroundAssertion(Box<LookAroundAssertion>),
43
44    #[tag("Quantifier")]
45    Quantifier(Box<Quantifier>),
46
47    // Atom
48    // ExtendedAtom
49    #[tag("Character")]
50    Character(Box<Character>),
51
52    #[tag("CharacterClassEscape")]
53    Dot(Dot),
54
55    #[tag("CharacterClassEscape")]
56    CharacterClassEscape(Box<CharacterClassEscape>),
57
58    #[tag("UnicodePropertyEscape")]
59    UnicodePropertyEscape(Box<UnicodePropertyEscape>),
60
61    #[tag("ClassStringDisjunction")]
62    CharacterClass(Box<CharacterClass>),
63
64    #[tag("ClassStringDisjunction")]
65    CapturingGroup(Box<CapturingGroup>),
66
67    #[tag("ClassStringDisjunction")]
68    IgnoreGroup(Box<IgnoreGroup>),
69
70    #[tag("ClassStringDisjunction")]
71    IndexedReference(Box<IndexedReference>),
72
73    #[tag("ClassStringDisjunction")]
74    NamedReference(Box<NamedReference>),
75}
76
/// Simple form of assertion.
/// e.g. `^`, `$`, `\b`, `\B`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct BoundaryAssertion {
    /// Source span of this node.
    pub span: Span,
    /// Which of the boundary assertions this node is.
    #[use_eq]
    pub kind: BoundaryAssertionKind,
}
86
/// Kind of a [`BoundaryAssertion`].
///
/// Discriminants are assigned explicitly. `Encode`/`Decode` are derived only
/// when the `encoding-impl` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "encoding-impl",
    derive(::swc_common::Encode, ::swc_common::Decode)
)]
pub enum BoundaryAssertionKind {
    /// Start assertion (`^`).
    Start = 0,
    /// End assertion (`$`).
    End = 1,
    /// Word boundary (`\b`).
    Boundary = 2,
    /// Negated word boundary (`\B`).
    NegativeBoundary = 3,
}
98
/// Lookaround assertion.
/// e.g. `(?=...)`, `(?!...)`, `(?<=...)`, `(?<!...)`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct LookAroundAssertion {
    /// Source span of this node.
    pub span: Span,
    /// Direction and polarity of the lookaround.
    #[use_eq]
    pub kind: LookAroundAssertionKind,
    /// The disjunction inside the assertion.
    pub body: Disjunction,
}
109
/// Kind of a [`LookAroundAssertion`].
///
/// Discriminants are assigned explicitly. `Encode`/`Decode` are derived only
/// when the `encoding-impl` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "encoding-impl",
    derive(::swc_common::Encode, ::swc_common::Decode)
)]
pub enum LookAroundAssertionKind {
    /// `(?=...)`
    Lookahead = 0,
    /// `(?!...)`
    NegativeLookahead = 1,
    /// `(?<=...)`
    Lookbehind = 2,
    /// `(?<!...)`
    NegativeLookbehind = 3,
}
121
/// Quantifier holding a [`Term`] and its repetition count.
/// e.g. `a*`, `b+`, `c?`, `d{3}`, `e{4,}`, `f{5,6}`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct Quantifier {
    /// Source span of this node.
    pub span: Span,
    /// Lower bound of the repetition count.
    pub min: u64,
    /// `None` means no upper bound.
    #[cfg_attr(
        feature = "encoding-impl",
        encoding(with = "cbor4ii::core::types::Maybe")
    )]
    pub max: Option<u64>,
    /// Whether the quantifier is greedy.
    // NOTE(review): presumably `false` for lazy forms such as `a*?` — confirm
    // against the parser.
    pub greedy: bool,
    /// The term being repeated.
    pub body: Term,
}
138
/// Single character.
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct Character {
    /// This will be invalid position when `UnicodeMode` is disabled and `value`
    /// is a surrogate pair.
    pub span: Span,
    /// Which syntactic form produced this character.
    #[use_eq]
    pub kind: CharacterKind,
    /// Unicode code point or UTF-16 code unit.
    pub value: u32,
}
151
/// Kind of a [`Character`].
///
/// Discriminants are assigned explicitly. `Encode`/`Decode` are derived only
/// when the `encoding-impl` feature is enabled.
// NOTE(review): the variant names suggest these mirror the escape forms of the
// regular-expression grammar (e.g. `\cX`, `\xHH`, `\uHHHH`) — confirm against
// the parser before relying on a specific mapping.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "encoding-impl",
    derive(::swc_common::Encode, ::swc_common::Decode)
)]
pub enum CharacterKind {
    ControlLetter = 0,
    HexadecimalEscape = 1,
    Identifier = 2,
    Null = 3,
    // To distinguish leading 0 cases like `\00` and `\000`
    Octal1 = 4,
    Octal2 = 5,
    Octal3 = 6,
    SingleEscape = 7,
    Symbol = 8,
    UnicodeEscape = 9,
}
170
/// Character class escape.
/// e.g. `\d`, `\D`, `\s`, `\S`, `\w`, `\W`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct CharacterClassEscape {
    /// Source span of this node.
    pub span: Span,
    /// Which escape this node is.
    #[use_eq]
    pub kind: CharacterClassEscapeKind,
}
180
/// Kind of a [`CharacterClassEscape`].
///
/// Discriminants are assigned explicitly. `Encode`/`Decode` are derived only
/// when the `encoding-impl` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "encoding-impl",
    derive(::swc_common::Encode, ::swc_common::Decode)
)]
pub enum CharacterClassEscapeKind {
    /// `\d`
    D = 0,
    /// `\D`
    NegativeD = 1,
    /// `\s`
    S = 2,
    /// `\S`
    NegativeS = 3,
    /// `\w`
    W = 4,
    /// `\W`
    NegativeW = 5,
}
194
/// Unicode property.
/// e.g. `\p{ASCII}`, `\P{ASCII}`, `\p{sc=Hiragana}`, `\P{sc=Hiragana}`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct UnicodePropertyEscape {
    /// Source span of this node.
    pub span: Span,
    /// Whether the escape is negated.
    // NOTE(review): presumably `true` for the `\P{...}` form — confirm.
    pub negative: bool,
    /// `true` if `UnicodeSetsMode` and `name` matches unicode property of
    /// strings.
    pub strings: bool,
    /// Property name.
    // NOTE(review): presumably `sc` in `\p{sc=Hiragana}` and `ASCII` in
    // `\p{ASCII}` — confirm.
    pub name: Atom,
    /// Optional property value.
    // NOTE(review): presumably `Hiragana` in `\p{sc=Hiragana}` and `None` when
    // no `=value` part is written — confirm.
    #[cfg_attr(
        feature = "encoding-impl",
        encoding(with = "cbor4ii::core::types::Maybe")
    )]
    pub value: Option<Atom>,
}
212
/// The `.`.
///
/// Carries no data besides its span.
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct Dot {
    /// Source span of this node.
    pub span: Span,
}
219
/// Character class wrapped by `[]`.
/// e.g. `[a-z]`, `[^A-Z]`, `[abc]`, `[a&&b&&c]`, `[[a-z]--x--y]`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct CharacterClass {
    /// Source span of this node.
    pub span: Span,
    /// Whether the class is negated.
    // NOTE(review): presumably `true` for the `[^...]` form — confirm.
    pub negative: bool,
    /// `true` if:
    /// - `body` contains [`UnicodePropertyEscape`], nested [`CharacterClass`]
    ///   or [`ClassStringDisjunction`] which `strings` is `true`
    /// - and matches each logic depends on `kind`
    pub strings: bool,
    /// How the items in `body` are combined.
    #[use_eq]
    pub kind: CharacterClassContentsKind,
    /// The items inside the class.
    pub body: Vec<CharacterClassContents>,
}
236
/// How the contents of a [`CharacterClass`] are combined.
///
/// Discriminants are assigned explicitly. `Encode`/`Decode` are derived only
/// when the `encoding-impl` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "encoding-impl",
    derive(::swc_common::Encode, ::swc_common::Decode)
)]
pub enum CharacterClassContentsKind {
    /// Plain listing of items, e.g. `[abc]`.
    Union = 0,
    /// `UnicodeSetsMode` only.
    /// e.g. `[a&&b&&c]`
    Intersection = 1,
    /// `UnicodeSetsMode` only.
    /// e.g. `[[a-z]--x--y]`
    Subtraction = 2,
}
249
/// Single item inside the body of a [`CharacterClass`].
#[ast_node(no_unknown)]
#[derive(Eq, Hash, EqIgnoreSpan, Is)]
pub enum CharacterClassContents {
    /// `-` separated range, e.g. `a-z`.
    #[tag("CharacterClassRange")]
    CharacterClassRange(Box<CharacterClassRange>),

    /// Character class escape, e.g. `\d`.
    #[tag("CharacterClassEscape")]
    CharacterClassEscape(Box<CharacterClassEscape>),

    /// Unicode property escape, e.g. `\p{ASCII}`.
    #[tag("UnicodePropertyEscape")]
    UnicodePropertyEscape(Box<UnicodePropertyEscape>),

    /// Single character.
    #[tag("Character")]
    Character(Box<Character>),

    /// `UnicodeSetsMode` only
    #[tag("NestedCharacterClass")]
    NestedCharacterClass(Box<CharacterClass>),

    /// `UnicodeSetsMode` only
    #[tag("ClassStringDisjunction")]
    ClassStringDisjunction(Box<ClassStringDisjunction>),
}
273
/// `-` separated range of characters.
/// e.g. `a-z`, `A-Z`, `0-9`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct CharacterClassRange {
    /// Source span of this node.
    pub span: Span,
    /// Lower end of the range.
    pub min: Character,
    /// Upper end of the range.
    pub max: Character,
}
283
/// `|` separated string of characters wrapped by `\q{}`.
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct ClassStringDisjunction {
    /// Source span of this node.
    pub span: Span,
    /// `true` if body is empty or contains a [`ClassString`] whose `strings` is
    /// `true`.
    pub strings: bool,
    /// The `|` separated strings.
    pub body: Vec<ClassString>,
}
294
/// Single unit of [`ClassStringDisjunction`].
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct ClassString {
    /// Source span of this node.
    pub span: Span,
    /// `true` if body is empty or contains 2 or more characters.
    pub strings: bool,
    /// The characters of this string.
    pub body: Vec<Character>,
}
304
/// Named or unnamed capturing group.
/// e.g. `(...)`, `(?<name>...)`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct CapturingGroup {
    /// Source span of this node.
    pub span: Span,
    /// Group name to be referenced by [`NamedReference`].
    /// `None` for an unnamed group.
    #[cfg_attr(
        feature = "encoding-impl",
        encoding(with = "cbor4ii::core::types::Maybe")
    )]
    pub name: Option<Atom>,
    /// The disjunction inside the group.
    pub body: Disjunction,
}
319
/// Pseudo-group for ignoring.
/// e.g. `(?:...)`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct IgnoreGroup {
    /// Source span of this node.
    pub span: Span,
    /// Optional [`Modifiers`] attached to the group, e.g. `i` in `(?i:...)`.
    #[cfg_attr(
        feature = "encoding-impl",
        encoding(with = "cbor4ii::core::types::Maybe")
    )]
    pub modifiers: Option<Modifiers>,
    /// The disjunction inside the group.
    pub body: Disjunction,
}
333
/// Modifiers in [`IgnoreGroup`].
/// e.g. `i` in `(?i:...)`, `-s` in `(?-s:...)`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct Modifiers {
    /// Source span of this node.
    pub span: Span,
    /// Flags being enabled.
    // NOTE(review): presumably the flags written before `-` — confirm.
    #[use_eq]
    pub enabling: Modifier,
    /// Flags being disabled.
    // NOTE(review): presumably the flags written after `-` — confirm.
    #[use_eq]
    pub disabling: Modifier,
}
345
bitflags! {
    /// Each part of modifier in [`Modifiers`].
    ///
    /// A `u8` bit set; each flag occupies one of the three lowest bits.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct Modifier: u8 {
        /// Ignore case flag (bit 0).
        const I = 1 << 0;
        /// Multiline flag (bit 1).
        const M = 1 << 1;
        /// DotAll flag (bit 2).
        const S = 1 << 2;
    }
}
358
/// Backreference by index.
/// e.g. `\1`, `\2`, `\3`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct IndexedReference {
    /// Source span of this node.
    pub span: Span,
    /// The index of the referenced group.
    pub index: u32,
}
367
/// Backreference by name.
/// e.g. `\k<name>`
#[ast_node]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct NamedReference {
    /// Source span of this node.
    pub span: Span,
    /// The name of the referenced group.
    pub name: Atom,
}
376
/// Encodes a [`Modifier`] as its raw `u8` bit pattern.
///
/// Only compiled when the `encoding-impl` feature is enabled.
#[cfg(feature = "encoding-impl")]
impl cbor4ii::core::enc::Encode for Modifier {
    fn encode<W: cbor4ii::core::enc::Write>(
        &self,
        writer: &mut W,
    ) -> Result<(), cbor4ii::core::enc::Error<W::Error>> {
        // Delegate to the `u8` encoding of the underlying flag bits.
        let raw_bits = self.bits();
        raw_bits.encode(writer)
    }
}
386
/// Decodes a [`Modifier`] from a `u8` bit pattern.
///
/// Only compiled when the `encoding-impl` feature is enabled.
#[cfg(feature = "encoding-impl")]
impl<'de> cbor4ii::core::dec::Decode<'de> for Modifier {
    fn decode<R: cbor4ii::core::dec::Read<'de>>(
        reader: &mut R,
    ) -> Result<Self, cbor4ii::core::dec::Error<R::Error>> {
        let raw_bits = u8::decode(reader)?;

        // `from_bits` yields `None` when bits outside the declared flags are
        // set; that case is reported as a `Mismatch` whose `found` is always
        // `0`.
        match Modifier::from_bits(raw_bits) {
            Some(modifier) => Ok(modifier),
            None => Err(cbor4ii::core::dec::Error::Mismatch {
                name: &"Modifier",
                found: 0,
            }),
        }
    }
}
399
/// Guards the in-memory size of the node enums on 64-bit targets.
///
/// Uses `assert_eq!` so that a failure prints the actual size next to the
/// expected one.
#[cfg(target_pointer_width = "64")]
#[test]
fn size_asserts() {
    use std::mem::size_of;

    assert_eq!(size_of::<Term>(), 16);
    assert_eq!(size_of::<CharacterClassContents>(), 16);
}