swc_ecma_regexp/parser/pattern_parser/
unicode_property.rs

1use phf::{phf_set, Set};
2
3// https://tc39.es/ecma262/2024/multipage/text-processing.html#table-nonbinary-unicode-properties
4pub fn is_valid_unicode_property(name: &str, value: &str) -> bool {
5    if matches!(name, "General_Category" | "gc") {
6        return GC_PROPERTY_VALUES.contains(value);
7    }
8    if matches!(name, "Script" | "sc") {
9        return SC_PROPERTY_VALUES.contains(value);
10    }
11    if matches!(name, "Script_Extensions" | "scx") {
12        return SC_PROPERTY_VALUES.contains(value) || SCX_PROPERTY_VALUES.contains(value);
13    }
14    false
15}
16
17pub fn is_valid_lone_unicode_property(name_or_value: &str) -> bool {
18    BINARY_UNICODE_PROPERTIES.contains(name_or_value)
19}
20/// This should be used with `UnicodeSetsMode`
21pub fn is_valid_lone_unicode_property_of_strings(name_or_value: &str) -> bool {
22    BINARY_UNICODE_PROPERTIES_OF_STRINGS.contains(name_or_value)
23}
24
25// spellchecker:off
26// # General_Category (gc)
27// https://unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt
28static GC_PROPERTY_VALUES: Set<&'static str> = phf_set! {
29    "C", "Other",
30    "Cc", "Control", "cntrl",
31    "Cf", "Format",
32    "Cn", "Unassigned",
33    "Co", "Private_Use",
34    "Cs", "Surrogate",
35    "L", "Letter",
36    "LC", "Cased_Letter",
37    "Ll", "Lowercase_Letter",
38    "Lm", "Modifier_Letter",
39    "Lo", "Other_Letter",
40    "Lt", "Titlecase_Letter",
41    "Lu", "Uppercase_Letter",
42    "M", "Mark", "Combining_Mark",
43    "Mc", "Spacing_Mark",
44    "Me", "Enclosing_Mark",
45    "Mn", "Nonspacing_Mark",
46    "N", "Number",
47    "Nd", "Decimal_Number", "digit",
48    "Nl", "Letter_Number",
49    "No", "Other_Number",
50    "P", "Punctuation", "punct",
51    "Pc", "Connector_Punctuation",
52    "Pd", "Dash_Punctuation",
53    "Pe", "Close_Punctuation",
54    "Pf", "Final_Punctuation",
55    "Pi", "Initial_Punctuation",
56    "Po", "Other_Punctuation",
57    "Ps", "Open_Punctuation",
58    "S", "Symbol",
59    "Sc", "Currency_Symbol",
60    "Sk", "Modifier_Symbol",
61    "Sm", "Math_Symbol",
62    "So", "Other_Symbol",
63    "Z", "Separator",
64    "Zl", "Line_Separator",
65    "Zp", "Paragraph_Separator",
66    "Zs", "Space_Separator"
67};
68
69static SC_PROPERTY_VALUES: Set<&'static str> = phf_set! {
70    "Adlm", "Adlam",
71    "Aghb", "Caucasian_Albanian",
72    "Ahom",
73    "Arab", "Arabic",
74    "Armi", "Imperial_Aramaic",
75    "Armn", "Armenian",
76    "Avst", "Avestan",
77    "Bali", "Balinese",
78    "Bamu", "Bamum",
79    "Bass", "Bassa_Vah",
80    "Batk", "Batak",
81    "Beng", "Bengali",
82    "Bhks", "Bhaiksuki",
83    "Bopo", "Bopomofo",
84    "Brah", "Brahmi",
85    "Brai", "Braille",
86    "Bugi", "Buginese",
87    "Buhd", "Buhid",
88    "Cakm", "Chakma",
89    "Cans", "Canadian_Aboriginal",
90    "Cari", "Carian",
91    "Cham",
92    "Cher", "Cherokee",
93    "Chrs", "Chorasmian",
94    "Copt", "Coptic", "Qaac",
95    "Cpmn", "Cypro_Minoan",
96    "Cprt", "Cypriot",
97    "Cyrl", "Cyrillic",
98    "Deva", "Devanagari",
99    "Diak", "Dives_Akuru",
100    "Dogr", "Dogra",
101    "Dsrt", "Deseret",
102    "Dupl", "Duployan",
103    "Egyp", "Egyptian_Hieroglyphs",
104    "Elba", "Elbasan",
105    "Elym", "Elymaic",
106    "Ethi", "Ethiopic",
107    "Gara", "Garay",
108    "Geor", "Georgian",
109    "Glag", "Glagolitic",
110    "Gong", "Gunjala_Gondi",
111    "Gonm", "Masaram_Gondi",
112    "Goth", "Gothic",
113    "Gran", "Grantha",
114    "Grek", "Greek",
115    "Gujr", "Gujarati",
116    "Gukh", "Gurung_Khema",
117    "Guru", "Gurmukhi",
118    "Hang", "Hangul",
119    "Hani", "Han",
120    "Hano", "Hanunoo",
121    "Hatr", "Hatran",
122    "Hebr", "Hebrew",
123    "Hira", "Hiragana",
124    "Hluw", "Anatolian_Hieroglyphs",
125    "Hmng", "Pahawh_Hmong",
126    "Hmnp", "Nyiakeng_Puachue_Hmong",
127    "Hrkt", "Katakana_Or_Hiragana",
128    "Hung", "Old_Hungarian",
129    "Ital", "Old_Italic",
130    "Java", "Javanese",
131    "Kali", "Kayah_Li",
132    "Kana", "Katakana",
133    "Kawi",
134    "Khar", "Kharoshthi",
135    "Khmr", "Khmer",
136    "Khoj", "Khojki",
137    "Kits", "Khitan_Small_Script",
138    "Knda", "Kannada",
139    "Krai", "Kirat_Rai",
140    "Kthi", "Kaithi",
141    "Lana", "Tai_Tham",
142    "Laoo", "Lao",
143    "Latn", "Latin",
144    "Lepc", "Lepcha",
145    "Limb", "Limbu",
146    "Lina", "Linear_A",
147    "Linb", "Linear_B",
148    "Lisu",
149    "Lyci", "Lycian",
150    "Lydi", "Lydian",
151    "Mahj", "Mahajani",
152    "Maka", "Makasar",
153    "Mand", "Mandaic",
154    "Mani", "Manichaean",
155    "Marc", "Marchen",
156    "Medf", "Medefaidrin",
157    "Mend", "Mende_Kikakui",
158    "Merc", "Meroitic_Cursive",
159    "Mero", "Meroitic_Hieroglyphs",
160    "Mlym", "Malayalam",
161    "Modi",
162    "Mong", "Mongolian",
163    "Mroo", "Mro",
164    "Mtei", "Meetei_Mayek",
165    "Mult", "Multani",
166    "Mymr", "Myanmar",
167    "Nagm", "Nag_Mundari",
168    "Nand", "Nandinagari",
169    "Narb", "Old_North_Arabian",
170    "Nbat", "Nabataean",
171    "Newa",
172    "Nkoo", "Nko",
173    "Nshu", "Nushu",
174    "Ogam", "Ogham",
175    "Olck", "Ol_Chiki",
176    "Onao", "Ol_Onal",
177    "Orkh", "Old_Turkic",
178    "Orya", "Oriya",
179    "Osge", "Osage",
180    "Osma", "Osmanya",
181    "Ougr", "Old_Uyghur",
182    "Palm", "Palmyrene",
183    "Pauc", "Pau_Cin_Hau",
184    "Perm", "Old_Permic",
185    "Phag", "Phags_Pa",
186    "Phli", "Inscriptional_Pahlavi",
187    "Phlp", "Psalter_Pahlavi",
188    "Phnx", "Phoenician",
189    "Plrd", "Miao",
190    "Prti", "Inscriptional_Parthian",
191    "Rjng", "Rejang",
192    "Rohg", "Hanifi_Rohingya",
193    "Runr", "Runic",
194    "Samr", "Samaritan",
195    "Sarb", "Old_South_Arabian",
196    "Saur", "Saurashtra",
197    "Sgnw", "SignWriting",
198    "Shaw", "Shavian",
199    "Shrd", "Sharada",
200    "Sidd", "Siddham",
201    "Sind", "Khudawadi",
202    "Sinh", "Sinhala",
203    "Sogd", "Sogdian",
204    "Sogo", "Old_Sogdian",
205    "Sora", "Sora_Sompeng",
206    "Soyo", "Soyombo",
207    "Sund", "Sundanese",
208    "Sunu", "Sunuwar",
209    "Sylo", "Syloti_Nagri",
210    "Syrc", "Syriac",
211    "Tagb", "Tagbanwa",
212    "Takr", "Takri",
213    "Tale", "Tai_Le",
214    "Talu", "New_Tai_Lue",
215    "Taml", "Tamil",
216    "Tang", "Tangut",
217    "Tavt", "Tai_Viet",
218    "Telu", "Telugu",
219    "Tfng", "Tifinagh",
220    "Tglg", "Tagalog",
221    "Thaa", "Thaana",
222    "Thai",
223    "Tibt", "Tibetan",
224    "Tirh", "Tirhuta",
225    "Tnsa", "Tangsa",
226    "Todr", "Todhri",
227    "Toto",
228    "Tutg", "Tulu_Tigalari",
229    "Ugar", "Ugaritic",
230    "Vaii", "Vai",
231    "Vith", "Vithkuqi",
232    "Wara", "Warang_Citi",
233    "Wcho", "Wancho",
234    "Xpeo", "Old_Persian",
235    "Xsux", "Cuneiform",
236    "Yezi", "Yezidi",
237    "Yiii", "Yi",
238    "Zanb", "Zanabazar_Square",
239    "Zinh", "Inherited", "Qaai",
240    "Zyyy", "Common",
241    "Zzzz", "Unknown",
242};
243
244static SCX_PROPERTY_VALUES: Set<&'static str> = phf_set! {
245    // Empty
246};
247
248// Table 66: Binary Unicode property aliases
249// https://tc39.es/ecma262/2024/multipage/text-processing.html#table-binary-unicode-properties
250static BINARY_UNICODE_PROPERTIES: Set<&'static str> = phf_set! {
251    "ASCII",
252    "ASCII_Hex_Digit",
253    "AHex",
254    "Alphabetic",
255    "Alpha",
256    "Any",
257    "Assigned",
258    "Bidi_Control",
259    "Bidi_C",
260    "Bidi_Mirrored",
261    "Bidi_M",
262    "Case_Ignorable",
263    "CI",
264    "Cased",
265    "Changes_When_Casefolded",
266    "CWCF",
267    "Changes_When_Casemapped",
268    "CWCM",
269    "Changes_When_Lowercased",
270    "CWL",
271    "Changes_When_NFKC_Casefolded",
272    "CWKCF",
273    "Changes_When_Titlecased",
274    "CWT",
275    "Changes_When_Uppercased",
276    "CWU",
277    "Dash",
278    "Default_Ignorable_Code_Point",
279    "DI",
280    "Deprecated",
281    "Dep",
282    "Diacritic",
283    "Dia",
284    "Emoji",
285    "Emoji_Component",
286    "EComp",
287    "Emoji_Modifier",
288    "EMod",
289    "Emoji_Modifier_Base",
290    "EBase",
291    "Emoji_Presentation",
292    "EPres",
293    "Extended_Pictographic",
294    "ExtPict",
295    "Extender",
296    "Ext",
297    "Grapheme_Base",
298    "Gr_Base",
299    "Grapheme_Extend",
300    "Gr_Ext",
301    "Hex_Digit",
302    "Hex",
303    "IDS_Binary_Operator",
304    "IDSB",
305    "IDS_Trinary_Operator",
306    "IDST",
307    "ID_Continue",
308    "IDC",
309    "ID_Start",
310    "IDS",
311    "Ideographic",
312    "Ideo",
313    "Join_Control",
314    "Join_C",
315    "Logical_Order_Exception",
316    "LOE",
317    "Lowercase",
318    "Lower",
319    "Math",
320    "Noncharacter_Code_Point",
321    "NChar",
322    "Pattern_Syntax",
323    "Pat_Syn",
324    "Pattern_White_Space",
325    "Pat_WS",
326    "Quotation_Mark",
327    "QMark",
328    "Radical",
329    "Regional_Indicator",
330    "RI",
331    "Sentence_Terminal",
332    "STerm",
333    "Soft_Dotted",
334    "SD",
335    "Terminal_Punctuation",
336    "Term",
337    "Unified_Ideograph",
338    "UIdeo",
339    "Uppercase",
340    "Upper",
341    "Variation_Selector",
342    "VS",
343    "White_Space",
344    "space",
345    "XID_Continue",
346    "XIDC",
347    "XID_Start",
348    "XIDS",
349};
350
351// Table 67: Binary Unicode properties of strings
352// https://tc39.es/ecma262/2024/multipage/text-processing.html#table-binary-unicode-properties-of-strings
353static BINARY_UNICODE_PROPERTIES_OF_STRINGS: Set<&'static str> = phf_set! {
354    "Basic_Emoji",
355    "Emoji_Keycap_Sequence",
356    "RGI_Emoji_Modifier_Sequence",
357    "RGI_Emoji_Flag_Sequence",
358    "RGI_Emoji_Tag_Sequence",
359    "RGI_Emoji_ZWJ_Sequence",
360    "RGI_Emoji",
361};
362// spellchecker:on