swc_ecma_fast_parser/lexer/
identifier.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
//! Identifier processing for the lexer
//!
//! This module handles the parsing of ECMAScript/TypeScript identifiers.

use swc_atoms::Atom;

use super::Lexer;
use crate::{
    error::Result,
    token::{keyword_to_token_type, Token, TokenType, TokenValue},
};

/// Per-byte classification table for ASCII identifier characters.
///
/// Each entry is a small bit set indexed by the ASCII code:
/// * bit 0 (`1`) — the byte may start an identifier,
/// * bit 1 (`2`) — the byte may continue an identifier.
///
/// Letters, `_` and `$` carry both bits (`3`); digits carry only the
/// continue bit (`2`); every other ASCII byte is `0`.
static IDENT_CHAR: [u8; 128] = {
    const START: u8 = 1;
    const CONTINUE: u8 = 2;

    let mut flags = [0u8; 128];

    // `for` loops are not available in const evaluation, hence the manual
    // counter. Every ASCII code is classified exactly once.
    let mut code = 0usize;
    while code < 128 {
        flags[code] = match code as u8 {
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'$' => START | CONTINUE,
            b'0'..=b'9' => CONTINUE,
            _ => 0,
        };
        code += 1;
    }

    flags
};

/// Marks the ASCII lowercase letters `h`, `j`, `m`, `q`, `x` and `z`.
///
/// The lexer treats an identifier beginning with one of these letters as
/// something that can never be a keyword, which lets it skip the keyword
/// lookup entirely.
///
/// NOTE(review): this list has to agree with the keyword table consulted by
/// `keyword_to_token_type`; confirm it whenever a keyword is added.
static NON_KEYWORD_START: [bool; 128] = {
    const LETTERS: [u8; 6] = *b"hjmqxz";

    let mut flags = [false; 128];

    // Const evaluation has no `for`, so walk the list with a counter.
    let mut idx = 0;
    while idx < LETTERS.len() {
        flags[LETTERS[idx] as usize] = true;
        idx += 1;
    }

    flags
};

impl Lexer<'_> {
    /// Returns `true` when `ch` is an ASCII byte flagged in
    /// `NON_KEYWORD_START`, i.e. a letter the lexer treats as unable to begin
    /// a keyword.
    ///
    /// Non-ASCII bytes (`>= 128`) always yield `false`.
    #[inline(always)]
    pub(super) fn is_non_keyword_start(ch: u8) -> bool {
        // `is_ascii` guarantees the index is below the table length (128), so
        // plain indexing cannot panic and no `unsafe` is needed.
        ch.is_ascii() && NON_KEYWORD_START[usize::from(ch)]
    }

    /// Reads an identifier whose first character rules out every keyword.
    ///
    /// The caller must already have established that the cursor sits on an
    /// identifier-start character. The keyword lookup is skipped and the
    /// result is always a `TokenType::Ident` token carrying the identifier
    /// text as a `TokenValue::Word`.
    #[inline(always)]
    pub(super) fn read_non_keyword_identifier(&mut self) -> Result<Token> {
        let start_pos = self.start_pos;

        // Skip the first character (already verified as identifier start)
        self.cursor.advance();

        // Consume the rest of the identifier
        self.cursor.advance_while(Self::is_identifier_continue);

        // Extract the identifier text
        let span = self.span();
        let ident_start = start_pos.0;
        let ident_end = self.cursor.position();
        // SAFETY (assumed — confirm against the cursor's `slice_unchecked`
        // contract): `start_pos` is taken to be the offset of this token's
        // first byte and the cursor has only moved forward since, so the
        // range should lie inside the input.
        let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
        // SAFETY: relies on the lexer input being valid UTF-8 and on the range
        // starting and ending on character boundaries. Neither is checked
        // here. NOTE(review): confirm `advance`/`advance_while` never stop in
        // the middle of a multi-byte character.
        let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
        let had_line_break_bool: bool = self.had_line_break.into();

        Ok(Token::new(
            TokenType::Ident,
            span,
            had_line_break_bool,
            TokenValue::Word(Atom::from(ident_str)),
        ))
    }

    /// Reads an identifier or keyword starting at the current token position.
    ///
    /// The caller must already have established that the cursor sits on an
    /// identifier-start character. If the text begins with an ASCII lowercase
    /// letter and `keyword_to_token_type` recognises it, a keyword token with
    /// `TokenValue::None` is returned; otherwise the result is a
    /// `TokenType::Ident` token carrying the text as a `TokenValue::Word`.
    #[inline(always)]
    pub(super) fn read_identifier(&mut self) -> Result<Token> {
        let start_pos = self.start_pos;

        // Skip the first character (already verified as identifier start)
        self.cursor.advance();

        // Consume the rest of the identifier
        self.cursor.advance_while(Self::is_identifier_continue);

        // Extract the identifier text
        let span = self.span();
        let ident_start = start_pos.0;
        let ident_end = self.cursor.position();
        // SAFETY (assumed — confirm against the cursor's `slice_unchecked`
        // contract): `start_pos` is taken to be the offset of this token's
        // first byte and the cursor has only moved forward since, so the
        // range should lie inside the input.
        let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
        // SAFETY: relies on the lexer input being valid UTF-8 and on the range
        // starting and ending on character boundaries. Neither is checked
        // here. NOTE(review): confirm `advance`/`advance_while` never stop in
        // the middle of a multi-byte character.
        let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
        let had_line_break_bool: bool = self.had_line_break.into();

        // The keyword lookup is only attempted when the first byte is an ASCII
        // lowercase letter (the lexer assumes every keyword starts with a-z);
        // anything else goes straight to the identifier path.
        let may_be_keyword = matches!(ident_bytes.first(), Some(b'a'..=b'z'));
        if may_be_keyword {
            if let Some(token_type) = keyword_to_token_type(ident_str) {
                return Ok(Token::new(
                    token_type,
                    span,
                    had_line_break_bool,
                    TokenValue::None,
                ));
            }
        }

        // Not a keyword, return as identifier with the word value
        Ok(Token::new(
            TokenType::Ident,
            span,
            had_line_break_bool,
            TokenValue::Word(Atom::from(ident_str)),
        ))
    }

    /// Returns `true` when `ch` is an ASCII byte that may start an identifier
    /// (bit 0 of its `IDENT_CHAR` entry is set).
    ///
    /// Non-ASCII bytes (`>= 128`) always yield `false`.
    #[inline(always)]
    pub(crate) fn is_ascii_id_start(ch: u8) -> bool {
        // `is_ascii` keeps the index below the table length (128), so plain
        // indexing cannot panic.
        ch.is_ascii() && (IDENT_CHAR[usize::from(ch)] & 1) != 0
    }

    /// Returns `true` when `ch` is an ASCII byte that may continue an
    /// identifier (bit 1 of its `IDENT_CHAR` entry is set).
    ///
    /// Non-ASCII bytes (`>= 128`) always yield `false`.
    #[inline(always)]
    pub(crate) fn is_ascii_id_continue(ch: u8) -> bool {
        // `is_ascii` keeps the index below the table length (128), so plain
        // indexing cannot panic.
        ch.is_ascii() && (IDENT_CHAR[usize::from(ch)] & 2) != 0
    }
}