swc_common/syntax_pos/
analyze_source_file.rs

1// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10use unicode_width::UnicodeWidthChar;
11
12use super::*;
13
14/// Finds all newlines, multi-byte characters, and non-narrow characters in a
15/// SourceFile.
16///
17/// This function will use an SSE2 enhanced implementation if hardware support
18/// is detected at runtime.
19pub fn analyze_source_file(
20    src: &str,
21    source_file_start_pos: BytePos,
22) -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
23    let mut lines = vec![source_file_start_pos];
24    let mut multi_byte_chars = Vec::new();
25    let mut non_narrow_chars = Vec::new();
26
27    // Calls the right implementation, depending on hardware support available.
28    analyze_source_file_generic(
29        src,
30        src.len(),
31        source_file_start_pos,
32        &mut lines,
33        &mut multi_byte_chars,
34        &mut non_narrow_chars,
35    );
36
37    // The code above optimistically registers a new line *after* each \n
38    // it encounters. If that point is already outside the source_file, remove
39    // it again.
40    if let Some(&last_line_start) = lines.last() {
41        let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
42        assert!(source_file_end >= last_line_start);
43        if last_line_start == source_file_end {
44            lines.pop();
45        }
46    }
47
48    (lines, multi_byte_chars, non_narrow_chars)
49}
50
51// `scan_len` determines the number of bytes in `src` to scan. Note that the
52// function can read past `scan_len` if a multi-byte character start within the
53// range but extends past it. The overflow is returned by the function.
54fn analyze_source_file_generic(
55    src: &str,
56    scan_len: usize,
57    output_offset: BytePos,
58    lines: &mut Vec<BytePos>,
59    multi_byte_chars: &mut Vec<MultiByteChar>,
60    non_narrow_chars: &mut Vec<NonNarrowChar>,
61) -> usize {
62    assert!(src.len() >= scan_len);
63    let mut i = 0;
64    let src_bytes = src.as_bytes();
65
66    while i < scan_len {
67        let byte = unsafe {
68            // We verified that i < scan_len <= src.len()
69            *src_bytes.get_unchecked(i)
70        };
71
72        // How much to advance in order to get to the next UTF-8 char in the
73        // string.
74        let mut char_len = 1;
75
76        if byte < 32 {
77            // This is an ASCII control character, it could be one of the cases
78            // that are interesting to us.
79
80            let pos = BytePos::from_usize(i) + output_offset;
81
82            match byte {
83                b'\r' => {
84                    if let Some(b'\n') = src_bytes.get(i + 1) {
85                        lines.push(pos + BytePos(2));
86                        i += 2;
87                        continue;
88                    }
89                    lines.push(pos + BytePos(1));
90                }
91
92                b'\n' => {
93                    lines.push(pos + BytePos(1));
94                }
95                b'\t' => {
96                    non_narrow_chars.push(NonNarrowChar::Tab(pos));
97                }
98                _ => {
99                    non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
100                }
101            }
102        } else if byte >= 127 {
103            // The slow path:
104            // This is either ASCII control character "DEL" or the beginning of
105            // a multibyte char. Just decode to `char`.
106            let c = src[i..].chars().next().unwrap();
107            char_len = c.len_utf8();
108
109            let pos = BytePos::from_usize(i) + output_offset;
110
111            if char_len > 1 {
112                assert!((2..=4).contains(&char_len));
113                let mbc = MultiByteChar {
114                    pos,
115                    bytes: char_len as u8,
116                };
117                multi_byte_chars.push(mbc);
118            }
119
120            // Assume control characters are zero width.
121            // FIXME: How can we decide between `width` and `width_cjk`?
122            let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
123
124            if char_width != 1 {
125                non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
126            }
127        }
128
129        i += char_len;
130    }
131
132    i - scan_len
133}
134
135#[cfg(test)]
136#[allow(clippy::identity_op)]
137mod tests {
138    use super::*;
139
140    macro_rules! test {
141        (case: $test_name:ident,
142     text: $text:expr,
143     source_file_start_pos: $source_file_start_pos:expr,
144     lines: $lines:expr,
145     multi_byte_chars: $multi_byte_chars:expr,
146     non_narrow_chars: $non_narrow_chars:expr,) => {
147            #[test]
148            fn $test_name() {
149                let (lines, multi_byte_chars, non_narrow_chars) =
150                    analyze_source_file($text, BytePos($source_file_start_pos));
151
152                let expected_lines: Vec<BytePos> =
153                    $lines.into_iter().map(|pos| BytePos(pos)).collect();
154
155                assert_eq!(lines, expected_lines);
156
157                let expected_mbcs: Vec<MultiByteChar> = $multi_byte_chars
158                    .into_iter()
159                    .map(|(pos, bytes)| MultiByteChar {
160                        pos: BytePos(pos),
161                        bytes,
162                    })
163                    .collect();
164
165                assert_eq!(multi_byte_chars, expected_mbcs);
166
167                let expected_nncs: Vec<NonNarrowChar> = $non_narrow_chars
168                    .into_iter()
169                    .map(|(pos, width)| NonNarrowChar::new(BytePos(pos), width))
170                    .collect();
171
172                assert_eq!(non_narrow_chars, expected_nncs);
173            }
174        };
175    }
176
177    test!(
178        case: empty_text,
179        text: "",
180        source_file_start_pos: 0,
181        lines: Vec::new(),
182        multi_byte_chars: Vec::new(),
183        non_narrow_chars: Vec::new(),
184    );
185
186    test!(
187        case: newlines_short,
188        text: "a\nc",
189        source_file_start_pos: 0,
190        lines: vec![0, 2],
191        multi_byte_chars: Vec::new(),
192        non_narrow_chars: Vec::new(),
193    );
194
195    test!(
196        case: newlines_long,
197        text: "012345678\nabcdef012345678\na",
198        source_file_start_pos: 0,
199        lines: vec![0, 10, 26],
200        multi_byte_chars: Vec::new(),
201        non_narrow_chars: Vec::new(),
202    );
203
204    test!(
205        case: newline_and_multi_byte_char_in_same_chunk,
206        text: "01234β789\nbcdef0123456789abcdef",
207        source_file_start_pos: 0,
208        lines: vec![0, 11],
209        multi_byte_chars: vec![(5, 2)],
210        non_narrow_chars: Vec::new(),
211    );
212
213    test!(
214        case: newline_and_control_char_in_same_chunk,
215        text: "01234\u{07}6789\nbcdef0123456789abcdef",
216        source_file_start_pos: 0,
217        lines: vec![0, 11],
218        multi_byte_chars: Vec::new(),
219        non_narrow_chars: vec![(5, 0)],
220    );
221
222    test!(
223        case: multi_byte_char_short,
224        text: "aβc",
225        source_file_start_pos: 0,
226        lines: vec![0],
227        multi_byte_chars: vec![(1, 2)],
228        non_narrow_chars: Vec::new(),
229    );
230
231    test!(
232        case: multi_byte_char_long,
233        text: "0123456789abcΔf012345β",
234        source_file_start_pos: 0,
235        lines: vec![0],
236        multi_byte_chars: vec![(13, 2), (22, 2)],
237        non_narrow_chars: Vec::new(),
238    );
239
240    test!(
241        case: multi_byte_char_across_chunk_boundary,
242        text: "0123456789abcdeΔ123456789abcdef01234",
243        source_file_start_pos: 0,
244        lines: vec![0],
245        multi_byte_chars: vec![(15, 2)],
246        non_narrow_chars: Vec::new(),
247    );
248
249    test!(
250        case: multi_byte_char_across_chunk_boundary_tail,
251        text: "0123456789abcdeΔ....",
252        source_file_start_pos: 0,
253        lines: vec![0],
254        multi_byte_chars: vec![(15, 2)],
255        non_narrow_chars: Vec::new(),
256    );
257
258    test!(
259        case: non_narrow_short,
260        text: "0\t2",
261        source_file_start_pos: 0,
262        lines: vec![0],
263        multi_byte_chars: Vec::new(),
264        non_narrow_chars: vec![(1, 4)],
265    );
266
267    test!(
268        case: non_narrow_long,
269        text: "01\t3456789abcdef01234567\u{07}9",
270        source_file_start_pos: 0,
271        lines: vec![0],
272        multi_byte_chars: Vec::new(),
273        non_narrow_chars: vec![(2, 4), (24, 0)],
274    );
275
276    test!(
277        case: output_offset_all,
278        text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf",
279        source_file_start_pos: 1000,
280        lines: vec![0 + 1000, 7 + 1000, 27 + 1000],
281        multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)],
282        non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)],
283    );
284
285    test!(
286        case: unix_lf,
287        text: "/**\n * foo\n */\n012345678\nabcdef012345678\na",
288        source_file_start_pos: 0,
289        lines: vec![0, 4, 11, 15, 25, 41],
290        multi_byte_chars: Vec::new(),
291        non_narrow_chars: Vec::new(),
292    );
293
294    test!(
295        case: windows_cr,
296        text: "/**\r * foo\r */\r012345678\rabcdef012345678\ra",
297        source_file_start_pos: 0,
298        lines: vec![0, 4, 11, 15, 25, 41],
299        multi_byte_chars: Vec::new(),
300        non_narrow_chars: Vec::new(),
301    );
302
303    test!(
304        case: windows_crlf,
305        text: "/**\r\n * foo\r\n */\r\n012345678\r\nabcdef012345678\r\na",
306        source_file_start_pos: 0,
307        lines: vec![0, 5, 13, 18, 29, 46],
308        multi_byte_chars: Vec::new(),
309        non_narrow_chars: Vec::new(),
310    );
311}