1use unicode_width::UnicodeWidthChar;
11
12use super::*;
13
14pub fn analyze_source_file(
20 src: &str,
21 source_file_start_pos: BytePos,
22) -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
23 let mut lines = vec![source_file_start_pos];
24 let mut multi_byte_chars = Vec::new();
25 let mut non_narrow_chars = Vec::new();
26
27 analyze_source_file_generic(
29 src,
30 src.len(),
31 source_file_start_pos,
32 &mut lines,
33 &mut multi_byte_chars,
34 &mut non_narrow_chars,
35 );
36
37 if let Some(&last_line_start) = lines.last() {
41 let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
42 assert!(source_file_end >= last_line_start);
43 if last_line_start == source_file_end {
44 lines.pop();
45 }
46 }
47
48 (lines, multi_byte_chars, non_narrow_chars)
49}
50
51fn analyze_source_file_generic(
55 src: &str,
56 scan_len: usize,
57 output_offset: BytePos,
58 lines: &mut Vec<BytePos>,
59 multi_byte_chars: &mut Vec<MultiByteChar>,
60 non_narrow_chars: &mut Vec<NonNarrowChar>,
61) -> usize {
62 assert!(src.len() >= scan_len);
63 let mut i = 0;
64 let src_bytes = src.as_bytes();
65
66 while i < scan_len {
67 let byte = unsafe {
68 *src_bytes.get_unchecked(i)
70 };
71
72 let mut char_len = 1;
75
76 if byte < 32 {
77 let pos = BytePos::from_usize(i) + output_offset;
81
82 match byte {
83 b'\r' => {
84 if let Some(b'\n') = src_bytes.get(i + 1) {
85 lines.push(pos + BytePos(2));
86 i += 2;
87 continue;
88 }
89 lines.push(pos + BytePos(1));
90 }
91
92 b'\n' => {
93 lines.push(pos + BytePos(1));
94 }
95 b'\t' => {
96 non_narrow_chars.push(NonNarrowChar::Tab(pos));
97 }
98 _ => {
99 non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
100 }
101 }
102 } else if byte >= 127 {
103 let c = src[i..].chars().next().unwrap();
107 char_len = c.len_utf8();
108
109 let pos = BytePos::from_usize(i) + output_offset;
110
111 if char_len > 1 {
112 assert!((2..=4).contains(&char_len));
113 let mbc = MultiByteChar {
114 pos,
115 bytes: char_len as u8,
116 };
117 multi_byte_chars.push(mbc);
118 }
119
120 let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
123
124 if char_width != 1 {
125 non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
126 }
127 }
128
129 i += char_len;
130 }
131
132 i - scan_len
133}
134
#[cfg(test)]
#[allow(clippy::identity_op)]
mod tests {
    use super::*;

    // Declares one `#[test]` function named `$test_name`. The test runs
    // `analyze_source_file` on `$text` starting at `$source_file_start_pos`
    // and compares each of the three result vectors against the expectation,
    // which is given as raw numbers:
    //   lines:            line start positions
    //   multi_byte_chars: (position, encoded length in bytes)
    //   non_narrow_chars: (position, display width)
    macro_rules! test {
        (case: $test_name:ident,
         text: $text:expr,
         source_file_start_pos: $source_file_start_pos:expr,
         lines: $lines:expr,
         multi_byte_chars: $multi_byte_chars:expr,
         non_narrow_chars: $non_narrow_chars:expr,) => {
            #[test]
            fn $test_name() {
                let start = BytePos($source_file_start_pos);
                let (got_lines, got_mbcs, got_nncs) = analyze_source_file($text, start);

                // Line starts.
                let want_lines: Vec<BytePos> =
                    $lines.into_iter().map(|raw| BytePos(raw)).collect();
                assert_eq!(got_lines, want_lines);

                // Multi-byte characters.
                let want_mbcs: Vec<MultiByteChar> = $multi_byte_chars
                    .into_iter()
                    .map(|(raw, bytes)| MultiByteChar {
                        pos: BytePos(raw),
                        bytes,
                    })
                    .collect();
                assert_eq!(got_mbcs, want_mbcs);

                // Non-narrow characters.
                let want_nncs: Vec<NonNarrowChar> = $non_narrow_chars
                    .into_iter()
                    .map(|(raw, width)| NonNarrowChar::new(BytePos(raw), width))
                    .collect();
                assert_eq!(got_nncs, want_nncs);
            }
        };
    }

    // An empty file has no lines at all.
    test!(
        case: empty_text,
        text: "",
        source_file_start_pos: 0,
        lines: Vec::new(),
        multi_byte_chars: Vec::new(),
        non_narrow_chars: Vec::new(),
    );

    test!(
        case: newlines_short,
        text: "a\nc",
        source_file_start_pos: 0,
        lines: vec![0, 2],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: Vec::new(),
    );

    test!(
        case: newlines_long,
        text: "012345678\nabcdef012345678\na",
        source_file_start_pos: 0,
        lines: vec![0, 10, 26],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: Vec::new(),
    );

    test!(
        case: newline_and_multi_byte_char_in_same_chunk,
        text: "01234β789\nbcdef0123456789abcdef",
        source_file_start_pos: 0,
        lines: vec![0, 11],
        multi_byte_chars: vec![(5, 2)],
        non_narrow_chars: Vec::new(),
    );

    test!(
        case: newline_and_control_char_in_same_chunk,
        text: "01234\u{07}6789\nbcdef0123456789abcdef",
        source_file_start_pos: 0,
        lines: vec![0, 11],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: vec![(5, 0)],
    );

    test!(
        case: multi_byte_char_short,
        text: "aβc",
        source_file_start_pos: 0,
        lines: vec![0],
        multi_byte_chars: vec![(1, 2)],
        non_narrow_chars: Vec::new(),
    );

    test!(
        case: multi_byte_char_long,
        text: "0123456789abcΔf012345β",
        source_file_start_pos: 0,
        lines: vec![0],
        multi_byte_chars: vec![(13, 2), (22, 2)],
        non_narrow_chars: Vec::new(),
    );

    test!(
        case: multi_byte_char_across_chunk_boundary,
        text: "0123456789abcdeΔ123456789abcdef01234",
        source_file_start_pos: 0,
        lines: vec![0],
        multi_byte_chars: vec![(15, 2)],
        non_narrow_chars: Vec::new(),
    );

    test!(
        case: multi_byte_char_across_chunk_boundary_tail,
        text: "0123456789abcdeΔ....",
        source_file_start_pos: 0,
        lines: vec![0],
        multi_byte_chars: vec![(15, 2)],
        non_narrow_chars: Vec::new(),
    );

    // A tab is reported with width 4.
    test!(
        case: non_narrow_short,
        text: "0\t2",
        source_file_start_pos: 0,
        lines: vec![0],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: vec![(1, 4)],
    );

    test!(
        case: non_narrow_long,
        text: "01\t3456789abcdef01234567\u{07}9",
        source_file_start_pos: 0,
        lines: vec![0],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: vec![(2, 4), (24, 0)],
    );

    // Every reported position must be shifted by the file's start position.
    test!(
        case: output_offset_all,
        text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf",
        source_file_start_pos: 1000,
        lines: vec![0 + 1000, 7 + 1000, 27 + 1000],
        multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)],
        non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)],
    );

    // `\n` line endings.
    test!(
        case: unix_lf,
        text: "/**\n * foo\n */\n012345678\nabcdef012345678\na",
        source_file_start_pos: 0,
        lines: vec![0, 4, 11, 15, 25, 41],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: Vec::new(),
    );

    // Lone `\r` line endings.
    test!(
        case: windows_cr,
        text: "/**\r * foo\r */\r012345678\rabcdef012345678\ra",
        source_file_start_pos: 0,
        lines: vec![0, 4, 11, 15, 25, 41],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: Vec::new(),
    );

    // `\r\n` line endings: each pair counts as one terminator of two bytes.
    test!(
        case: windows_crlf,
        text: "/**\r\n * foo\r\n */\r\n012345678\r\nabcdef012345678\r\na",
        source_file_start_pos: 0,
        lines: vec![0, 5, 13, 18, 29, 46],
        multi_byte_chars: Vec::new(),
        non_narrow_chars: Vec::new(),
    );
}