1 use unicode_width::UnicodeWidthChar;
4 /// Find all newlines, multi-byte characters, and non-narrow characters in a
7 /// This function will use an SSE2 enhanced implementation if hardware support
8 /// is detected at runtime.
9 pub fn analyze_source_file(
11 source_file_start_pos: BytePos)
12 -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>)
14 let mut lines = vec![source_file_start_pos];
15 let mut multi_byte_chars = vec![];
16 let mut non_narrow_chars = vec![];
18 // Calls the right implementation, depending on hardware support available.
19 analyze_source_file_dispatch(src,
20 source_file_start_pos,
22 &mut multi_byte_chars,
23 &mut non_narrow_chars);
25 // The code above optimistically registers a new line *after* each \n
26 // it encounters. If that point is already outside the source_file, remove
28 if let Some(&last_line_start) = lines.last() {
29 let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
30 assert!(source_file_end >= last_line_start);
31 if last_line_start == source_file_end {
36 (lines, multi_byte_chars, non_narrow_chars)
40 if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] {
41 fn analyze_source_file_dispatch(src: &str,
42 source_file_start_pos: BytePos,
43 lines: &mut Vec<BytePos>,
44 multi_byte_chars: &mut Vec<MultiByteChar>,
45 non_narrow_chars: &mut Vec<NonNarrowChar>) {
46 if is_x86_feature_detected!("sse2") {
48 analyze_source_file_sse2(src,
49 source_file_start_pos,
55 analyze_source_file_generic(src,
57 source_file_start_pos,
65 /// Check 16 byte chunks of text at a time. If the chunk contains
66 /// something other than printable ASCII characters and newlines, the
67 /// function falls back to the generic implementation. Otherwise it uses
68 /// SSE2 intrinsics to quickly find all newlines.
69 #[target_feature(enable = "sse2")]
70 unsafe fn analyze_source_file_sse2(src: &str,
71 output_offset: BytePos,
72 lines: &mut Vec<BytePos>,
73 multi_byte_chars: &mut Vec<MultiByteChar>,
74 non_narrow_chars: &mut Vec<NonNarrowChar>) {
75 #[cfg(target_arch = "x86")]
76 use std::arch::x86::*;
77 #[cfg(target_arch = "x86_64")]
78 use std::arch::x86_64::*;
80 const CHUNK_SIZE: usize = 16;
82 let src_bytes = src.as_bytes();
84 let chunk_count = src.len() / CHUNK_SIZE;
86 // This variable keeps track of where we should start decoding a
87 // chunk. If a multi-byte character spans across chunk boundaries,
88 // we need to skip that part in the next chunk because we already
90 let mut intra_chunk_offset = 0;
92 for chunk_index in 0 .. chunk_count {
93 let ptr = src_bytes.as_ptr() as *const __m128i;
94 // We don't know if the pointer is aligned to 16 bytes, so we
95 // use `loadu`, which supports unaligned loading.
96 let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize));
98 // For character in the chunk, see if its byte value is < 0, which
99 // indicates that it's part of a UTF-8 char.
100 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
101 // Create a bit mask from the comparison results.
102 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
104 // If the bit mask is all zero, we only have ASCII chars here:
105 if multibyte_mask == 0 {
106 assert!(intra_chunk_offset == 0);
108 // Check if there are any control characters in the chunk. All
109 // control characters that we can encounter at this point have a
110 // byte value less than 32 or ...
111 let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32));
112 let control_char_mask0 = _mm_movemask_epi8(control_char_test0);
114 // ... it's the ASCII 'DEL' character with a value of 127.
115 let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127));
116 let control_char_mask1 = _mm_movemask_epi8(control_char_test1);
118 let control_char_mask = control_char_mask0 | control_char_mask1;
120 if control_char_mask != 0 {
121 // Check for newlines in the chunk
122 let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
123 let newlines_mask = _mm_movemask_epi8(newlines_test);
125 if control_char_mask == newlines_mask {
126 // All control characters are newlines, record them
127 let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
128 let output_offset = output_offset +
129 BytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
132 let index = newlines_mask.trailing_zeros();
134 if index >= CHUNK_SIZE as u32 {
135 // We have arrived at the end of the chunk.
139 lines.push(BytePos(index) + output_offset);
141 // Clear the bit, so we can find the next one.
142 newlines_mask &= (!1) << index;
145 // We are done for this chunk. All control characters were
146 // newlines and we took care of those.
149 // Some of the control characters are not newlines,
150 // fall through to the slow path below.
153 // No control characters, nothing to record for this chunk
159 // There are control chars in here, fallback to generic decoding.
160 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
161 intra_chunk_offset = analyze_source_file_generic(
162 &src[scan_start .. ],
163 CHUNK_SIZE - intra_chunk_offset,
164 BytePos::from_usize(scan_start) + output_offset,
171 // There might still be a tail left to analyze
172 let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
173 if tail_start < src.len() {
174 analyze_source_file_generic(&src[tail_start as usize ..],
175 src.len() - tail_start,
176 output_offset + BytePos::from_usize(tail_start),
184 // The target (or compiler version) does not support SSE2 ...
185 fn analyze_source_file_dispatch(src: &str,
186 source_file_start_pos: BytePos,
187 lines: &mut Vec<BytePos>,
188 multi_byte_chars: &mut Vec<MultiByteChar>,
189 non_narrow_chars: &mut Vec<NonNarrowChar>) {
190 analyze_source_file_generic(src,
192 source_file_start_pos,
200 // `scan_len` determines the number of bytes in `src` to scan. Note that the
201 // function can read past `scan_len` if a multi-byte character start within the
202 // range but extends past it. The overflow is returned by the function.
203 fn analyze_source_file_generic(src: &str,
205 output_offset: BytePos,
206 lines: &mut Vec<BytePos>,
207 multi_byte_chars: &mut Vec<MultiByteChar>,
208 non_narrow_chars: &mut Vec<NonNarrowChar>)
211 assert!(src.len() >= scan_len);
213 let src_bytes = src.as_bytes();
217 // We verified that i < scan_len <= src.len()
218 *src_bytes.get_unchecked(i as usize)
221 // How much to advance in order to get to the next UTF-8 char in the
223 let mut char_len = 1;
226 // This is an ASCII control character, it could be one of the cases
227 // that are interesting to us.
229 let pos = BytePos::from_usize(i) + output_offset;
233 lines.push(pos + BytePos(1));
236 non_narrow_chars.push(NonNarrowChar::Tab(pos));
239 non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
242 } else if byte >= 127 {
244 // This is either ASCII control character "DEL" or the beginning of
245 // a multibyte char. Just decode to `char`.
246 let c = (&src[i..]).chars().next().unwrap();
247 char_len = c.len_utf8();
249 let pos = BytePos::from_usize(i) + output_offset;
252 assert!(char_len >=2 && char_len <= 4);
253 let mbc = MultiByteChar {
255 bytes: char_len as u8,
257 multi_byte_chars.push(mbc);
260 // Assume control characters are zero width.
261 // FIXME: How can we decide between `width` and `width_cjk`?
262 let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
265 non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
278 (case: $test_name:ident,
280 source_file_start_pos: $source_file_start_pos:expr,
282 multi_byte_chars: $multi_byte_chars:expr,
283 non_narrow_chars: $non_narrow_chars:expr,) => (
288 let (lines, multi_byte_chars, non_narrow_chars) =
289 analyze_source_file($text, BytePos($source_file_start_pos));
291 let expected_lines: Vec<BytePos> = $lines
293 .map(|pos| BytePos(pos))
296 assert_eq!(lines, expected_lines);
298 let expected_mbcs: Vec<MultiByteChar> = $multi_byte_chars
300 .map(|(pos, bytes)| MultiByteChar {
306 assert_eq!(multi_byte_chars, expected_mbcs);
308 let expected_nncs: Vec<NonNarrowChar> = $non_narrow_chars
310 .map(|(pos, width)| {
311 NonNarrowChar::new(BytePos(pos), width)
315 assert_eq!(non_narrow_chars, expected_nncs);
322 source_file_start_pos: 0,
324 multi_byte_chars: vec![],
325 non_narrow_chars: vec![],
329 case: newlines_short,
331 source_file_start_pos: 0,
333 multi_byte_chars: vec![],
334 non_narrow_chars: vec![],
339 text: "012345678\nabcdef012345678\na",
340 source_file_start_pos: 0,
341 lines: vec![0, 10, 26],
342 multi_byte_chars: vec![],
343 non_narrow_chars: vec![],
347 case: newline_and_multi_byte_char_in_same_chunk,
348 text: "01234β789\nbcdef0123456789abcdef",
349 source_file_start_pos: 0,
351 multi_byte_chars: vec![(5, 2)],
352 non_narrow_chars: vec![],
356 case: newline_and_control_char_in_same_chunk,
357 text: "01234\u{07}6789\nbcdef0123456789abcdef",
358 source_file_start_pos: 0,
360 multi_byte_chars: vec![],
361 non_narrow_chars: vec![(5, 0)],
365 case: multi_byte_char_short,
367 source_file_start_pos: 0,
369 multi_byte_chars: vec![(1, 2)],
370 non_narrow_chars: vec![],
374 case: multi_byte_char_long,
375 text: "0123456789abcΔf012345β",
376 source_file_start_pos: 0,
378 multi_byte_chars: vec![(13, 2), (22, 2)],
379 non_narrow_chars: vec![],
383 case: multi_byte_char_across_chunk_boundary,
384 text: "0123456789abcdeΔ123456789abcdef01234",
385 source_file_start_pos: 0,
387 multi_byte_chars: vec![(15, 2)],
388 non_narrow_chars: vec![],
392 case: multi_byte_char_across_chunk_boundary_tail,
393 text: "0123456789abcdeΔ....",
394 source_file_start_pos: 0,
396 multi_byte_chars: vec![(15, 2)],
397 non_narrow_chars: vec![],
401 case: non_narrow_short,
403 source_file_start_pos: 0,
405 multi_byte_chars: vec![],
406 non_narrow_chars: vec![(1, 4)],
410 case: non_narrow_long,
411 text: "01\t3456789abcdef01234567\u{07}9",
412 source_file_start_pos: 0,
414 multi_byte_chars: vec![],
415 non_narrow_chars: vec![(2, 4), (24, 0)],
419 case: output_offset_all,
420 text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf",
421 source_file_start_pos: 1000,
422 lines: vec![0 + 1000, 7 + 1000, 27 + 1000],
423 multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)],
424 non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)],