src/libsyntax_pos/analyze_source_file.rs

   1 use unicode_width::UnicodeWidthChar;
   2 use super::*;
   3
   4 /// Finds all newlines, multi-byte characters, and non-narrow characters in a
   5 /// SourceFile.
   6 ///
   7 /// This function will use an SSE2 enhanced implementation if hardware support
   8 /// is detected at runtime.
   9 pub fn analyze_source_file(
  10     src: &str,
  11     source_file_start_pos: BytePos)
  12     -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>)
  13 {
  14     let mut lines = vec![source_file_start_pos];
  15     let mut multi_byte_chars = vec![];
  16     let mut non_narrow_chars = vec![];
  17
  18     // Calls the right implementation, depending on hardware support available.
  19     analyze_source_file_dispatch(src,
  20                              source_file_start_pos,
  21                              &mut lines,
  22                              &mut multi_byte_chars,
  23                              &mut non_narrow_chars);
  24
  25     // The code above optimistically registers a new line *after* each \n
  26     // it encounters. If that point is already outside the source_file, remove
  27     // it again.
  28     if let Some(&last_line_start) = lines.last() {
  29         let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
  30         assert!(source_file_end >= last_line_start);
  31         if last_line_start == source_file_end {
  32             lines.pop();
  33         }
  34     }
  35
  36     (lines, multi_byte_chars, non_narrow_chars)
  37 }
  38
  39 cfg_if::cfg_if! {
  40     if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] {
  41         fn analyze_source_file_dispatch(src: &str,
  42                                     source_file_start_pos: BytePos,
  43                                     lines: &mut Vec<BytePos>,
  44                                     multi_byte_chars: &mut Vec<MultiByteChar>,
  45                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
  46             if is_x86_feature_detected!("sse2") {
  47                 unsafe {
  48                     analyze_source_file_sse2(src,
  49                                          source_file_start_pos,
  50                                          lines,
  51                                          multi_byte_chars,
  52                                          non_narrow_chars);
  53                 }
  54             } else {
  55                 analyze_source_file_generic(src,
  56                                         src.len(),
  57                                         source_file_start_pos,
  58                                         lines,
  59                                         multi_byte_chars,
  60                                         non_narrow_chars);
  61
  62             }
  63         }
  64
  65         /// Checks 16 byte chunks of text at a time. If the chunk contains
  66         /// something other than printable ASCII characters and newlines, the
  67         /// function falls back to the generic implementation. Otherwise it uses
  68         /// SSE2 intrinsics to quickly find all newlines.
  69         #[target_feature(enable = "sse2")]
  70         unsafe fn analyze_source_file_sse2(src: &str,
  71                                        output_offset: BytePos,
  72                                        lines: &mut Vec<BytePos>,
  73                                        multi_byte_chars: &mut Vec<MultiByteChar>,
  74                                        non_narrow_chars: &mut Vec<NonNarrowChar>) {
  75             #[cfg(target_arch = "x86")]
  76             use std::arch::x86::*;
  77             #[cfg(target_arch = "x86_64")]
  78             use std::arch::x86_64::*;
  79
  80             const CHUNK_SIZE: usize = 16;
  81
  82             let src_bytes = src.as_bytes();
  83
  84             let chunk_count = src.len() / CHUNK_SIZE;
  85
  86             // This variable keeps track of where we should start decoding a
  87             // chunk. If a multi-byte character spans across chunk boundaries,
  88             // we need to skip that part in the next chunk because we already
  89             // handled it.
  90             let mut intra_chunk_offset = 0;
  91
  92             for chunk_index in 0 .. chunk_count {
  93                 let ptr = src_bytes.as_ptr() as *const __m128i;
  94                 // We don't know if the pointer is aligned to 16 bytes, so we
  95                 // use `loadu`, which supports unaligned loading.
  96                 let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize));
  97
  98                 // For character in the chunk, see if its byte value is < 0, which
  99                 // indicates that it's part of a UTF-8 char.
 100                 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
 101                 // Create a bit mask from the comparison results.
 102                 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
 103
 104                 // If the bit mask is all zero, we only have ASCII chars here:
 105                 if multibyte_mask == 0 {
 106                     assert!(intra_chunk_offset == 0);
 107
 108                     // Check if there are any control characters in the chunk. All
 109                     // control characters that we can encounter at this point have a
 110                     // byte value less than 32 or ...
 111                     let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32));
 112                     let control_char_mask0 = _mm_movemask_epi8(control_char_test0);
 113
 114                     // ... it's the ASCII 'DEL' character with a value of 127.
 115                     let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127));
 116                     let control_char_mask1 = _mm_movemask_epi8(control_char_test1);
 117
 118                     let control_char_mask = control_char_mask0 | control_char_mask1;
 119
 120                     if control_char_mask != 0 {
 121                         // Check for newlines in the chunk
 122                         let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
 123                         let newlines_mask = _mm_movemask_epi8(newlines_test);
 124
 125                         if control_char_mask == newlines_mask {
 126                             // All control characters are newlines, record them
 127                             let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
 128                             let output_offset = output_offset +
 129                                 BytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
 130
 131                             loop {
 132                                 let index = newlines_mask.trailing_zeros();
 133
 134                                 if index >= CHUNK_SIZE as u32 {
 135                                     // We have arrived at the end of the chunk.
 136                                     break
 137                                 }
 138
 139                                 lines.push(BytePos(index) + output_offset);
 140
 141                                 // Clear the bit, so we can find the next one.
 142                                 newlines_mask &= (!1) << index;
 143                             }
 144
 145                             // We are done for this chunk. All control characters were
 146                             // newlines and we took care of those.
 147                             continue
 148                         } else {
 149                             // Some of the control characters are not newlines,
 150                             // fall through to the slow path below.
 151                         }
 152                     } else {
 153                         // No control characters, nothing to record for this chunk
 154                         continue
 155                     }
 156                 }
 157
 158                 // The slow path.
 159                 // There are control chars in here, fallback to generic decoding.
 160                 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
 161                 intra_chunk_offset = analyze_source_file_generic(
 162                     &src[scan_start .. ],
 163                     CHUNK_SIZE - intra_chunk_offset,
 164                     BytePos::from_usize(scan_start) + output_offset,
 165                     lines,
 166                     multi_byte_chars,
 167                     non_narrow_chars
 168                 );
 169             }
 170
 171             // There might still be a tail left to analyze
 172             let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
 173             if tail_start < src.len() {
 174                 analyze_source_file_generic(&src[tail_start as usize ..],
 175                                         src.len() - tail_start,
 176                                         output_offset + BytePos::from_usize(tail_start),
 177                                         lines,
 178                                         multi_byte_chars,
 179                                         non_narrow_chars);
 180             }
 181         }
 182     } else {
 183
 184         // The target (or compiler version) does not support SSE2 ...
 185         fn analyze_source_file_dispatch(src: &str,
 186                                     source_file_start_pos: BytePos,
 187                                     lines: &mut Vec<BytePos>,
 188                                     multi_byte_chars: &mut Vec<MultiByteChar>,
 189                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
 190             analyze_source_file_generic(src,
 191                                     src.len(),
 192                                     source_file_start_pos,
 193                                     lines,
 194                                     multi_byte_chars,
 195                                     non_narrow_chars);
 196         }
 197     }
 198 }
 199
 200 // `scan_len` determines the number of bytes in `src` to scan. Note that the
 201 // function can read past `scan_len` if a multi-byte character start within the
 202 // range but extends past it. The overflow is returned by the function.
 203 fn analyze_source_file_generic(src: &str,
 204                            scan_len: usize,
 205                            output_offset: BytePos,
 206                            lines: &mut Vec<BytePos>,
 207                            multi_byte_chars: &mut Vec<MultiByteChar>,
 208                            non_narrow_chars: &mut Vec<NonNarrowChar>)
 209                            -> usize
 210 {
 211     assert!(src.len() >= scan_len);
 212     let mut i = 0;
 213     let src_bytes = src.as_bytes();
 214
 215     while i < scan_len {
 216         let byte = unsafe {
 217             // We verified that i < scan_len <= src.len()
 218             *src_bytes.get_unchecked(i as usize)
 219         };
 220
 221         // How much to advance in order to get to the next UTF-8 char in the
 222         // string.
 223         let mut char_len = 1;
 224
 225         if byte < 32 {
 226             // This is an ASCII control character, it could be one of the cases
 227             // that are interesting to us.
 228
 229             let pos = BytePos::from_usize(i) + output_offset;
 230
 231             match byte {
 232                 b'\n' => {
 233                     lines.push(pos + BytePos(1));
 234                 }
 235                 b'\t' => {
 236                     non_narrow_chars.push(NonNarrowChar::Tab(pos));
 237                 }
 238                 _ => {
 239                     non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
 240                 }
 241             }
 242         } else if byte >= 127 {
 243             // The slow path:
 244             // This is either ASCII control character "DEL" or the beginning of
 245             // a multibyte char. Just decode to `char`.
 246             let c = (&src[i..]).chars().next().unwrap();
 247             char_len = c.len_utf8();
 248
 249             let pos = BytePos::from_usize(i) + output_offset;
 250
 251             if char_len > 1 {
 252                 assert!(char_len >=2 && char_len <= 4);
 253                 let mbc = MultiByteChar {
 254                     pos,
 255                     bytes: char_len as u8,
 256                 };
 257                 multi_byte_chars.push(mbc);
 258             }
 259
 260             // Assume control characters are zero width.
 261             // FIXME: How can we decide between `width` and `width_cjk`?
 262             let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
 263
 264             if char_width != 1 {
 265                 non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
 266             }
 267         }
 268
 269         i += char_len;
 270     }
 271
 272     i - scan_len
 273 }
 274
 275
 276
 277 macro_rules! test {
 278     (case: $test_name:ident,
 279      text: $text:expr,
 280      source_file_start_pos: $source_file_start_pos:expr,
 281      lines: $lines:expr,
 282      multi_byte_chars: $multi_byte_chars:expr,
 283      non_narrow_chars: $non_narrow_chars:expr,) => (
 284
 285     #[test]
 286     fn $test_name() {
 287
 288         let (lines, multi_byte_chars, non_narrow_chars) =
 289             analyze_source_file($text, BytePos($source_file_start_pos));
 290
 291         let expected_lines: Vec<BytePos> = $lines
 292             .into_iter()
 293             .map(|pos| BytePos(pos))
 294             .collect();
 295
 296         assert_eq!(lines, expected_lines);
 297
 298         let expected_mbcs: Vec<MultiByteChar> = $multi_byte_chars
 299             .into_iter()
 300             .map(|(pos, bytes)| MultiByteChar {
 301                 pos: BytePos(pos),
 302                 bytes,
 303             })
 304             .collect();
 305
 306         assert_eq!(multi_byte_chars, expected_mbcs);
 307
 308         let expected_nncs: Vec<NonNarrowChar> = $non_narrow_chars
 309             .into_iter()
 310             .map(|(pos, width)| {
 311                 NonNarrowChar::new(BytePos(pos), width)
 312             })
 313             .collect();
 314
 315         assert_eq!(non_narrow_chars, expected_nncs);
 316     })
 317 }
 318
 319 test!(
 320     case: empty_text,
 321     text: "",
 322     source_file_start_pos: 0,
 323     lines: vec![],
 324     multi_byte_chars: vec![],
 325     non_narrow_chars: vec![],
 326 );
 327
 328 test!(
 329     case: newlines_short,
 330     text: "a\nc",
 331     source_file_start_pos: 0,
 332     lines: vec![0, 2],
 333     multi_byte_chars: vec![],
 334     non_narrow_chars: vec![],
 335 );
 336
 337 test!(
 338     case: newlines_long,
 339     text: "012345678\nabcdef012345678\na",
 340     source_file_start_pos: 0,
 341     lines: vec![0, 10, 26],
 342     multi_byte_chars: vec![],
 343     non_narrow_chars: vec![],
 344 );
 345
 346 test!(
 347     case: newline_and_multi_byte_char_in_same_chunk,
 348     text: "01234β789\nbcdef0123456789abcdef",
 349     source_file_start_pos: 0,
 350     lines: vec![0, 11],
 351     multi_byte_chars: vec![(5, 2)],
 352     non_narrow_chars: vec![],
 353 );
 354
 355 test!(
 356     case: newline_and_control_char_in_same_chunk,
 357     text: "01234\u{07}6789\nbcdef0123456789abcdef",
 358     source_file_start_pos: 0,
 359     lines: vec![0, 11],
 360     multi_byte_chars: vec![],
 361     non_narrow_chars: vec![(5, 0)],
 362 );
 363
 364 test!(
 365     case: multi_byte_char_short,
 366     text: "aβc",
 367     source_file_start_pos: 0,
 368     lines: vec![0],
 369     multi_byte_chars: vec![(1, 2)],
 370     non_narrow_chars: vec![],
 371 );
 372
 373 test!(
 374     case: multi_byte_char_long,
 375     text: "0123456789abcΔf012345β",
 376     source_file_start_pos: 0,
 377     lines: vec![0],
 378     multi_byte_chars: vec![(13, 2), (22, 2)],
 379     non_narrow_chars: vec![],
 380 );
 381
 382 test!(
 383     case: multi_byte_char_across_chunk_boundary,
 384     text: "0123456789abcdeΔ123456789abcdef01234",
 385     source_file_start_pos: 0,
 386     lines: vec![0],
 387     multi_byte_chars: vec![(15, 2)],
 388     non_narrow_chars: vec![],
 389 );
 390
 391 test!(
 392     case: multi_byte_char_across_chunk_boundary_tail,
 393     text: "0123456789abcdeΔ....",
 394     source_file_start_pos: 0,
 395     lines: vec![0],
 396     multi_byte_chars: vec![(15, 2)],
 397     non_narrow_chars: vec![],
 398 );
 399
 400 test!(
 401     case: non_narrow_short,
 402     text: "0\t2",
 403     source_file_start_pos: 0,
 404     lines: vec![0],
 405     multi_byte_chars: vec![],
 406     non_narrow_chars: vec![(1, 4)],
 407 );
 408
 409 test!(
 410     case: non_narrow_long,
 411     text: "01\t3456789abcdef01234567\u{07}9",
 412     source_file_start_pos: 0,
 413     lines: vec![0],
 414     multi_byte_chars: vec![],
 415     non_narrow_chars: vec![(2, 4), (24, 0)],
 416 );
 417
 418 test!(
 419     case: output_offset_all,
 420     text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf",
 421     source_file_start_pos: 1000,
 422     lines: vec![0 + 1000, 7 + 1000, 27 + 1000],
 423     multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)],
 424     non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)],
 425 );