src/libsyntax_pos/analyze_source_file.rs

   1 use unicode_width::UnicodeWidthChar;
   2 use super::*;
   3
   4 #[cfg(test)]
   5 mod tests;
   6
   7 /// Finds all newlines, multi-byte characters, and non-narrow characters in a
   8 /// SourceFile.
   9 ///
  10 /// This function will use an SSE2 enhanced implementation if hardware support
  11 /// is detected at runtime.
  12 pub fn analyze_source_file(
  13     src: &str,
  14     source_file_start_pos: BytePos)
  15     -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>)
  16 {
  17     let mut lines = vec![source_file_start_pos];
  18     let mut multi_byte_chars = vec![];
  19     let mut non_narrow_chars = vec![];
  20
  21     // Calls the right implementation, depending on hardware support available.
  22     analyze_source_file_dispatch(src,
  23                              source_file_start_pos,
  24                              &mut lines,
  25                              &mut multi_byte_chars,
  26                              &mut non_narrow_chars);
  27
  28     // The code above optimistically registers a new line *after* each \n
  29     // it encounters. If that point is already outside the source_file, remove
  30     // it again.
  31     if let Some(&last_line_start) = lines.last() {
  32         let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
  33         assert!(source_file_end >= last_line_start);
  34         if last_line_start == source_file_end {
  35             lines.pop();
  36         }
  37     }
  38
  39     (lines, multi_byte_chars, non_narrow_chars)
  40 }
  41
  42 cfg_if::cfg_if! {
  43     if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] {
  44         fn analyze_source_file_dispatch(src: &str,
  45                                     source_file_start_pos: BytePos,
  46                                     lines: &mut Vec<BytePos>,
  47                                     multi_byte_chars: &mut Vec<MultiByteChar>,
  48                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
  49             if is_x86_feature_detected!("sse2") {
  50                 unsafe {
  51                     analyze_source_file_sse2(src,
  52                                          source_file_start_pos,
  53                                          lines,
  54                                          multi_byte_chars,
  55                                          non_narrow_chars);
  56                 }
  57             } else {
  58                 analyze_source_file_generic(src,
  59                                         src.len(),
  60                                         source_file_start_pos,
  61                                         lines,
  62                                         multi_byte_chars,
  63                                         non_narrow_chars);
  64
  65             }
  66         }
  67
  68         /// Checks 16 byte chunks of text at a time. If the chunk contains
  69         /// something other than printable ASCII characters and newlines, the
  70         /// function falls back to the generic implementation. Otherwise it uses
  71         /// SSE2 intrinsics to quickly find all newlines.
  72         #[target_feature(enable = "sse2")]
  73         unsafe fn analyze_source_file_sse2(src: &str,
  74                                        output_offset: BytePos,
  75                                        lines: &mut Vec<BytePos>,
  76                                        multi_byte_chars: &mut Vec<MultiByteChar>,
  77                                        non_narrow_chars: &mut Vec<NonNarrowChar>) {
  78             #[cfg(target_arch = "x86")]
  79             use std::arch::x86::*;
  80             #[cfg(target_arch = "x86_64")]
  81             use std::arch::x86_64::*;
  82
  83             const CHUNK_SIZE: usize = 16;
  84
  85             let src_bytes = src.as_bytes();
  86
  87             let chunk_count = src.len() / CHUNK_SIZE;
  88
  89             // This variable keeps track of where we should start decoding a
  90             // chunk. If a multi-byte character spans across chunk boundaries,
  91             // we need to skip that part in the next chunk because we already
  92             // handled it.
  93             let mut intra_chunk_offset = 0;
  94
  95             for chunk_index in 0 .. chunk_count {
  96                 let ptr = src_bytes.as_ptr() as *const __m128i;
  97                 // We don't know if the pointer is aligned to 16 bytes, so we
  98                 // use `loadu`, which supports unaligned loading.
  99                 let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize));
 100
 101                 // For character in the chunk, see if its byte value is < 0, which
 102                 // indicates that it's part of a UTF-8 char.
 103                 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
 104                 // Create a bit mask from the comparison results.
 105                 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
 106
 107                 // If the bit mask is all zero, we only have ASCII chars here:
 108                 if multibyte_mask == 0 {
 109                     assert!(intra_chunk_offset == 0);
 110
 111                     // Check if there are any control characters in the chunk. All
 112                     // control characters that we can encounter at this point have a
 113                     // byte value less than 32 or ...
 114                     let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32));
 115                     let control_char_mask0 = _mm_movemask_epi8(control_char_test0);
 116
 117                     // ... it's the ASCII 'DEL' character with a value of 127.
 118                     let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127));
 119                     let control_char_mask1 = _mm_movemask_epi8(control_char_test1);
 120
 121                     let control_char_mask = control_char_mask0 | control_char_mask1;
 122
 123                     if control_char_mask != 0 {
 124                         // Check for newlines in the chunk
 125                         let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
 126                         let newlines_mask = _mm_movemask_epi8(newlines_test);
 127
 128                         if control_char_mask == newlines_mask {
 129                             // All control characters are newlines, record them
 130                             let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
 131                             let output_offset = output_offset +
 132                                 BytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
 133
 134                             loop {
 135                                 let index = newlines_mask.trailing_zeros();
 136
 137                                 if index >= CHUNK_SIZE as u32 {
 138                                     // We have arrived at the end of the chunk.
 139                                     break
 140                                 }
 141
 142                                 lines.push(BytePos(index) + output_offset);
 143
 144                                 // Clear the bit, so we can find the next one.
 145                                 newlines_mask &= (!1) << index;
 146                             }
 147
 148                             // We are done for this chunk. All control characters were
 149                             // newlines and we took care of those.
 150                             continue
 151                         } else {
 152                             // Some of the control characters are not newlines,
 153                             // fall through to the slow path below.
 154                         }
 155                     } else {
 156                         // No control characters, nothing to record for this chunk
 157                         continue
 158                     }
 159                 }
 160
 161                 // The slow path.
 162                 // There are control chars in here, fallback to generic decoding.
 163                 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
 164                 intra_chunk_offset = analyze_source_file_generic(
 165                     &src[scan_start .. ],
 166                     CHUNK_SIZE - intra_chunk_offset,
 167                     BytePos::from_usize(scan_start) + output_offset,
 168                     lines,
 169                     multi_byte_chars,
 170                     non_narrow_chars
 171                 );
 172             }
 173
 174             // There might still be a tail left to analyze
 175             let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
 176             if tail_start < src.len() {
 177                 analyze_source_file_generic(&src[tail_start as usize ..],
 178                                         src.len() - tail_start,
 179                                         output_offset + BytePos::from_usize(tail_start),
 180                                         lines,
 181                                         multi_byte_chars,
 182                                         non_narrow_chars);
 183             }
 184         }
 185     } else {
 186
 187         // The target (or compiler version) does not support SSE2 ...
 188         fn analyze_source_file_dispatch(src: &str,
 189                                     source_file_start_pos: BytePos,
 190                                     lines: &mut Vec<BytePos>,
 191                                     multi_byte_chars: &mut Vec<MultiByteChar>,
 192                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
 193             analyze_source_file_generic(src,
 194                                     src.len(),
 195                                     source_file_start_pos,
 196                                     lines,
 197                                     multi_byte_chars,
 198                                     non_narrow_chars);
 199         }
 200     }
 201 }
 202
 203 // `scan_len` determines the number of bytes in `src` to scan. Note that the
 204 // function can read past `scan_len` if a multi-byte character start within the
 205 // range but extends past it. The overflow is returned by the function.
 206 fn analyze_source_file_generic(src: &str,
 207                            scan_len: usize,
 208                            output_offset: BytePos,
 209                            lines: &mut Vec<BytePos>,
 210                            multi_byte_chars: &mut Vec<MultiByteChar>,
 211                            non_narrow_chars: &mut Vec<NonNarrowChar>)
 212                            -> usize
 213 {
 214     assert!(src.len() >= scan_len);
 215     let mut i = 0;
 216     let src_bytes = src.as_bytes();
 217
 218     while i < scan_len {
 219         let byte = unsafe {
 220             // We verified that i < scan_len <= src.len()
 221             *src_bytes.get_unchecked(i as usize)
 222         };
 223
 224         // How much to advance in order to get to the next UTF-8 char in the
 225         // string.
 226         let mut char_len = 1;
 227
 228         if byte < 32 {
 229             // This is an ASCII control character, it could be one of the cases
 230             // that are interesting to us.
 231
 232             let pos = BytePos::from_usize(i) + output_offset;
 233
 234             match byte {
 235                 b'\n' => {
 236                     lines.push(pos + BytePos(1));
 237                 }
 238                 b'\t' => {
 239                     non_narrow_chars.push(NonNarrowChar::Tab(pos));
 240                 }
 241                 _ => {
 242                     non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
 243                 }
 244             }
 245         } else if byte >= 127 {
 246             // The slow path:
 247             // This is either ASCII control character "DEL" or the beginning of
 248             // a multibyte char. Just decode to `char`.
 249             let c = (&src[i..]).chars().next().unwrap();
 250             char_len = c.len_utf8();
 251
 252             let pos = BytePos::from_usize(i) + output_offset;
 253
 254             if char_len > 1 {
 255                 assert!(char_len >=2 && char_len <= 4);
 256                 let mbc = MultiByteChar {
 257                     pos,
 258                     bytes: char_len as u8,
 259                 };
 260                 multi_byte_chars.push(mbc);
 261             }
 262
 263             // Assume control characters are zero width.
 264             // FIXME: How can we decide between `width` and `width_cjk`?
 265             let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
 266
 267             if char_width != 1 {
 268                 non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
 269             }
 270         }
 271
 272         i += char_len;
 273     }
 274
 275     i - scan_len
 276 }