]> git.lizzy.rs Git - rust.git/blob - compiler/rustc_span/src/analyze_source_file.rs
Rollup merge of #105034 - HintringerFabian:improve_iterator_flatten_doc, r=cuviper
[rust.git] / compiler / rustc_span / src / analyze_source_file.rs
1 use super::*;
2 use unicode_width::UnicodeWidthChar;
3
4 #[cfg(test)]
5 mod tests;
6
7 /// Finds all newlines, multi-byte characters, and non-narrow characters in a
8 /// SourceFile.
9 ///
10 /// This function will use an SSE2 enhanced implementation if hardware support
11 /// is detected at runtime.
12 pub fn analyze_source_file(
13     src: &str,
14     source_file_start_pos: BytePos,
15 ) -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
16     let mut lines = vec![source_file_start_pos];
17     let mut multi_byte_chars = vec![];
18     let mut non_narrow_chars = vec![];
19
20     // Calls the right implementation, depending on hardware support available.
21     analyze_source_file_dispatch(
22         src,
23         source_file_start_pos,
24         &mut lines,
25         &mut multi_byte_chars,
26         &mut non_narrow_chars,
27     );
28
29     // The code above optimistically registers a new line *after* each \n
30     // it encounters. If that point is already outside the source_file, remove
31     // it again.
32     if let Some(&last_line_start) = lines.last() {
33         let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
34         assert!(source_file_end >= last_line_start);
35         if last_line_start == source_file_end {
36             lines.pop();
37         }
38     }
39
40     (lines, multi_byte_chars, non_narrow_chars)
41 }
42
43 cfg_if::cfg_if! {
44     if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
45         fn analyze_source_file_dispatch(src: &str,
46                                     source_file_start_pos: BytePos,
47                                     lines: &mut Vec<BytePos>,
48                                     multi_byte_chars: &mut Vec<MultiByteChar>,
49                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
50             if is_x86_feature_detected!("sse2") {
51                 unsafe {
52                     analyze_source_file_sse2(src,
53                                          source_file_start_pos,
54                                          lines,
55                                          multi_byte_chars,
56                                          non_narrow_chars);
57                 }
58             } else {
59                 analyze_source_file_generic(src,
60                                         src.len(),
61                                         source_file_start_pos,
62                                         lines,
63                                         multi_byte_chars,
64                                         non_narrow_chars);
65
66             }
67         }
68
69         /// Checks 16 byte chunks of text at a time. If the chunk contains
70         /// something other than printable ASCII characters and newlines, the
71         /// function falls back to the generic implementation. Otherwise it uses
72         /// SSE2 intrinsics to quickly find all newlines.
73         #[target_feature(enable = "sse2")]
74         unsafe fn analyze_source_file_sse2(src: &str,
75                                        output_offset: BytePos,
76                                        lines: &mut Vec<BytePos>,
77                                        multi_byte_chars: &mut Vec<MultiByteChar>,
78                                        non_narrow_chars: &mut Vec<NonNarrowChar>) {
79             #[cfg(target_arch = "x86")]
80             use std::arch::x86::*;
81             #[cfg(target_arch = "x86_64")]
82             use std::arch::x86_64::*;
83
84             const CHUNK_SIZE: usize = 16;
85
86             let src_bytes = src.as_bytes();
87
88             let chunk_count = src.len() / CHUNK_SIZE;
89
90             // This variable keeps track of where we should start decoding a
91             // chunk. If a multi-byte character spans across chunk boundaries,
92             // we need to skip that part in the next chunk because we already
93             // handled it.
94             let mut intra_chunk_offset = 0;
95
96             for chunk_index in 0 .. chunk_count {
97                 let ptr = src_bytes.as_ptr() as *const __m128i;
98                 // We don't know if the pointer is aligned to 16 bytes, so we
99                 // use `loadu`, which supports unaligned loading.
100                 let chunk = _mm_loadu_si128(ptr.add(chunk_index));
101
102                 // For character in the chunk, see if its byte value is < 0, which
103                 // indicates that it's part of a UTF-8 char.
104                 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
105                 // Create a bit mask from the comparison results.
106                 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
107
108                 // If the bit mask is all zero, we only have ASCII chars here:
109                 if multibyte_mask == 0 {
110                     assert!(intra_chunk_offset == 0);
111
112                     // Check if there are any control characters in the chunk. All
113                     // control characters that we can encounter at this point have a
114                     // byte value less than 32 or ...
115                     let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32));
116                     let control_char_mask0 = _mm_movemask_epi8(control_char_test0);
117
118                     // ... it's the ASCII 'DEL' character with a value of 127.
119                     let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127));
120                     let control_char_mask1 = _mm_movemask_epi8(control_char_test1);
121
122                     let control_char_mask = control_char_mask0 | control_char_mask1;
123
124                     if control_char_mask != 0 {
125                         // Check for newlines in the chunk
126                         let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
127                         let newlines_mask = _mm_movemask_epi8(newlines_test);
128
129                         if control_char_mask == newlines_mask {
130                             // All control characters are newlines, record them
131                             let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
132                             let output_offset = output_offset +
133                                 BytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
134
135                             loop {
136                                 let index = newlines_mask.trailing_zeros();
137
138                                 if index >= CHUNK_SIZE as u32 {
139                                     // We have arrived at the end of the chunk.
140                                     break
141                                 }
142
143                                 lines.push(BytePos(index) + output_offset);
144
145                                 // Clear the bit, so we can find the next one.
146                                 newlines_mask &= (!1) << index;
147                             }
148
149                             // We are done for this chunk. All control characters were
150                             // newlines and we took care of those.
151                             continue
152                         } else {
153                             // Some of the control characters are not newlines,
154                             // fall through to the slow path below.
155                         }
156                     } else {
157                         // No control characters, nothing to record for this chunk
158                         continue
159                     }
160                 }
161
162                 // The slow path.
163                 // There are control chars in here, fallback to generic decoding.
164                 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
165                 intra_chunk_offset = analyze_source_file_generic(
166                     &src[scan_start .. ],
167                     CHUNK_SIZE - intra_chunk_offset,
168                     BytePos::from_usize(scan_start) + output_offset,
169                     lines,
170                     multi_byte_chars,
171                     non_narrow_chars
172                 );
173             }
174
175             // There might still be a tail left to analyze
176             let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
177             if tail_start < src.len() {
178                 analyze_source_file_generic(&src[tail_start ..],
179                                         src.len() - tail_start,
180                                         output_offset + BytePos::from_usize(tail_start),
181                                         lines,
182                                         multi_byte_chars,
183                                         non_narrow_chars);
184             }
185         }
186     } else {
187
188         // The target (or compiler version) does not support SSE2 ...
189         fn analyze_source_file_dispatch(src: &str,
190                                     source_file_start_pos: BytePos,
191                                     lines: &mut Vec<BytePos>,
192                                     multi_byte_chars: &mut Vec<MultiByteChar>,
193                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
194             analyze_source_file_generic(src,
195                                     src.len(),
196                                     source_file_start_pos,
197                                     lines,
198                                     multi_byte_chars,
199                                     non_narrow_chars);
200         }
201     }
202 }
203
204 // `scan_len` determines the number of bytes in `src` to scan. Note that the
205 // function can read past `scan_len` if a multi-byte character start within the
206 // range but extends past it. The overflow is returned by the function.
207 fn analyze_source_file_generic(
208     src: &str,
209     scan_len: usize,
210     output_offset: BytePos,
211     lines: &mut Vec<BytePos>,
212     multi_byte_chars: &mut Vec<MultiByteChar>,
213     non_narrow_chars: &mut Vec<NonNarrowChar>,
214 ) -> usize {
215     assert!(src.len() >= scan_len);
216     let mut i = 0;
217     let src_bytes = src.as_bytes();
218
219     while i < scan_len {
220         let byte = unsafe {
221             // We verified that i < scan_len <= src.len()
222             *src_bytes.get_unchecked(i)
223         };
224
225         // How much to advance in order to get to the next UTF-8 char in the
226         // string.
227         let mut char_len = 1;
228
229         if byte < 32 {
230             // This is an ASCII control character, it could be one of the cases
231             // that are interesting to us.
232
233             let pos = BytePos::from_usize(i) + output_offset;
234
235             match byte {
236                 b'\n' => {
237                     lines.push(pos + BytePos(1));
238                 }
239                 b'\t' => {
240                     non_narrow_chars.push(NonNarrowChar::Tab(pos));
241                 }
242                 _ => {
243                     non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
244                 }
245             }
246         } else if byte >= 127 {
247             // The slow path:
248             // This is either ASCII control character "DEL" or the beginning of
249             // a multibyte char. Just decode to `char`.
250             let c = src[i..].chars().next().unwrap();
251             char_len = c.len_utf8();
252
253             let pos = BytePos::from_usize(i) + output_offset;
254
255             if char_len > 1 {
256                 assert!((2..=4).contains(&char_len));
257                 let mbc = MultiByteChar { pos, bytes: char_len as u8 };
258                 multi_byte_chars.push(mbc);
259             }
260
261             // Assume control characters are zero width.
262             // FIXME: How can we decide between `width` and `width_cjk`?
263             let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
264
265             if char_width != 1 {
266                 non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
267             }
268         }
269
270         i += char_len;
271     }
272
273     i - scan_len
274 }