library/core/src/slice/ascii.rs

   1 //! Operations on ASCII `[u8]`.
   2
   3 use crate::ascii;
   4 use crate::fmt::{self, Write};
   5 use crate::iter;
   6 use crate::mem;
   7 use crate::ops;
   8
   9 #[cfg(not(test))]
  10 impl [u8] {
  11     /// Checks if all bytes in this slice are within the ASCII range.
  12     #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
  13     #[must_use]
  14     #[inline]
  15     pub fn is_ascii(&self) -> bool {
  16         is_ascii(self)
  17     }
  18
  19     /// Checks that two slices are an ASCII case-insensitive match.
  20     ///
  21     /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
  22     /// but without allocating and copying temporaries.
  23     #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
  24     #[must_use]
  25     #[inline]
  26     pub fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
  27         self.len() == other.len() && iter::zip(self, other).all(|(a, b)| a.eq_ignore_ascii_case(b))
  28     }
  29
  30     /// Converts this slice to its ASCII upper case equivalent in-place.
  31     ///
  32     /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
  33     /// but non-ASCII letters are unchanged.
  34     ///
  35     /// To return a new uppercased value without modifying the existing one, use
  36     /// [`to_ascii_uppercase`].
  37     ///
  38     /// [`to_ascii_uppercase`]: #method.to_ascii_uppercase
  39     #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
  40     #[inline]
  41     pub fn make_ascii_uppercase(&mut self) {
  42         for byte in self {
  43             byte.make_ascii_uppercase();
  44         }
  45     }
  46
  47     /// Converts this slice to its ASCII lower case equivalent in-place.
  48     ///
  49     /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
  50     /// but non-ASCII letters are unchanged.
  51     ///
  52     /// To return a new lowercased value without modifying the existing one, use
  53     /// [`to_ascii_lowercase`].
  54     ///
  55     /// [`to_ascii_lowercase`]: #method.to_ascii_lowercase
  56     #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
  57     #[inline]
  58     pub fn make_ascii_lowercase(&mut self) {
  59         for byte in self {
  60             byte.make_ascii_lowercase();
  61         }
  62     }
  63
  64     /// Returns an iterator that produces an escaped version of this slice,
  65     /// treating it as an ASCII string.
  66     ///
  67     /// # Examples
  68     ///
  69     /// ```
  70     ///
  71     /// let s = b"0\t\r\n'\"\\\x9d";
  72     /// let escaped = s.escape_ascii().to_string();
  73     /// assert_eq!(escaped, "0\\t\\r\\n\\'\\\"\\\\\\x9d");
  74     /// ```
  75     #[must_use = "this returns the escaped bytes as an iterator, \
  76                   without modifying the original"]
  77     #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
  78     pub fn escape_ascii(&self) -> EscapeAscii<'_> {
  79         EscapeAscii { inner: self.iter().flat_map(EscapeByte) }
  80     }
  81
  82     /// Returns a byte slice with leading ASCII whitespace bytes removed.
  83     ///
  84     /// 'Whitespace' refers to the definition used by
  85     /// `u8::is_ascii_whitespace`.
  86     ///
  87     /// # Examples
  88     ///
  89     /// ```
  90     /// #![feature(byte_slice_trim_ascii)]
  91     ///
  92     /// assert_eq!(b" \t hello world\n".trim_ascii_start(), b"hello world\n");
  93     /// assert_eq!(b"  ".trim_ascii_start(), b"");
  94     /// assert_eq!(b"".trim_ascii_start(), b"");
  95     /// ```
  96     #[unstable(feature = "byte_slice_trim_ascii", issue = "94035")]
  97     pub const fn trim_ascii_start(&self) -> &[u8] {
  98         let mut bytes = self;
  99         // Note: A pattern matching based approach (instead of indexing) allows
 100         // making the function const.
 101         while let [first, rest @ ..] = bytes {
 102             if first.is_ascii_whitespace() {
 103                 bytes = rest;
 104             } else {
 105                 break;
 106             }
 107         }
 108         bytes
 109     }
 110
 111     /// Returns a byte slice with trailing ASCII whitespace bytes removed.
 112     ///
 113     /// 'Whitespace' refers to the definition used by
 114     /// `u8::is_ascii_whitespace`.
 115     ///
 116     /// # Examples
 117     ///
 118     /// ```
 119     /// #![feature(byte_slice_trim_ascii)]
 120     ///
 121     /// assert_eq!(b"\r hello world\n ".trim_ascii_end(), b"\r hello world");
 122     /// assert_eq!(b"  ".trim_ascii_end(), b"");
 123     /// assert_eq!(b"".trim_ascii_end(), b"");
 124     /// ```
 125     #[unstable(feature = "byte_slice_trim_ascii", issue = "94035")]
 126     pub const fn trim_ascii_end(&self) -> &[u8] {
 127         let mut bytes = self;
 128         // Note: A pattern matching based approach (instead of indexing) allows
 129         // making the function const.
 130         while let [rest @ .., last] = bytes {
 131             if last.is_ascii_whitespace() {
 132                 bytes = rest;
 133             } else {
 134                 break;
 135             }
 136         }
 137         bytes
 138     }
 139
 140     /// Returns a byte slice with leading and trailing ASCII whitespace bytes
 141     /// removed.
 142     ///
 143     /// 'Whitespace' refers to the definition used by
 144     /// `u8::is_ascii_whitespace`.
 145     ///
 146     /// # Examples
 147     ///
 148     /// ```
 149     /// #![feature(byte_slice_trim_ascii)]
 150     ///
 151     /// assert_eq!(b"\r hello world\n ".trim_ascii(), b"hello world");
 152     /// assert_eq!(b"  ".trim_ascii(), b"");
 153     /// assert_eq!(b"".trim_ascii(), b"");
 154     /// ```
 155     #[unstable(feature = "byte_slice_trim_ascii", issue = "94035")]
 156     pub const fn trim_ascii(&self) -> &[u8] {
 157         self.trim_ascii_start().trim_ascii_end()
 158     }
 159 }
 160
 161 impl_fn_for_zst! {
 162     #[derive(Clone)]
 163     struct EscapeByte impl Fn = |byte: &u8| -> ascii::EscapeDefault {
 164         ascii::escape_default(*byte)
 165     };
 166 }
 167
 168 /// An iterator over the escaped version of a byte slice.
 169 ///
 170 /// This `struct` is created by the [`slice::escape_ascii`] method. See its
 171 /// documentation for more information.
 172 #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
 173 #[derive(Clone)]
 174 #[must_use = "iterators are lazy and do nothing unless consumed"]
 175 pub struct EscapeAscii<'a> {
 176     inner: iter::FlatMap<super::Iter<'a, u8>, ascii::EscapeDefault, EscapeByte>,
 177 }
 178
 179 #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
 180 impl<'a> iter::Iterator for EscapeAscii<'a> {
 181     type Item = u8;
 182     #[inline]
 183     fn next(&mut self) -> Option<u8> {
 184         self.inner.next()
 185     }
 186     #[inline]
 187     fn size_hint(&self) -> (usize, Option<usize>) {
 188         self.inner.size_hint()
 189     }
 190     #[inline]
 191     fn try_fold<Acc, Fold, R>(&mut self, init: Acc, fold: Fold) -> R
 192     where
 193         Fold: FnMut(Acc, Self::Item) -> R,
 194         R: ops::Try<Output = Acc>,
 195     {
 196         self.inner.try_fold(init, fold)
 197     }
 198     #[inline]
 199     fn fold<Acc, Fold>(self, init: Acc, fold: Fold) -> Acc
 200     where
 201         Fold: FnMut(Acc, Self::Item) -> Acc,
 202     {
 203         self.inner.fold(init, fold)
 204     }
 205     #[inline]
 206     fn last(mut self) -> Option<u8> {
 207         self.next_back()
 208     }
 209 }
 210
 211 #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
 212 impl<'a> iter::DoubleEndedIterator for EscapeAscii<'a> {
 213     fn next_back(&mut self) -> Option<u8> {
 214         self.inner.next_back()
 215     }
 216 }
 217 #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
 218 impl<'a> iter::FusedIterator for EscapeAscii<'a> {}
 219 #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
 220 impl<'a> fmt::Display for EscapeAscii<'a> {
 221     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 222         self.clone().try_for_each(|b| f.write_char(b as char))
 223     }
 224 }
 225 #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
 226 impl<'a> fmt::Debug for EscapeAscii<'a> {
 227     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 228         f.debug_struct("EscapeAscii").finish_non_exhaustive()
 229     }
 230 }
 231
 232 /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
 233 /// from `../str/mod.rs`, which does something similar for utf8 validation.
 234 #[inline]
 235 fn contains_nonascii(v: usize) -> bool {
 236     const NONASCII_MASK: usize = usize::repeat_u8(0x80);
 237     (NONASCII_MASK & v) != 0
 238 }
 239
 240 /// Optimized ASCII test that will use usize-at-a-time operations instead of
 241 /// byte-at-a-time operations (when possible).
 242 ///
 243 /// The algorithm we use here is pretty simple. If `s` is too short, we just
 244 /// check each byte and be done with it. Otherwise:
 245 ///
 246 /// - Read the first word with an unaligned load.
 247 /// - Align the pointer, read subsequent words until end with aligned loads.
 248 /// - Read the last `usize` from `s` with an unaligned load.
 249 ///
 250 /// If any of these loads produces something for which `contains_nonascii`
 251 /// (above) returns true, then we know the answer is false.
 252 #[inline]
 253 fn is_ascii(s: &[u8]) -> bool {
 254     const USIZE_SIZE: usize = mem::size_of::<usize>();
 255
 256     let len = s.len();
 257     let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
 258
 259     // If we wouldn't gain anything from the word-at-a-time implementation, fall
 260     // back to a scalar loop.
 261     //
 262     // We also do this for architectures where `size_of::<usize>()` isn't
 263     // sufficient alignment for `usize`, because it's a weird edge case.
 264     if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
 265         return s.iter().all(|b| b.is_ascii());
 266     }
 267
 268     // We always read the first word unaligned, which means `align_offset` is
 269     // 0, we'd read the same value again for the aligned read.
 270     let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };
 271
 272     let start = s.as_ptr();
 273     // SAFETY: We verify `len < USIZE_SIZE` above.
 274     let first_word = unsafe { (start as *const usize).read_unaligned() };
 275
 276     if contains_nonascii(first_word) {
 277         return false;
 278     }
 279     // We checked this above, somewhat implicitly. Note that `offset_to_aligned`
 280     // is either `align_offset` or `USIZE_SIZE`, both of are explicitly checked
 281     // above.
 282     debug_assert!(offset_to_aligned <= len);
 283
 284     // SAFETY: word_ptr is the (properly aligned) usize ptr we use to read the
 285     // middle chunk of the slice.
 286     let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };
 287
 288     // `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
 289     let mut byte_pos = offset_to_aligned;
 290
 291     // Paranoia check about alignment, since we're about to do a bunch of
 292     // unaligned loads. In practice this should be impossible barring a bug in
 293     // `align_offset` though.
 294     debug_assert_eq!(word_ptr.addr() % mem::align_of::<usize>(), 0);
 295
 296     // Read subsequent words until the last aligned word, excluding the last
 297     // aligned word by itself to be done in tail check later, to ensure that
 298     // tail is always one `usize` at most to extra branch `byte_pos == len`.
 299     while byte_pos < len - USIZE_SIZE {
 300         debug_assert!(
 301             // Sanity check that the read is in bounds
 302             (word_ptr.addr() + USIZE_SIZE) <= start.addr().wrapping_add(len) &&
 303             // And that our assumptions about `byte_pos` hold.
 304             (word_ptr.addr() - start.addr()) == byte_pos
 305         );
 306
 307         // SAFETY: We know `word_ptr` is properly aligned (because of
 308         // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
 309         let word = unsafe { word_ptr.read() };
 310         if contains_nonascii(word) {
 311             return false;
 312         }
 313
 314         byte_pos += USIZE_SIZE;
 315         // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
 316         // after this `add`, `word_ptr` will be at most one-past-the-end.
 317         word_ptr = unsafe { word_ptr.add(1) };
 318     }
 319
 320     // Sanity check to ensure there really is only one `usize` left. This should
 321     // be guaranteed by our loop condition.
 322     debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);
 323
 324     // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
 325     let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };
 326
 327     !contains_nonascii(last_word)
 328 }