library/core/src/str/validations.rs

   1 //! Operations related to UTF-8 validation.
   2
   3 use crate::mem;
   4
   5 use super::Utf8Error;
   6
   7 /// Returns the initial codepoint accumulator for the first byte.
   8 /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
   9 /// for width 3, and 3 bits for width 4.
  10 #[inline]
  11 fn utf8_first_byte(byte: u8, width: u32) -> u32 {
  12     (byte & (0x7F >> width)) as u32
  13 }
  14
  15 /// Returns the value of `ch` updated with continuation byte `byte`.
  16 #[inline]
  17 fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
  18     (ch << 6) | (byte & CONT_MASK) as u32
  19 }
  20
  21 /// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
  22 /// bits `10`).
  23 #[inline]
  24 pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
  25     (byte as i8) < -64
  26 }
  27
  28 #[inline]
  29 fn unwrap_or_0(opt: Option<&u8>) -> u8 {
  30     match opt {
  31         Some(&byte) => byte,
  32         None => 0,
  33     }
  34 }
  35
  36 /// Reads the next code point out of a byte iterator (assuming a
  37 /// UTF-8-like encoding).
  38 #[unstable(feature = "str_internals", issue = "none")]
  39 #[inline]
  40 pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
  41     // Decode UTF-8
  42     let x = *bytes.next()?;
  43     if x < 128 {
  44         return Some(x as u32);
  45     }
  46
  47     // Multibyte case follows
  48     // Decode from a byte combination out of: [[[x y] z] w]
  49     // NOTE: Performance is sensitive to the exact formulation here
  50     let init = utf8_first_byte(x, 2);
  51     let y = unwrap_or_0(bytes.next());
  52     let mut ch = utf8_acc_cont_byte(init, y);
  53     if x >= 0xE0 {
  54         // [[x y z] w] case
  55         // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
  56         let z = unwrap_or_0(bytes.next());
  57         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
  58         ch = init << 12 | y_z;
  59         if x >= 0xF0 {
  60             // [x y z w] case
  61             // use only the lower 3 bits of `init`
  62             let w = unwrap_or_0(bytes.next());
  63             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
  64         }
  65     }
  66
  67     Some(ch)
  68 }
  69
  70 /// Reads the last code point out of a byte iterator (assuming a
  71 /// UTF-8-like encoding).
  72 #[inline]
  73 pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
  74 where
  75     I: DoubleEndedIterator<Item = &'a u8>,
  76 {
  77     // Decode UTF-8
  78     let w = match *bytes.next_back()? {
  79         next_byte if next_byte < 128 => return Some(next_byte as u32),
  80         back_byte => back_byte,
  81     };
  82
  83     // Multibyte case follows
  84     // Decode from a byte combination out of: [x [y [z w]]]
  85     let mut ch;
  86     let z = unwrap_or_0(bytes.next_back());
  87     ch = utf8_first_byte(z, 2);
  88     if utf8_is_cont_byte(z) {
  89         let y = unwrap_or_0(bytes.next_back());
  90         ch = utf8_first_byte(y, 3);
  91         if utf8_is_cont_byte(y) {
  92             let x = unwrap_or_0(bytes.next_back());
  93             ch = utf8_first_byte(x, 4);
  94             ch = utf8_acc_cont_byte(ch, y);
  95         }
  96         ch = utf8_acc_cont_byte(ch, z);
  97     }
  98     ch = utf8_acc_cont_byte(ch, w);
  99
 100     Some(ch)
 101 }
 102
 103 // use truncation to fit u64 into usize
 104 const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
 105
 106 /// Returns `true` if any byte in the word `x` is nonascii (>= 128).
 107 #[inline]
 108 fn contains_nonascii(x: usize) -> bool {
 109     (x & NONASCII_MASK) != 0
 110 }
 111
 112 /// Walks through `v` checking that it's a valid UTF-8 sequence,
 113 /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
 114 #[inline(always)]
 115 pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
 116     let mut index = 0;
 117     let len = v.len();
 118
 119     let usize_bytes = mem::size_of::<usize>();
 120     let ascii_block_size = 2 * usize_bytes;
 121     let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
 122     let align = v.as_ptr().align_offset(usize_bytes);
 123
 124     while index < len {
 125         let old_offset = index;
 126         macro_rules! err {
 127             ($error_len: expr) => {
 128                 return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
 129             };
 130         }
 131
 132         macro_rules! next {
 133             () => {{
 134                 index += 1;
 135                 // we needed data, but there was none: error!
 136                 if index >= len {
 137                     err!(None)
 138                 }
 139                 v[index]
 140             }};
 141         }
 142
 143         let first = v[index];
 144         if first >= 128 {
 145             let w = UTF8_CHAR_WIDTH[first as usize];
 146             // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
 147             //        first  C2 80        last DF BF
 148             // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
 149             //        first  E0 A0 80     last EF BF BF
 150             //   excluding surrogates codepoints  \u{d800} to  \u{dfff}
 151             //               ED A0 80 to       ED BF BF
 152             // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
 153             //        first  F0 90 80 80  last F4 8F BF BF
 154             //
 155             // Use the UTF-8 syntax from the RFC
 156             //
 157             // https://tools.ietf.org/html/rfc3629
 158             // UTF8-1      = %x00-7F
 159             // UTF8-2      = %xC2-DF UTF8-tail
 160             // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
 161             //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
 162             // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
 163             //               %xF4 %x80-8F 2( UTF8-tail )
 164             match w {
 165                 2 => {
 166                     if !utf8_is_cont_byte(next!()) {
 167                         err!(Some(1))
 168                     }
 169                 }
 170                 3 => {
 171                     match (first, next!()) {
 172                         (0xE0, 0xA0..=0xBF)
 173                         | (0xE1..=0xEC, 0x80..=0xBF)
 174                         | (0xED, 0x80..=0x9F)
 175                         | (0xEE..=0xEF, 0x80..=0xBF) => {}
 176                         _ => err!(Some(1)),
 177                     }
 178                     if !utf8_is_cont_byte(next!()) {
 179                         err!(Some(2))
 180                     }
 181                 }
 182                 4 => {
 183                     match (first, next!()) {
 184                         (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
 185                         _ => err!(Some(1)),
 186                     }
 187                     if !utf8_is_cont_byte(next!()) {
 188                         err!(Some(2))
 189                     }
 190                     if !utf8_is_cont_byte(next!()) {
 191                         err!(Some(3))
 192                     }
 193                 }
 194                 _ => err!(Some(1)),
 195             }
 196             index += 1;
 197         } else {
 198             // Ascii case, try to skip forward quickly.
 199             // When the pointer is aligned, read 2 words of data per iteration
 200             // until we find a word containing a non-ascii byte.
 201             if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
 202                 let ptr = v.as_ptr();
 203                 while index < blocks_end {
 204                     // SAFETY: since `align - index` and `ascii_block_size` are
 205                     // multiples of `usize_bytes`, `block = ptr.add(index)` is
 206                     // always aligned with a `usize` so it's safe to dereference
 207                     // both `block` and `block.offset(1)`.
 208                     unsafe {
 209                         let block = ptr.add(index) as *const usize;
 210                         // break if there is a nonascii byte
 211                         let zu = contains_nonascii(*block);
 212                         let zv = contains_nonascii(*block.offset(1));
 213                         if zu | zv {
 214                             break;
 215                         }
 216                     }
 217                     index += ascii_block_size;
 218                 }
 219                 // step from the point where the wordwise loop stopped
 220                 while index < len && v[index] < 128 {
 221                     index += 1;
 222                 }
 223             } else {
 224                 index += 1;
 225             }
 226         }
 227     }
 228
 229     Ok(())
 230 }
 231
 232 // https://tools.ietf.org/html/rfc3629
 233 static UTF8_CHAR_WIDTH: [u8; 256] = [
 234     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 235     1, // 0x1F
 236     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 237     1, // 0x3F
 238     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 239     1, // 0x5F
 240     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 241     1, // 0x7F
 242     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 243     0, // 0x9F
 244     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 245     0, // 0xBF
 246     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 247     2, // 0xDF
 248     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF
 249     4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF
 250 ];
 251
 252 /// Given a first byte, determines how many bytes are in this UTF-8 character.
 253 #[unstable(feature = "str_internals", issue = "none")]
 254 #[inline]
 255 pub fn utf8_char_width(b: u8) -> usize {
 256     UTF8_CHAR_WIDTH[b as usize] as usize
 257 }
 258
 259 /// Mask of the value bits of a continuation byte.
 260 const CONT_MASK: u8 = 0b0011_1111;
 261
 262 // truncate `&str` to length at most equal to `max`
 263 // return `true` if it were truncated, and the new str.
 264 pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
 265     if max >= s.len() {
 266         (false, s)
 267     } else {
 268         while !s.is_char_boundary(max) {
 269             max -= 1;
 270         }
 271         (true, &s[..max])
 272     }
 273 }