Add fast path for ASCII in UTF-8 validation

author Ulrik Sverdrup <bluss@users.noreply.github.com>

Wed, 6 Jan 2016 14:43:33 +0000 (15:43 +0100)

committer Ulrik Sverdrup <bluss@users.noreply.github.com>

Tue, 12 Jan 2016 20:57:04 +0000 (21:57 +0100)
author Ulrik Sverdrup <bluss@users.noreply.github.com>
Wed, 6 Jan 2016 14:43:33 +0000 (15:43 +0100)
committer Ulrik Sverdrup <bluss@users.noreply.github.com>
Tue, 12 Jan 2016 20:57:04 +0000 (21:57 +0100)
diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs

index e22ff7ca540610335c22e1b1705d9213c07d1851..ab831de65167a9462a621643fde49033cd2d70a1 100644 (file)
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@@ -470,6 +470,18 @@ fn test_is_utf8() {
      assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
  }
  
+#[test]
+fn from_utf8_mostly_ascii() {
+    // deny invalid bytes embedded in long stretches of ascii
+    for i in 32..64 {
+        let mut data = [0; 128];
+        data[i] = 0xC0;
+        assert!(from_utf8(&data).is_err());
+        data[i] = 0xC2;
+        assert!(from_utf8(&data).is_err());
+    }
+}
+
  #[test]
  fn test_is_utf16() {
      use rustc_unicode::str::is_utf16;
diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs

index 40bd108a91860f9b24b02d3dccd1db8307bebd04..64c21836b000322756bd068a196939c631f0cd8a 100644 (file)
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@@ -32,6 +32,7 @@
  use raw::{Repr, Slice};
  use result::Result::{self, Ok, Err};
  use slice::{self, SliceExt};
+use usize;
  
  pub mod pattern;
  
@@ -240,7 +241,7 @@ pub fn valid_up_to(&self) -> usize { self.valid_up_to }
  /// ```
  #[stable(feature = "rust1", since = "1.0.0")]
  pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
-    try!(run_utf8_validation_iterator(&mut v.iter()));
+    try!(run_utf8_validation(v));
      Ok(unsafe { from_utf8_unchecked(v) })
  }
  
@@ -1074,46 +1075,44 @@ unsafe fn cmp_slice(a: &str, b: &str, len: usize) -> i32 {
  }
  
  /*
-Section: Misc
+Section: UTF-8 validation
  */
  
+// use truncation to fit u64 into usize
+const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
+
+/// Return `true` if any byte in the word `x` is nonascii (>= 128).
+#[inline]
+fn contains_nonascii(x: usize) -> bool {
+    (x & NONASCII_MASK) != 0
+}
+
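-/// Walk through `iter` checking that it's a valid UTF-8 sequence,
-/// returning `true` in that case, or, if it is invalid, `false` with
-/// `iter` reset such that it is pointing at the first byte in the
-/// invalid sequence.
+/// Walk through `v` checking that it's a valid UTF-8 sequence,
+/// returning `Ok(())` in that case, or, if it is invalid, an
+/// `Err(Utf8Error)` whose `valid_up_to` is the offset of the first
+/// byte in the invalid sequence.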
  #[inline(always)]
-fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
-                                -> Result<(), Utf8Error> {
-    let whole = iter.as_slice();
-    loop {
-        // save the current thing we're pointing at.
-        let old = iter.clone();
-
-        // restore the iterator we had at the start of this codepoint.
+fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
+    let mut offset = 0;
+    let len = v.len();
+    while offset < len {
+        let old_offset = offset;
          macro_rules! err { () => {{
-            *iter = old.clone();
              return Err(Utf8Error {
-                valid_up_to: whole.len() - iter.as_slice().len()
+                valid_up_to: old_offset
              })
          }}}
  
-        macro_rules! next { () => {
-            match iter.next() {
-                Some(a) => *a,
-                // we needed data, but there was none: error!
-                None => err!(),
+        macro_rules! next { () => {{
+            offset += 1;
+            // we needed data, but there was none: error!
+            if offset >= len {
+                err!()
              }
-        }}
-
-        let first = match iter.next() {
-            Some(&b) => b,
-            // we're at the end of the iterator and a codepoint
-            // boundary at the same time, so this string is valid.
-            None => return Ok(())
-        };
+            v[offset]
+        }}}
  
-        // ASCII characters are always valid, so only large
-        // bytes need more examination.
+        let first = v[offset];
          if first >= 128 {
              let w = UTF8_CHAR_WIDTH[first as usize];
              let second = next!();
@@ -1156,8 +1155,39 @@ macro_rules! next { () => {
                  }
                  _ => err!()
              }
+            offset += 1;
+        } else {
+            // Ascii case, try to skip forward quickly.
+            let ptr = v.as_ptr();
+            let align = (ptr as usize + offset) & (usize::BYTES - 1);
+            if align == 0 {
+                // Aligned: read 2 words per iteration until one contains a non-ascii byte.
+                // FIXME: `len - 2 * usize::BYTES` underflows for short inputs; check `len` first.
+                while offset <= len - 2 * usize::BYTES {
+                    unsafe {
+                        let u = *(ptr.offset(offset as isize) as *const usize);
+                        let v = *(ptr.offset((offset + usize::BYTES) as isize) as *const usize);
+
+                        // break if there is a nonascii byte
+                        let zu = contains_nonascii(u);
+                        let zv = contains_nonascii(v);
+                        if zu || zv {
+                            break;
+                        }
+                    }
+                    offset += usize::BYTES * 2;
+                }
+                // step from the point where the wordwise loop stopped
+                while offset < len && v[offset] < 128 {
+                    offset += 1;
+                }
+            } else {
+                offset += 1;
+            }
          }
      }
+
+    Ok(())
  }
  
  // https://tools.ietf.org/html/rfc3629
author	Ulrik Sverdrup <bluss@users.noreply.github.com>
	Wed, 6 Jan 2016 14:43:33 +0000 (15:43 +0100)
committer	Ulrik Sverdrup <bluss@users.noreply.github.com>
	Tue, 12 Jan 2016 20:57:04 +0000 (21:57 +0100)
src/libcollectionstest/str.rs		patch \| blob \| history
src/libcore/str/mod.rs		patch \| blob \| history