git.lizzy.rs Git - rust.git/commitdiff
std: convert first_non_utf8_index to use the iterator.
author: Huon Wilson <dbau.pp+github@gmail.com>
Sun, 16 Feb 2014 06:12:47 +0000 (17:12 +1100)
committer: Huon Wilson <dbau.pp+github@gmail.com>
Tue, 18 Feb 2014 10:55:53 +0000 (21:55 +1100)
This makes it very slightly faster, especially when the string is valid
UTF-8, and completely removes the use of `unsafe` from the first half.

Before:

    from_utf8_lossy_100_ascii              ... bench:       151 ns/iter (+/- 17)
    from_utf8_lossy_100_invalid            ... bench:       447 ns/iter (+/- 33)
    from_utf8_lossy_100_multibyte          ... bench:       135 ns/iter (+/- 4)
    from_utf8_lossy_invalid                ... bench:       124 ns/iter (+/- 10)

After:

    from_utf8_lossy_100_ascii              ... bench:       119 ns/iter (+/- 8)
    from_utf8_lossy_100_invalid            ... bench:       454 ns/iter (+/- 16)
    from_utf8_lossy_100_multibyte          ... bench:       116 ns/iter (+/- 9)
    from_utf8_lossy_invalid                ... bench:       119 ns/iter (+/- 9)

src/libstd/str.rs

index 570df45e08070a0360555d5c5d058cc09baa7127..a780a912d4df3cfa1d954a84a265d6f0f1d2fa3a 100644 (file)
@@ -813,69 +813,19 @@ pub fn is_utf8(v: &[u8]) -> bool {
 
 #[inline(always)]
 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
-    let mut i = 0u;
-    let total = v.len();
-    fn unsafe_get(xs: &[u8], i: uint) -> u8 {
-        unsafe { *xs.unsafe_ref(i) }
-    }
-    while i < total {
-        let v_i = unsafe_get(v, i);
-        if v_i < 128u8 {
-            i += 1u;
-        } else {
-            let w = utf8_char_width(v_i);
-            if w == 0u { return Some(i); }
-
-            let nexti = i + w;
-            if nexti > total { return Some(i); }
+    let mut it = v.iter();
 
-            // 2-byte encoding is for codepoints  \u0080 to  \u07ff
-            //        first  C2 80        last DF BF
-            // 3-byte encoding is for codepoints  \u0800 to  \uffff
-            //        first  E0 A0 80     last EF BF BF
-            //   excluding surrogates codepoints  \ud800 to  \udfff
-            //               ED A0 80 to       ED BF BF
-            // 4-byte encoding is for codepoints \u10000 to \u10ffff
-            //        first  F0 90 80 80  last F4 8F BF BF
-            //
-            // Use the UTF-8 syntax from the RFC
-            //
-            // https://tools.ietf.org/html/rfc3629
-            // UTF8-1      = %x00-7F
-            // UTF8-2      = %xC2-DF UTF8-tail
-            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
-            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
-            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
-            //               %xF4 %x80-8F 2( UTF8-tail )
-            // UTF8-tail   = %x80-BF
-            match w {
-                2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
-                    return Some(i)
-                },
-                3 => match (v_i,
-                            unsafe_get(v, i + 1),
-                            unsafe_get(v, i + 2) & 192u8) {
-                    (0xE0        , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
-                    (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
-                    (0xED        , 0x80 .. 0x9F, TAG_CONT_U8) => (),
-                    (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
-                    _ => return Some(i),
-                },
-                _ => match (v_i,
-                            unsafe_get(v, i + 1),
-                            unsafe_get(v, i + 2) & 192u8,
-                            unsafe_get(v, i + 3) & 192u8) {
-                    (0xF0        , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
-                    (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
-                    (0xF4        , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
-                    _ => return Some(i)
-                },
-            }
-
-            i = nexti;
-        }
+    let ok = run_utf8_validation_iterator(&mut it);
+    if ok {
+        None
+    } else {
+        // work out how many valid bytes we've consumed
+        // (run_utf8_validation_iterator resets the iterator to just
+        // after the last good byte), which we can do because the
+        // vector iterator size_hint is exact.
+        let (remaining, _) = it.size_hint();
+        Some(v.len() - remaining)
     }
-    None
 }
 
 /// Determines if a vector of `u16` contains valid UTF-16