#[inline(always)]
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
- let mut i = 0u;
- let total = v.len();
- fn unsafe_get(xs: &[u8], i: uint) -> u8 {
- unsafe { *xs.unsafe_ref(i) }
- }
- while i < total {
- let v_i = unsafe_get(v, i);
- if v_i < 128u8 {
- i += 1u;
- } else {
- let w = utf8_char_width(v_i);
- if w == 0u { return Some(i); }
-
- let nexti = i + w;
- if nexti > total { return Some(i); }
+ let mut it = v.iter();
- // 2-byte encoding is for codepoints \u0080 to \u07ff
- // first C2 80 last DF BF
- // 3-byte encoding is for codepoints \u0800 to \uffff
- // first E0 A0 80 last EF BF BF
- // excluding surrogates codepoints \ud800 to \udfff
- // ED A0 80 to ED BF BF
- // 4-byte encoding is for codepoints \u10000 to \u10ffff
- // first F0 90 80 80 last F4 8F BF BF
- //
- // Use the UTF-8 syntax from the RFC
- //
- // https://tools.ietf.org/html/rfc3629
- // UTF8-1 = %x00-7F
- // UTF8-2 = %xC2-DF UTF8-tail
- // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
- // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
- // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
- // %xF4 %x80-8F 2( UTF8-tail )
- // UTF8-tail = %x80-BF
- match w {
- 2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
- return Some(i)
- },
- 3 => match (v_i,
- unsafe_get(v, i + 1),
- unsafe_get(v, i + 2) & 192u8) {
- (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
- (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
- (0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
- (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
- _ => return Some(i),
- },
- _ => match (v_i,
- unsafe_get(v, i + 1),
- unsafe_get(v, i + 2) & 192u8,
- unsafe_get(v, i + 3) & 192u8) {
- (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
- (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
- (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
- _ => return Some(i)
- },
- }
-
- i = nexti;
- }
+ let ok = run_utf8_validation_iterator(&mut it);
+ if ok {
+ None
+ } else {
+ // work out how many valid bytes we've consumed
+ // (run_utf8_validation_iterator resets the iterator to just
+ // after the last good byte), which we can do because the
+ // vector iterator size_hint is exact.
+ let (remaining, _) = it.size_hint();
+ Some(v.len() - remaining)
}
- None
}
/// Determines if a vector of `u16` contains valid UTF-16