Respond to review feedback, and improve implementation somewhat

author Thom Chiovoloni <chiovolonit@gmail.com>

Tue, 1 Feb 2022 03:07:08 +0000 (19:07 -0800)

committer Thom Chiovoloni <chiovolonit@gmail.com>

Sat, 5 Feb 2022 19:15:18 +0000 (11:15 -0800)
author Thom Chiovoloni <chiovolonit@gmail.com>
Tue, 1 Feb 2022 03:07:08 +0000 (19:07 -0800)
committer Thom Chiovoloni <chiovolonit@gmail.com>
Sat, 5 Feb 2022 19:15:18 +0000 (11:15 -0800)
diff --git a/library/core/benches/str/char_count.rs b/library/core/benches/str/char_count.rs

index f19d0941142546949422ea9850f7bddfaaa2ec92..25d9b2e299223dc6f8ec681ff699af1128178867 100644 (file)
--- a/library/core/benches/str/char_count.rs
+++ b/library/core/benches/str/char_count.rs
@@ -3,21 +3,25 @@
  
  macro_rules! define_benches {
      ($( fn $name: ident($arg: ident: &str) $body: block )+) => {
+        define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+);
          define_benches!(mod en_small, en::SMALL, $($name $arg $body)+);
          define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+);
          define_benches!(mod en_large, en::LARGE, $($name $arg $body)+);
          define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+);
  
+        define_benches!(mod zh_tiny, zh::TINY, $($name $arg $body)+);
          define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+);
          define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+);
          define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+);
          define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+);
  
+        define_benches!(mod ru_tiny, ru::TINY, $($name $arg $body)+);
          define_benches!(mod ru_small, ru::SMALL, $($name $arg $body)+);
          define_benches!(mod ru_medium, ru::MEDIUM, $($name $arg $body)+);
          define_benches!(mod ru_large, ru::LARGE, $($name $arg $body)+);
          define_benches!(mod ru_huge, ru::HUGE, $($name $arg $body)+);
  
+        define_benches!(mod emoji_tiny, emoji::TINY, $($name $arg $body)+);
          define_benches!(mod emoji_small, emoji::SMALL, $($name $arg $body)+);
          define_benches!(mod emoji_medium, emoji::MEDIUM, $($name $arg $body)+);
          define_benches!(mod emoji_large, emoji::LARGE, $($name $arg $body)+);
@@ -43,12 +47,12 @@ fn $name(bencher: &mut Bencher) {
  }
  
  define_benches! {
-    fn case00_cur_libcore(s: &str) {
-        cur_libcore(s)
+    fn case00_libcore(s: &str) {
+        libcore(s)
      }
  
-    fn case01_old_libcore(s: &str) {
-        old_libcore(s)
+    fn case01_filter_count_cont_bytes(s: &str) {
+        filter_count_cont_bytes(s)
      }
  
      fn case02_iter_increment(s: &str) {
@@ -60,14 +64,16 @@ fn case03_manual_char_len(s: &str) {
      }
  }
  
-fn cur_libcore(s: &str) -> usize {
+fn libcore(s: &str) -> usize {
      s.chars().count()
  }
+
  #[inline]
  fn utf8_is_cont_byte(byte: u8) -> bool {
      (byte as i8) < -64
  }
-fn old_libcore(s: &str) -> usize {
+
+fn filter_count_cont_bytes(s: &str) -> usize {
      s.as_bytes().iter().filter(|&&byte| !utf8_is_cont_byte(byte)).count()
  }
  
diff --git a/library/core/benches/str/corpora.rs b/library/core/benches/str/corpora.rs

index 04e60f0144a029f241af1e722244c5e830a306ea..b4ac625061dfa524bbe7522985fae02032316365 100644 (file)
--- a/library/core/benches/str/corpora.rs
+++ b/library/core/benches/str/corpora.rs
@@ -1,8 +1,9 @@
  //! Exposes a number of modules with different kinds of strings.
  //!
-//! Each module contains `&str` constants named `SMALL`, `MEDIUM`, `LARGE`, and
-//! `HUGE`.
+//! Each module contains `&str` constants named `TINY`, `SMALL`, `MEDIUM`,
+//! `LARGE`, and `HUGE`.
  //!
+//! - The `TINY` string is generally around 8 bytes.
  //! - The `SMALL` string is generally around 30-40 bytes.
  //! - The `MEDIUM` string is generally around 600-700 bytes.
  //! - The `LARGE` string is the `MEDIUM` string repeated 8x, and is around 5kb.
@@ -27,6 +28,7 @@ macro_rules! define_consts {
  }
  
  pub mod en {
+    pub const TINY: &str = "Mary had";
      pub const SMALL: &str = "Mary had a little lamb, Little lamb";
      define_consts! {
          "Rust is blazingly fast and memory-efficient: with no runtime or garbage
@@ -42,6 +44,7 @@ pub mod en {
  }
  
  pub mod zh {
+    pub const TINY: &str = "速度惊";
      pub const SMALL: &str = "速度惊人且内存利用率极高";
      define_consts! {
          "Rust   速度惊人且内存利用率极高。由于\
@@ -59,6 +62,7 @@ pub mod zh {
  }
  
  pub mod ru {
+    pub const TINY: &str = "Сотни";
      pub const SMALL: &str = "Сотни компаний по";
      define_consts! {
          "Сотни компаний по всему миру используют Rust в реальных\
@@ -72,6 +76,7 @@ pub mod ru {
  }
  
  pub mod emoji {
+    pub const TINY: &str = "😀😃";
      pub const SMALL: &str = "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘";
      define_consts! {
          "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘😗☺😚😙🥲😋😛😜🤪😝🤑🤗🤭🤫🤔🤐🤨😐😑😶😶‍🌫️😏😒\
diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs

index 464c6889c323a6a577b08460591c289503a7afb4..a80ebac734d78a9b8caaf46d75c8a48ac7e9cb55 100644 (file)
--- a/library/core/src/str/count.rs
+++ b/library/core/src/str/count.rs
@@ -17,27 +17,57 @@
  //! Note: Because the term "leading byte" can sometimes be ambiguous (for
  //! example, it could also refer to the first byte of a slice), we'll often use
  //! the term "non-continuation byte" to refer to these bytes in the code.
+use core::intrinsics::unlikely;
  
+const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+const UNROLL_INNER: usize = 4;
+
+#[inline]
  pub(super) fn count_chars(s: &str) -> usize {
+    if s.len() < USIZE_SIZE * UNROLL_INNER {
+        // Avoid entering the optimized implementation for strings where the
+        // difference is not likely to matter, or where it might even be slower.
+        // That said, not much thought went into the particular threshold here,
+        // beyond "this value seems to make sense".
+        char_count_general_case(s.as_bytes())
+    } else {
+        do_count_chars(s)
+    }
+}
+
+fn do_count_chars(s: &str) -> usize {
      // For correctness, `CHUNK_SIZE` must be:
+    //
      // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`.
      // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the
      //   `body.chunks(CHUNK_SIZE)` loop is incorrect.
      //
      // For performance, `CHUNK_SIZE` should be:
-    // - Relatively cheap to `%` against.
+    // - Relatively cheap to `/` against (so some simple sum of powers of two).
      // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize`
      //   too often.
      const CHUNK_SIZE: usize = 192;
-    const UNROLL_INNER: usize = 4;
  
-    // Check the properties of `CHUNK_SIZE` / `UNROLL_INNER` that are required
+    // Check the properties of `CHUNK_SIZE` and `UNROLL_INNER` that are required
      // for correctness.
-    const _: [(); 1] = [(); (CHUNK_SIZE < 256 && (CHUNK_SIZE % UNROLL_INNER) == 0) as usize];
+    const _: () = assert!(CHUNK_SIZE < 256);
+    const _: () = assert!(CHUNK_SIZE % UNROLL_INNER == 0);
+
      // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size
      // differences which are handled by `align_to`.
      let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() };
  
+    // This should be quite rare, and basically exists to handle the degenerate
+    // cases where align_to fails (as well as miri under symbolic alignment
+    // mode).
+    //
+    // The `unlikely` helps discourage LLVM from inlining the body, which is
+    // nice, as we would rather not mark the `char_count_general_case` function
+    // as cold.
+    if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) {
+        return char_count_general_case(s.as_bytes());
+    }
+
      let mut total = char_count_general_case(head) + char_count_general_case(tail);
      // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which
      // we call `sum_bytes_in_usize`.
@@ -45,11 +75,8 @@ pub(super) fn count_chars(s: &str) -> usize {
          // We accumulate intermediate sums in `counts`, where each byte contains
          // a subset of the sum of this chunk, like a `[u8; size_of::<usize>()]`.
          let mut counts = 0;
-        let unrolled_chunks = chunk.array_chunks::<UNROLL_INNER>();
-        // If there's a remainder (know can only happen for the last item in
-        // `chunks`, because `CHUNK_SIZE % UNROLL == 0`), then we need to
-        // account for that (although we don't use it to later).
-        let remainder = unrolled_chunks.remainder();
+
+        let (unrolled_chunks, remainder) = chunk.as_chunks::<UNROLL_INNER>();
          for unrolled in unrolled_chunks {
              for &word in unrolled {
                  // Because `CHUNK_SIZE` is < 256, this addition can't cause the
@@ -85,8 +112,8 @@ pub(super) fn count_chars(s: &str) -> usize {
  // true)
  #[inline]
  fn contains_non_continuation_byte(w: usize) -> usize {
-    let lsb = 0x0101_0101_0101_0101u64 as usize;
-    ((!w >> 7) | (w >> 6)) & lsb
+    const LSB: usize = 0x0101_0101_0101_0101u64 as usize;
+    ((!w >> 7) | (w >> 6)) & LSB
  }
  
  // Morally equivalent to `values.to_ne_bytes().into_iter().sum::<usize>()`, but
@@ -97,7 +124,7 @@ fn sum_bytes_in_usize(values: usize) -> usize {
      const SKIP_BYTES: usize = 0x00ff_00ff_00ff_00ff_u64 as usize;
  
      let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES);
-    pair_sum.wrapping_mul(LSB_SHORTS) >> ((core::mem::size_of::<usize>() - 2) * 8)
+    pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8)
  }
  
  // This is the most direct implementation of the concept of "count the number of
@@ -105,12 +132,5 @@ fn sum_bytes_in_usize(values: usize) -> usize {
  // head and tail of the input string (the first and last item in the tuple
  // returned by `slice::align_to`).
  fn char_count_general_case(s: &[u8]) -> usize {
-    const CONT_MASK_U8: u8 = 0b0011_1111;
-    const TAG_CONT_U8: u8 = 0b1000_0000;
-    let mut leads = 0;
-    for &byte in s {
-        let is_lead = (byte & !CONT_MASK_U8) != TAG_CONT_U8;
-        leads += is_lead as usize;
-    }
-    leads
+    s.iter().filter(|&&byte| !super::validations::utf8_is_cont_byte(byte)).count()
  }
author	Thom Chiovoloni <chiovolonit@gmail.com>
	Tue, 1 Feb 2022 03:07:08 +0000 (19:07 -0800)
committer	Thom Chiovoloni <chiovolonit@gmail.com>
	Sat, 5 Feb 2022 19:15:18 +0000 (11:15 -0800)
library/core/benches/str/char_count.rs		patch \| blob \| history
library/core/benches/str/corpora.rs		patch \| blob \| history
library/core/src/str/count.rs		patch \| blob \| history