// understanding `mem::replace`, `Option::take`, etc. - a better overall
// solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
// a backend can choose to implement using the block optimization, or not.
- #[cfg(not(target_arch = "spirv"))]
+ // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+ // pessimization for it. Also, if the type contains any unaligned pointers,
+ // copying those over multiple reads is difficult to support.
+ #[cfg(not(any(target_arch = "spirv", miri)))]
{
// For types that are larger multiples of their alignment, the simple way
// tends to copy the whole thing to stack rather than doing it one part
// at a time, so instead treat them as one-element slices and re-use
// the slice optimizations in `swap_nonoverlapping`.
if mem::size_of::<T>() / mem::align_of::<T>() > 4 {
    // SAFETY: exclusive references always point to one non-overlapping
    // element and are non-null and properly aligned.
    return unsafe { ptr::swap_nonoverlapping(x, y, 1) };
}
}
swap_simple(x, y);
}
#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
#[inline]
pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
+ // We arrange for this to typically be called with small types,
+ // so this reads-and-writes approach is actually better than using
+ // copy_nonoverlapping as it easily puts things in LLVM registers
+ // directly and doesn't end up inlining allocas.
+ // And LLVM actually optimizes it to 3×memcpy if called with
+ // a type larger than it's willing to keep in a register.
+ // Having typed reads and writes in MIR here is also good as
+ // it lets MIRI and CTFE understand them better, including things
+ // like enforcing type validity for them.
+ // Importantly, read+copy_nonoverlapping+write introduces confusing
+ // asymmetry to the behaviour where one value went through read+write
+ // whereas the other was copied over by the intrinsic (see #94371).
+
// SAFETY: exclusive references are always valid to read/write,
- // are non-overlapping, and nothing here panics so it's drop-safe.
+ // including being aligned, and nothing here panics so it's drop-safe.
unsafe {
- let z = ptr::read(x);
- ptr::copy_nonoverlapping(y, x, 1);
- ptr::write(y, z);
+ let a = ptr::read(x);
+ let b = ptr::read(y);
+ ptr::write(x, b);
+ ptr::write(y, a);
}
}
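+ // `swap_nonoverlapping` below bottoms out in this element-by-element loop,
+ // both from its chunked fast path (after retyping as `MaybeUninit` chunks)
+ // and as its final fallback. A sketch of that helper: a plain `while` loop
+ // keeps it const-evaluable, and LLVM can vectorize it for the
+ // power-of-two-sized chunk types the fast path uses.
+ #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+ #[inline]
+ const unsafe fn swap_nonoverlapping_simple<T>(x: *mut T, y: *mut T, count: usize) {
+     let mut i = 0;
+     while i < count {
+         // SAFETY: `i < count` keeps both elements in-bounds, and the two
+         // ranges are non-overlapping, so the `&mut`s cannot alias.
+         let x: &mut T = unsafe { &mut *x.add(i) };
+         let y: &mut T = unsafe { &mut *y.add(i) };
+         swap_simple(x, y);
+         i += 1;
+     }
+ }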
#[stable(feature = "swap_nonoverlapping", since = "1.27.0")]
#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+ #[allow(unused)]
macro_rules! attempt_swap_as_chunks {
    ($ChunkTy:ty) => {
        if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
            && mem::size_of::<T>() % mem::size_of::<$ChunkTy>() == 0
        {
            // SAFETY: same bytes the caller promised were valid, just viewed
            // as `MaybeUninit<$ChunkTy>`s; the `if` above guarantees alignment
            // still holds and that the division is exact.
            let x: *mut MaybeUninit<$ChunkTy> = x.cast();
            let y: *mut MaybeUninit<$ChunkTy> = y.cast();
            let count = count * (mem::size_of::<T>() / mem::size_of::<$ChunkTy>());
            return unsafe { swap_nonoverlapping_simple(x, y, count) };
        }
    };
}
- // Split up the slice into small power-of-two-sized chunks that LLVM is able
- // to vectorize (unless it's a special type with more-than-pointer alignment,
- // because we don't want to pessimize things like slices of SIMD vectors.)
- if mem::align_of::<T>() <= mem::size_of::<usize>()
- && (!mem::size_of::<T>().is_power_of_two()
- || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+ // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+ // pessimization for it. Also, if the type contains any unaligned pointers,
+ // copying those over multiple reads is difficult to support.
+ #[cfg(not(miri))]
{
- attempt_swap_as_chunks!(usize);
- attempt_swap_as_chunks!(u8);
+ // Split up the slice into small power-of-two-sized chunks that LLVM is able
+ // to vectorize (unless it's a special type with more-than-pointer alignment,
+ // because we don't want to pessimize things like slices of SIMD vectors.)
+ if mem::align_of::<T>() <= mem::size_of::<usize>()
+ && (!mem::size_of::<T>().is_power_of_two()
+ || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+ {
+ attempt_swap_as_chunks!(usize);
+ attempt_swap_as_chunks!(u8);
+ }
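+    // For example, on a 64-bit target: `[[u64; 5]; 5]` (align 8, size 200,
+    // not a power of two) gets swapped as usize chunks, and `[u8; 1024]`
+    // (align 1, size 1024 > 2*8) as bytes that LLVM then vectorizes, while
+    // `u64` (size 8, a power of two that fits in a register) skips the
+    // chunked path entirely.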
}
// SAFETY: Same preconditions as this function, so it's fine to call
unsafe { swap_nonoverlapping_simple(x, y, count) }
}
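+// Illustrative usage (not part of the change): the chunking above is
+// invisible to callers; the usual safety contract is all that's required.
+//
+//     let mut a = [0u64; 32];
+//     let mut b = [1u64; 32];
+//     // SAFETY: two distinct locals are valid, aligned, and cannot overlap.
+//     unsafe { ptr::swap_nonoverlapping(a.as_mut_ptr(), b.as_mut_ptr(), 32) };
+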
+// Verify that types with usize alignment are swapped via vectored usizes,
+// not falling back to byte-level code.
+
+// A 5×5 Keccak state of u64 lanes: usize-aligned, size a multiple of usize.
type KeccakBuffer = [[u64; 5]; 5];
+
// CHECK-LABEL: @swap_slice
#[no_mangle]
pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i64>
    // CHECK: store <{{[0-9]+}} x i64>
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}
+// But for a large align-1 type, vectorized byte copying is what we want.
+
type OneKilobyteBuffer = [u8; 1024];
// CHECK-LABEL: @swap_1kb_slices
#[no_mangle]
pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i8>
    // CHECK: store <{{[0-9]+}} x i8>
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}
+
+// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
+// for an unusual type like this. It's not clear whether we should do anything
+// smarter in Rust for these, so for now it's fine to leave these up to the backend.
+// That's not as bad as it might seem: LLVM, for example, will lower the
+// memcpys below to VMOVAPS on YMMs if the AVX target feature is enabled.
+// Eventually we'll be able to pass `align_of::<T>()` to a const generic and
+// thus pick a smarter chunk size ourselves without huge code duplication.
+
+#[repr(align(64))]
+pub struct BigButHighlyAligned([u8; 64 * 3]);
+
+// CHECK-LABEL: @swap_big_aligned
+#[no_mangle]
+pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
+    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: call void @llvm.memcpy
+    swap(x, y)
+}