// understanding `mem::replace`, `Option::take`, etc. - a better overall
// solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
// a backend can choose to implement using the block optimization, or not.
- #[cfg(not(target_arch = "spirv"))]
+ // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+ // pessimization for it. Also, if the type contains any unaligned pointers,
+ // copying those over multiple reads is difficult to support.
+ #[cfg(not(any(target_arch = "spirv", miri)))]
{
// For types that are larger multiples of their alignment, the simple way
// tends to copy the whole thing to stack rather than doing it one part
// at a time, so instead treat them as one-element slices and re-use
// the slice optimizations in `swap_nonoverlapping`.
if mem::size_of::<T>() / mem::align_of::<T>() > 4 {
    // SAFETY: exclusive references always point to one non-overlapping
    // element and are non-null and properly aligned.
    return unsafe { ptr::swap_nonoverlapping(x, y, 1) };
}
}
swap_simple(x, y);
}
#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
#[inline]
pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
+ // We arrange for this to typically be called with small types,
+ // so this reads-and-writes approach is actually better than using
+ // copy_nonoverlapping as it easily puts things in LLVM registers
+ // directly and doesn't end up inlining allocas.
+ // And LLVM actually optimizes it to 3×memcpy if called with
+ // a type larger than it's willing to keep in a register.
+ // Having typed reads and writes in MIR here is also good as
+ // it lets MIRI and CTFE understand them better, including things
+ // like enforcing type validity for them.
+ // Importantly, read+copy_nonoverlapping+write introduces confusing
+ // asymmetry to the behaviour where one value went through read+write
+ // whereas the other was copied over by the intrinsic (see #94371).
+
// SAFETY: exclusive references are always valid to read/write,
- // are non-overlapping, and nothing here panics so it's drop-safe.
+ // including being aligned, and nothing here panics so it's drop-safe.
unsafe {
- let z = ptr::read(x);
- ptr::copy_nonoverlapping(y, x, 1);
- ptr::write(y, z);
+ let a = ptr::read(x);
+ let b = ptr::read(y);
+ ptr::write(x, b);
+ ptr::write(y, a);
}
}
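+ // `swap_nonoverlapping` below bottoms out in this element-by-element loop,
+ // both from its chunked fast path (after retyping as `MaybeUninit` chunks)
+ // and as its final fallback. A sketch of that helper: a plain `while` loop
+ // keeps it const-evaluable, and LLVM can vectorize it for the
+ // power-of-two-sized chunk types the fast path uses.
+ #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+ #[inline]
+ const unsafe fn swap_nonoverlapping_simple<T>(x: *mut T, y: *mut T, count: usize) {
+     let mut i = 0;
+     while i < count {
+         // SAFETY: `i < count` keeps both elements in-bounds, and the two
+         // ranges are non-overlapping, so the `&mut`s cannot alias.
+         let x: &mut T = unsafe { &mut *x.add(i) };
+         let y: &mut T = unsafe { &mut *y.add(i) };
+         swap_simple(x, y);
+         i += 1;
+     }
+ }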
#[stable(feature = "swap_nonoverlapping", since = "1.27.0")]
#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+ #[allow(unused)]
macro_rules! attempt_swap_as_chunks {
    ($ChunkTy:ty) => {
        if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
            && mem::size_of::<T>() % mem::size_of::<$ChunkTy>() == 0
        {
            // SAFETY: same bytes the caller promised were valid, just viewed
            // as `MaybeUninit<$ChunkTy>`s; the `if` above guarantees alignment
            // still holds and that the division is exact.
            let x: *mut MaybeUninit<$ChunkTy> = x.cast();
            let y: *mut MaybeUninit<$ChunkTy> = y.cast();
            let count = count * (mem::size_of::<T>() / mem::size_of::<$ChunkTy>());
            return unsafe { swap_nonoverlapping_simple(x, y, count) };
        }
    };
}
- // Split up the slice into small power-of-two-sized chunks that LLVM is able
- // to vectorize (unless it's a special type with more-than-pointer alignment,
- // because we don't want to pessimize things like slices of SIMD vectors.)
- if mem::align_of::<T>() <= mem::size_of::<usize>()
- && (!mem::size_of::<T>().is_power_of_two()
- || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+ // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+ // pessimization for it. Also, if the type contains any unaligned pointers,
+ // copying those over multiple reads is difficult to support.
+ #[cfg(not(miri))]
{
- attempt_swap_as_chunks!(usize);
- attempt_swap_as_chunks!(u8);
+ // Split up the slice into small power-of-two-sized chunks that LLVM is able
+ // to vectorize (unless it's a special type with more-than-pointer alignment,
+ // because we don't want to pessimize things like slices of SIMD vectors.)
+ if mem::align_of::<T>() <= mem::size_of::<usize>()
+ && (!mem::size_of::<T>().is_power_of_two()
+ || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+ {
+ attempt_swap_as_chunks!(usize);
+ attempt_swap_as_chunks!(u8);
+ }
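+    // For example, on a 64-bit target: `[[u64; 5]; 5]` (align 8, size 200,
+    // not a power of two) gets swapped as usize chunks, and `[u8; 1024]`
+    // (align 1, size 1024 > 2*8) as bytes that LLVM then vectorizes, while
+    // `u64` (size 8, a power of two that fits in a register) skips the
+    // chunked path entirely.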
}
// SAFETY: Same preconditions as this function, so it's fine to call
unsafe { swap_nonoverlapping_simple(x, y, count) }
}
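+// Illustrative usage (not part of the change): the chunking above is
+// invisible to callers; the usual safety contract is all that's required.
+//
+//     let mut a = [0u64; 32];
+//     let mut b = [1u64; 32];
+//     // SAFETY: two distinct locals are valid, aligned, and cannot overlap.
+//     unsafe { ptr::swap_nonoverlapping(a.as_mut_ptr(), b.as_mut_ptr(), 32) };
+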
+// Verify that types with usize alignment are swapped via vectored usizes,
+// not falling back to byte-level code.
+
+// A 5×5 Keccak state of u64 lanes: usize-aligned, size a multiple of usize.
type KeccakBuffer = [[u64; 5]; 5];
+
// CHECK-LABEL: @swap_slice
#[no_mangle]
pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i64>
    // CHECK: store <{{[0-9]+}} x i64>
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}
+// But for a large align-1 type, vectorized byte copying is what we want.
+
type OneKilobyteBuffer = [u8; 1024];
// CHECK-LABEL: @swap_1kb_slices
#[no_mangle]
pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i8>
    // CHECK: store <{{[0-9]+}} x i8>
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}
+
+// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
+// for an unusual type like this. It's not clear whether we should do anything
+// smarter in Rust for these, so for now it's fine to leave these up to the backend.
+// That's not as bad as it might seem: LLVM, for example, will lower the
+// memcpys below to VMOVAPS on YMMs if the AVX target feature is enabled.
+// Eventually we'll be able to pass `align_of::<T>()` to a const generic and
+// thus pick a smarter chunk size ourselves without huge code duplication.
+
+#[repr(align(64))]
+pub struct BigButHighlyAligned([u8; 64 * 3]);
+
+// CHECK-LABEL: @swap_big_aligned
+#[no_mangle]
+pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
+    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: call void @llvm.memcpy
+    swap(x, y)
+}