From: bjorn3 Date: Tue, 30 Jul 2019 12:37:20 +0000 (+0200) Subject: Implement some float simd intrinsics X-Git-Url: https://git.lizzy.rs/?a=commitdiff_plain;h=69526d464fc0ef021beb1718d68035555f30c33d;p=rust.git Implement some float simd intrinsics --- diff --git a/example/std_example.rs b/example/std_example.rs index 8a43af5bd80..33523a12871 100644 --- a/example/std_example.rs +++ b/example/std_example.rs @@ -1,5 +1,6 @@ #![feature(core_intrinsics)] +use std::arch::x86_64::*; use std::io::Write; use std::intrinsics; @@ -52,8 +53,6 @@ fn main() { #[target_feature(enable = "sse2")] unsafe fn test_simd() { - use std::arch::x86_64::*; - let x = _mm_setzero_si128(); let y = _mm_set1_epi16(7); let or = _mm_or_si128(x, y); @@ -67,6 +66,8 @@ unsafe fn test_simd() { test_mm_slli_si128(); test_mm_movemask_epi8(); test_mm256_movemask_epi8(); + test_mm_add_epi8(); + test_mm_add_pd(); let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))); assert_eq!(mask1, 1); @@ -74,8 +75,6 @@ unsafe fn test_simd() { #[target_feature(enable = "sse2")] unsafe fn test_mm_slli_si128() { - use std::arch::x86_64::*; - #[rustfmt::skip] let a = _mm_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, @@ -116,8 +115,6 @@ unsafe fn test_mm_slli_si128() { #[target_feature(enable = "sse2")] unsafe fn test_mm_movemask_epi8() { - use std::arch::x86_64::*; - #[rustfmt::skip] let a = _mm_setr_epi8( 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01, @@ -131,20 +128,48 @@ unsafe fn test_mm_movemask_epi8() { #[target_feature(enable = "avx2")] unsafe fn test_mm256_movemask_epi8() { - use std::arch::x86_64::*; - let a = _mm256_set1_epi8(-1); let r = _mm256_movemask_epi8(a); let e = -1; assert_eq!(r, e); } +#[target_feature(enable = "sse2")] +unsafe fn test_mm_add_epi8() { + let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm_add_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, + ); + assert_eq_m128i(r, e); +} + +#[target_feature(enable = "sse2")] +unsafe fn test_mm_add_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_add_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0)); +} + fn assert_eq_m128i(x: std::arch::x86_64::__m128i, y: std::arch::x86_64::__m128i) { unsafe { assert_eq!(std::mem::transmute::<_, [u8; 16]>(x), std::mem::transmute::<_, [u8; 16]>(x)); } } +#[target_feature(enable = "sse2")] +pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { + if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { + panic!("{:?} != {:?}", a, b); + } +} + #[derive(PartialEq)] enum LoopState { Continue(()), diff --git a/src/constant.rs b/src/constant.rs index 10f757ffce8..c8fb2767f9a 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -88,7 +88,7 @@ pub fn trans_constant<'a, 'tcx: 'a>( } pub fn force_eval_const<'a, 'tcx: 'a>( - fx: &mut FunctionCx<'a, 'tcx, impl Backend>, + fx: &FunctionCx<'a, 'tcx, impl Backend>, const_: &'tcx Const, ) -> &'tcx Const<'tcx> { match const_.val { @@ -422,3 +422,32 @@ fn stack_pop(_: &mut InterpCx<'mir, 'tcx, Self>, _: ()) -> InterpResult<'tcx> { Ok(()) } } + +pub fn mir_operand_get_const_val<'tcx>( + fx: &FunctionCx<'_, 'tcx, impl Backend>, + operand: &Operand<'tcx>, +) -> Result<&'tcx Const<'tcx>, String> { + let place = match operand { + Operand::Copy(place) => place, + Operand::Constant(const_) => return Ok(force_eval_const(fx, const_.literal)), + _ => return Err(format!("{:?}", operand)), + }; + + assert!(place.projection.is_none()); + let static_ = match &place.base { + PlaceBase::Static(static_) => { + static_ + } + PlaceBase::Local(_) => return Err("local".to_string()), + }; + + Ok(match &static_.kind { + StaticKind::Static(_) => unimplemented!(), + StaticKind::Promoted(promoted) => { + fx.tcx.const_eval(ParamEnv::reveal_all().and(GlobalId { + instance: fx.instance, + promoted: Some(*promoted), + })).unwrap() + } + }) +} diff --git a/src/intrinsics.rs b/src/intrinsics.rs index 042a0934709..a456cac1d74 100644 --- a/src/intrinsics.rs +++ b/src/intrinsics.rs @@ -144,7 +144,7 @@ pub fn lane_type_and_count<'tcx>( (lane_layout, lane_count) } -fn simd_for_each_lane<'tcx, B: Backend>( +pub fn simd_for_each_lane<'tcx, B: Backend>( fx: &mut FunctionCx<'_, 'tcx, B>, intrinsic: &str, x: CValue<'tcx>, @@ -170,23 +170,37 @@ fn simd_for_each_lane<'tcx, B: Backend>( } } -fn bool_to_zero_or_max_uint<'tcx>( +pub fn bool_to_zero_or_max_uint<'tcx>( fx: &mut FunctionCx<'_, 'tcx, impl Backend>, layout: TyLayout<'tcx>, val: Value, ) -> CValue<'tcx> { let ty = fx.clif_type(layout.ty).unwrap(); - let zero = fx.bcx.ins().iconst(ty, 0); - let max = fx.bcx.ins().iconst(ty, (u64::max_value() >> (64 - ty.bits())) as i64); - let res = crate::common::codegen_select(&mut fx.bcx, val, max, zero); + let int_ty = match ty { + types::F32 => types::I32, + types::F64 => types::I64, + ty => ty, + }; + + let zero = fx.bcx.ins().iconst(int_ty, 0); + let max = fx.bcx.ins().iconst(int_ty, (u64::max_value() >> (64 - int_ty.bits())) as i64); + let mut res = crate::common::codegen_select(&mut fx.bcx, val, max, zero); + + if ty.is_float() { + res = fx.bcx.ins().bitcast(ty, res); + } + CValue::by_val(res, layout) } macro_rules! simd_cmp { ($fx:expr, $intrinsic:expr, $cc:ident($x:ident, $y:ident) -> $ret:ident) => { - simd_for_each_lane($fx, $intrinsic, $x, $y, $ret, |fx, _lane_layout, res_lane_layout, x_lane, y_lane| { - let res_lane = fx.bcx.ins().icmp(IntCC::$cc, x_lane, y_lane); + simd_for_each_lane($fx, $intrinsic, $x, $y, $ret, |fx, lane_layout, res_lane_layout, x_lane, y_lane| { + let res_lane = match lane_layout.ty.sty { + ty::Uint(_) | ty::Int(_) => fx.bcx.ins().icmp(IntCC::$cc, x_lane, y_lane), + _ => unreachable!("{:?}", lane_layout.ty), + }; bool_to_zero_or_max_uint(fx, res_lane_layout, res_lane) }); }; @@ -203,10 +217,13 @@ macro_rules! simd_cmp { } -macro_rules! simd_binop { +macro_rules! simd_int_binop { ($fx:expr, $intrinsic:expr, $op:ident($x:ident, $y:ident) -> $ret:ident) => { - simd_for_each_lane($fx, $intrinsic, $x, $y, $ret, |fx, _lane_layout, ret_lane_layout, x_lane, y_lane| { - let res_lane = fx.bcx.ins().$op(x_lane, y_lane); + simd_for_each_lane($fx, $intrinsic, $x, $y, $ret, |fx, lane_layout, ret_lane_layout, x_lane, y_lane| { + let res_lane = match lane_layout.ty.sty { + ty::Uint(_) | ty::Int(_) => fx.bcx.ins().$op(x_lane, y_lane), + _ => unreachable!("{:?}", lane_layout.ty), + }; CValue::by_val(res_lane, ret_lane_layout) }); }; @@ -222,6 +239,42 @@ macro_rules! simd_binop { }; } +macro_rules! simd_int_flt_binop { + ($fx:expr, $intrinsic:expr, $op:ident|$op_f:ident($x:ident, $y:ident) -> $ret:ident) => { + simd_for_each_lane($fx, $intrinsic, $x, $y, $ret, |fx, lane_layout, ret_lane_layout, x_lane, y_lane| { + let res_lane = match lane_layout.ty.sty { + ty::Uint(_) | ty::Int(_) => fx.bcx.ins().$op(x_lane, y_lane), + ty::Float(_) => fx.bcx.ins().$op_f(x_lane, y_lane), + _ => unreachable!("{:?}", lane_layout.ty), + }; + CValue::by_val(res_lane, ret_lane_layout) + }); + }; + ($fx:expr, $intrinsic:expr, $op_u:ident|$op_s:ident|$op_f:ident($x:ident, $y:ident) -> $ret:ident) => { + simd_for_each_lane($fx, $intrinsic, $x, $y, $ret, |fx, lane_layout, ret_lane_layout, x_lane, y_lane| { + let res_lane = match lane_layout.ty.sty { + ty::Uint(_) => fx.bcx.ins().$op_u(x_lane, y_lane), + ty::Int(_) => fx.bcx.ins().$op_s(x_lane, y_lane), + ty::Float(_) => fx.bcx.ins().$op_f(x_lane, y_lane), + _ => unreachable!("{:?}", lane_layout.ty), + }; + CValue::by_val(res_lane, ret_lane_layout) + }); + }; +} + +macro_rules! simd_flt_binop { + ($fx:expr, $intrinsic:expr, $op:ident($x:ident, $y:ident) -> $ret:ident) => { + simd_for_each_lane($fx, $intrinsic, $x, $y, $ret, |fx, lane_layout, ret_lane_layout, x_lane, y_lane| { + let res_lane = match lane_layout.ty.sty { + ty::Float(_) => fx.bcx.ins().$op(x_lane, y_lane), + _ => unreachable!("{:?}", lane_layout.ty), + }; + CValue::by_val(res_lane, ret_lane_layout) + }); + } +} + pub fn codegen_intrinsic_call<'a, 'tcx: 'a>( fx: &mut FunctionCx<'a, 'tcx, impl Backend>, def_id: DefId, @@ -840,30 +893,7 @@ fn swap(bcx: &mut FunctionBuilder, v: Value) -> Value { let indexes = { use rustc::mir::interpret::*; - let idx_place = match idx { - Operand::Copy(idx_place) => { - idx_place - } - _ => panic!("simd_shuffle* idx is not Operand::Copy, but {:?}", idx), - }; - - assert!(idx_place.projection.is_none()); - let static_ = match &idx_place.base { - PlaceBase::Static(static_) => { - static_ - } - PlaceBase::Local(_) => panic!("simd_shuffle* idx is not constant, but a local"), - }; - - let idx_const = match &static_.kind { - StaticKind::Static(_) => unimplemented!(), - StaticKind::Promoted(promoted) => { - fx.tcx.const_eval(ParamEnv::reveal_all().and(GlobalId { - instance: fx.instance, - promoted: Some(*promoted), - })).unwrap() - } - }; + let idx_const = crate::constant::mir_operand_get_const_val(fx, idx).expect("simd_shuffle* idx not const"); let idx_bytes = match idx_const.val { ConstValue::ByRef { align: _, offset, alloc } => { @@ -900,41 +930,38 @@ fn swap(bcx: &mut FunctionBuilder, v: Value) -> Value { }; simd_add, (c x, c y) { - simd_binop!(fx, intrinsic, iadd(x, y) -> ret); + simd_int_flt_binop!(fx, intrinsic, iadd|fadd(x, y) -> ret); }; simd_sub, (c x, c y) { - simd_binop!(fx, intrinsic, isub(x, y) -> ret); + simd_int_flt_binop!(fx, intrinsic, isub|fsub(x, y) -> ret); }; simd_mul, (c x, c y) { - simd_binop!(fx, intrinsic, imul(x, y) -> ret); + simd_int_flt_binop!(fx, intrinsic, imul|fmul(x, y) -> ret); }; simd_div, (c x, c y) { - simd_binop!(fx, intrinsic, udiv|sdiv(x, y) -> ret); - }; - simd_rem, (c x, c y) { - simd_binop!(fx, intrinsic, urem|srem(x, y) -> ret); + simd_int_flt_binop!(fx, intrinsic, udiv|sdiv|fdiv(x, y) -> ret); }; simd_shl, (c x, c y) { - simd_binop!(fx, intrinsic, ishl(x, y) -> ret); + simd_int_binop!(fx, intrinsic, ishl(x, y) -> ret); }; simd_shr, (c x, c y) { - simd_binop!(fx, intrinsic, ushr|sshr(x, y) -> ret); + simd_int_binop!(fx, intrinsic, ushr|sshr(x, y) -> ret); }; simd_and, (c x, c y) { - simd_binop!(fx, intrinsic, band(x, y) -> ret); + simd_int_binop!(fx, intrinsic, band(x, y) -> ret); }; simd_or, (c x, c y) { - simd_binop!(fx, intrinsic, bor(x, y) -> ret); + simd_int_binop!(fx, intrinsic, bor(x, y) -> ret); }; simd_xor, (c x, c y) { - simd_binop!(fx, intrinsic, bxor(x, y) -> ret); + simd_int_binop!(fx, intrinsic, bxor(x, y) -> ret); }; simd_fmin, (c x, c y) { - simd_binop!(fx, intrinsic, fmin(x, y) -> ret); + simd_flt_binop!(fx, intrinsic, fmin(x, y) -> ret); }; simd_fmax, (c x, c y) { - simd_binop!(fx, intrinsic, fmax(x, y) -> ret); + simd_flt_binop!(fx, intrinsic, fmax(x, y) -> ret); }; } diff --git a/src/llvm_intrinsics.rs b/src/llvm_intrinsics.rs index b93fa1bdbdf..284bdee52b8 100644 --- a/src/llvm_intrinsics.rs +++ b/src/llvm_intrinsics.rs @@ -1,4 +1,5 @@ use crate::prelude::*; +use crate::intrinsics::*; use rustc::ty::subst::SubstsRef; @@ -26,7 +27,7 @@ pub fn codegen_llvm_intrinsic_call<'a, 'tcx: 'a>( } }; - crate::intrinsics::intrinsic_match! { + intrinsic_match! { fx, intrinsic, substs, args, _ => { fx.tcx.sess.warn(&format!("unsupported llvm intrinsic {}; replacing with trap", intrinsic)); @@ -34,17 +35,28 @@ pub fn codegen_llvm_intrinsic_call<'a, 'tcx: 'a>( }; // Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8` - llvm.x86.sse2.pmovmskb.128 | llvm.x86.avx2.pmovmskb, (c a) { - let (lane_layout, lane_count) = crate::intrinsics::lane_type_and_count(fx, a.layout(), intrinsic); - assert_eq!(lane_layout.ty.sty, fx.tcx.types.i8.sty); - assert!(lane_count == 16 || lane_count == 32); + llvm.x86.sse2.pmovmskb.128 | llvm.x86.avx2.pmovmskb | llvm.x86.sse2.movmsk.pd, (c a) { + let (lane_layout, lane_count) = lane_type_and_count(fx, a.layout(), intrinsic); + let lane_ty = fx.clif_type(lane_layout.ty).unwrap(); + assert!(lane_count <= 32); let mut res = fx.bcx.ins().iconst(types::I32, 0); for lane in (0..lane_count).rev() { let a_lane = a.value_field(fx, mir::Field::new(lane.try_into().unwrap())).load_scalar(fx); - let a_lane_sign = fx.bcx.ins().ushr_imm(a_lane, 7); // extract sign bit of 8bit int - let a_lane_sign = fx.bcx.ins().uextend(types::I32, a_lane_sign); + + // cast float to int + let a_lane = match lane_ty { + types::F32 => fx.bcx.ins().bitcast(types::I32, a_lane), + types::F64 => fx.bcx.ins().bitcast(types::I64, a_lane), + _ => a_lane, + }; + + // extract sign bit of an int + let a_lane_sign = fx.bcx.ins().ushr_imm(a_lane, i64::from(lane_ty.bits() - 1)); + + // shift sign bit into result + let a_lane_sign = clif_intcast(fx, a_lane_sign, types::I32, false); res = fx.bcx.ins().ishl_imm(res, 1); res = fx.bcx.ins().bor(res, a_lane_sign); } @@ -52,6 +64,36 @@ pub fn codegen_llvm_intrinsic_call<'a, 'tcx: 'a>( let res = CValue::by_val(res, fx.layout_of(fx.tcx.types.i32)); ret.write_cvalue(fx, res); }; + llvm.x86.sse2.cmp.ps | llvm.x86.sse2.cmp.pd, (c x, c y, o kind) { + let kind_const = crate::constant::mir_operand_get_const_val(fx, kind).expect("llvm.x86.sse2.cmp.* kind not const"); + let flt_cc = match kind_const.val.try_to_bits(Size::from_bytes(1)).expect(&format!("kind not scalar: {:?}", kind_const)) { + 0 => FloatCC::Equal, + 1 => FloatCC::LessThan, + 2 => FloatCC::LessThanOrEqual, + 7 => { + unimplemented!("Compares corresponding elements in `a` and `b` to see if neither is `NaN`."); + } + 3 => { + unimplemented!("Compares corresponding elements in `a` and `b` to see if either is `NaN`."); + } + 4 => FloatCC::NotEqual, + 5 => { + unimplemented!("not less than"); + } + 6 => { + unimplemented!("not less than or equal"); + } + kind => unreachable!("kind {:?}", kind), + }; + + simd_for_each_lane(fx, intrinsic, x, y, ret, |fx, lane_layout, res_lane_layout, x_lane, y_lane| { + let res_lane = match lane_layout.ty.sty { + ty::Float(_) => fx.bcx.ins().fcmp(flt_cc, x_lane, y_lane), + _ => unreachable!("{:?}", lane_layout.ty), + }; + bool_to_zero_or_max_uint(fx, res_lane_layout, res_lane) + }); + }; } if let Some((_, dest)) = destination {