1//! Emulate x86 LLVM intrinsics23use rustc_ast::ast::{InlineAsmOptions, InlineAsmTemplatePiece};4use rustc_target::asm::*;56use crate::inline_asm::{CInlineAsmOperand, codegen_inline_asm_inner};7use crate::intrinsics::*;8use crate::prelude::*;910pub(super) fn codegen_x86_llvm_intrinsic_call<'tcx>(11 fx: &mut FunctionCx<'_, '_, 'tcx>,12 intrinsic: &str,13 args: &[Spanned<mir::Operand<'tcx>>],14 ret: CPlace<'tcx>,15 target: Option<BasicBlock>,16 span: Span,17) {18 match intrinsic {19 "llvm.x86.sse2.pause" | "llvm.aarch64.isb" => {20 // Spin loop hint21 }2223 "llvm.x86.avx.vzeroupper" => {24 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper&ig_expand=721825 // Do nothing. It is a perf hint anyway.26 }2728 // Used by is_x86_feature_detected!();29 "llvm.x86.xgetbv" => {30 intrinsic_args!(fx, args => (xcr_no); intrinsic);3132 let xcr_no = xcr_no.load_scalar(fx);3334 codegen_inline_asm_inner(35 fx,36 &[InlineAsmTemplatePiece::String(37 "38 xgetbv39 // out = rdx << 32 | rax40 shl rdx, 3241 or rax, rdx42 "43 .into(),44 )],45 &[46 CInlineAsmOperand::In {47 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),48 value: xcr_no,49 },50 CInlineAsmOperand::Out {51 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),52 late: true,53 place: Some(ret),54 },55 CInlineAsmOperand::Out {56 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),57 late: true,58 place: None,59 },60 ],61 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,62 );63 }6465 "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => {66 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128&ig_expand=400967 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256&ig_expand=401068 intrinsic_args!(fx, args => (ptr); intrinsic);6970 // FIXME correctly handle unalignedness71 let val = 
CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), ret.layout());72 ret.write_cvalue(fx, val);73 }7475 "llvm.x86.avx2.gather.d.d"76 | "llvm.x86.avx2.gather.d.q"77 | "llvm.x86.avx2.gather.d.ps"78 | "llvm.x86.avx2.gather.d.pd"79 | "llvm.x86.avx2.gather.d.d.256"80 | "llvm.x86.avx2.gather.d.q.256"81 | "llvm.x86.avx2.gather.d.ps.256"82 | "llvm.x86.avx2.gather.d.pd.256"83 | "llvm.x86.avx2.gather.q.d"84 | "llvm.x86.avx2.gather.q.q"85 | "llvm.x86.avx2.gather.q.ps"86 | "llvm.x86.avx2.gather.q.pd"87 | "llvm.x86.avx2.gather.q.d.256"88 | "llvm.x86.avx2.gather.q.q.256"89 | "llvm.x86.avx2.gather.q.ps.256"90 | "llvm.x86.avx2.gather.q.pd.256" => {91 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=381892 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd&ig_expand=381993 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd&ig_expand=382194 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd&ig_expand=382295 // ...9697 intrinsic_args!(fx, args => (src, ptr, index, mask, scale); intrinsic);9899 let (src_lane_count, src_lane_ty) = src.layout().ty.simd_size_and_type(fx.tcx);100 let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx);101 let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx);102 let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);103 assert_eq!(src_lane_ty, ret_lane_ty);104 assert!(index_lane_ty.is_integral());105 assert_eq!(src_lane_count, mask_lane_count);106 assert_eq!(src_lane_count, ret_lane_count);107108 let lane_clif_ty = fx.clif_type(ret_lane_ty).unwrap();109 let index_lane_clif_ty = fx.clif_type(index_lane_ty).unwrap();110 let mask_lane_clif_ty = fx.clif_type(mask_lane_ty).unwrap();111 let ret_lane_layout = fx.layout_of(ret_lane_ty);112113 let ptr = 
ptr.load_scalar(fx);114 let scale = scale.load_scalar(fx);115 let scale = fx.bcx.ins().uextend(types::I64, scale);116 for lane_idx in 0..std::cmp::min(src_lane_count, index_lane_count) {117 let src_lane = src.value_lane(fx, lane_idx).load_scalar(fx);118 let index_lane = index.value_lane(fx, lane_idx).load_scalar(fx);119 let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx);120 let mask_lane =121 fx.bcx.ins().bitcast(mask_lane_clif_ty.as_int(), MemFlags::new(), mask_lane);122123 let if_enabled = fx.bcx.create_block();124 let if_disabled = fx.bcx.create_block();125 let next = fx.bcx.create_block();126 let res_lane = fx.bcx.append_block_param(next, lane_clif_ty);127128 let mask_lane = match mask_lane_clif_ty {129 types::I32 | types::F32 => {130 fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64)131 }132 types::I64 | types::F64 => {133 fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64)134 }135 _ => unreachable!(),136 };137 fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]);138 fx.bcx.seal_block(if_enabled);139 fx.bcx.seal_block(if_disabled);140141 fx.bcx.switch_to_block(if_enabled);142 let index_lane = if index_lane_clif_ty != types::I64 {143 fx.bcx.ins().sextend(types::I64, index_lane)144 } else {145 index_lane146 };147 let offset = fx.bcx.ins().imul(index_lane, scale);148 let lane_ptr = fx.bcx.ins().iadd(ptr, offset);149 let res = fx.bcx.ins().load(lane_clif_ty, MemFlags::trusted(), lane_ptr, 0);150 fx.bcx.ins().jump(next, &[res.into()]);151152 fx.bcx.switch_to_block(if_disabled);153 fx.bcx.ins().jump(next, &[src_lane.into()]);154155 fx.bcx.seal_block(next);156 fx.bcx.switch_to_block(next);157158 fx.bcx.ins().nop();159160 ret.place_lane(fx, lane_idx)161 .write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout));162 }163164 for lane_idx in std::cmp::min(src_lane_count, index_lane_count)..ret_lane_count {165 let zero_lane = fx.bcx.ins().iconst(mask_lane_clif_ty.as_int(), 0);166 let zero_lane = 
fx.bcx.ins().bitcast(mask_lane_clif_ty, MemFlags::new(), zero_lane);167 ret.place_lane(fx, lane_idx)168 .write_cvalue(fx, CValue::by_val(zero_lane, ret_lane_layout));169 }170 }171172 "llvm.x86.sse.max.ps" => {173 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps&ig_expand=4357174 intrinsic_args!(fx, args => (a, b); intrinsic);175176 simd_pair_for_each_lane(177 fx,178 a,179 b,180 ret,181 &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| fx.bcx.ins().fmax(a_lane, b_lane),182 );183 }184185 "llvm.x86.sse.min.ps" => {186 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps&ig_expand=4489187 intrinsic_args!(fx, args => (a, b); intrinsic);188189 simd_pair_for_each_lane(190 fx,191 a,192 b,193 ret,194 &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| fx.bcx.ins().fmin(a_lane, b_lane),195 );196 }197198 "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {199 let (x, y, kind) = match args {200 [x, y, kind] => (x, y, kind),201 _ => bug!("wrong number of args for intrinsic {intrinsic}"),202 };203 let x = codegen_operand(fx, &x.node);204 let y = codegen_operand(fx, &y.node);205 let kind = if let Some(const_) = kind.node.constant() {206 crate::constant::eval_mir_constant(fx, const_).0207 } else {208 unreachable!("{kind:?}")209 };210211 let flt_cc = match kind212 .try_to_bits(Size::from_bytes(1))213 .unwrap_or_else(|| panic!("kind not scalar: {:?}", kind))214 .try_into()215 .unwrap()216 {217 _CMP_EQ_OQ | _CMP_EQ_OS => FloatCC::Equal,218 _CMP_LT_OS | _CMP_LT_OQ => FloatCC::LessThan,219 _CMP_LE_OS | _CMP_LE_OQ => FloatCC::LessThanOrEqual,220 _CMP_UNORD_Q | _CMP_UNORD_S => FloatCC::Unordered,221 _CMP_NEQ_UQ | _CMP_NEQ_US => FloatCC::NotEqual,222 _CMP_NLT_US | _CMP_NLT_UQ => FloatCC::UnorderedOrGreaterThanOrEqual,223 _CMP_NLE_US | _CMP_NLE_UQ => FloatCC::UnorderedOrGreaterThan,224 _CMP_ORD_Q | _CMP_ORD_S => FloatCC::Ordered,225 _CMP_EQ_UQ | _CMP_EQ_US => FloatCC::UnorderedOrEqual,226 _CMP_NGE_US | _CMP_NGE_UQ => 
FloatCC::UnorderedOrLessThan,227 _CMP_NGT_US | _CMP_NGT_UQ => FloatCC::UnorderedOrLessThanOrEqual,228 _CMP_FALSE_OQ | _CMP_FALSE_OS => todo!(),229 _CMP_NEQ_OQ | _CMP_NEQ_OS => FloatCC::OrderedNotEqual,230 _CMP_GE_OS | _CMP_GE_OQ => FloatCC::GreaterThanOrEqual,231 _CMP_GT_OS | _CMP_GT_OQ => FloatCC::GreaterThan,232 _CMP_TRUE_UQ | _CMP_TRUE_US => todo!(),233234 kind => unreachable!("kind {:?}", kind),235 };236237 // Copied from stdarch238 /// Equal (ordered, non-signaling)239 const _CMP_EQ_OQ: i32 = 0x00;240 /// Less-than (ordered, signaling)241 const _CMP_LT_OS: i32 = 0x01;242 /// Less-than-or-equal (ordered, signaling)243 const _CMP_LE_OS: i32 = 0x02;244 /// Unordered (non-signaling)245 const _CMP_UNORD_Q: i32 = 0x03;246 /// Not-equal (unordered, non-signaling)247 const _CMP_NEQ_UQ: i32 = 0x04;248 /// Not-less-than (unordered, signaling)249 const _CMP_NLT_US: i32 = 0x05;250 /// Not-less-than-or-equal (unordered, signaling)251 const _CMP_NLE_US: i32 = 0x06;252 /// Ordered (non-signaling)253 const _CMP_ORD_Q: i32 = 0x07;254 /// Equal (unordered, non-signaling)255 const _CMP_EQ_UQ: i32 = 0x08;256 /// Not-greater-than-or-equal (unordered, signaling)257 const _CMP_NGE_US: i32 = 0x09;258 /// Not-greater-than (unordered, signaling)259 const _CMP_NGT_US: i32 = 0x0a;260 /// False (ordered, non-signaling)261 const _CMP_FALSE_OQ: i32 = 0x0b;262 /// Not-equal (ordered, non-signaling)263 const _CMP_NEQ_OQ: i32 = 0x0c;264 /// Greater-than-or-equal (ordered, signaling)265 const _CMP_GE_OS: i32 = 0x0d;266 /// Greater-than (ordered, signaling)267 const _CMP_GT_OS: i32 = 0x0e;268 /// True (unordered, non-signaling)269 const _CMP_TRUE_UQ: i32 = 0x0f;270 /// Equal (ordered, signaling)271 const _CMP_EQ_OS: i32 = 0x10;272 /// Less-than (ordered, non-signaling)273 const _CMP_LT_OQ: i32 = 0x11;274 /// Less-than-or-equal (ordered, non-signaling)275 const _CMP_LE_OQ: i32 = 0x12;276 /// Unordered (signaling)277 const _CMP_UNORD_S: i32 = 0x13;278 /// Not-equal (unordered, signaling)279 const 
_CMP_NEQ_US: i32 = 0x14;280 /// Not-less-than (unordered, non-signaling)281 const _CMP_NLT_UQ: i32 = 0x15;282 /// Not-less-than-or-equal (unordered, non-signaling)283 const _CMP_NLE_UQ: i32 = 0x16;284 /// Ordered (signaling)285 const _CMP_ORD_S: i32 = 0x17;286 /// Equal (unordered, signaling)287 const _CMP_EQ_US: i32 = 0x18;288 /// Not-greater-than-or-equal (unordered, non-signaling)289 const _CMP_NGE_UQ: i32 = 0x19;290 /// Not-greater-than (unordered, non-signaling)291 const _CMP_NGT_UQ: i32 = 0x1a;292 /// False (ordered, signaling)293 const _CMP_FALSE_OS: i32 = 0x1b;294 /// Not-equal (ordered, signaling)295 const _CMP_NEQ_OS: i32 = 0x1c;296 /// Greater-than-or-equal (ordered, non-signaling)297 const _CMP_GE_OQ: i32 = 0x1d;298 /// Greater-than (ordered, non-signaling)299 const _CMP_GT_OQ: i32 = 0x1e;300 /// True (unordered, signaling)301 const _CMP_TRUE_US: i32 = 0x1f;302303 simd_pair_for_each_lane(fx, x, y, ret, &|fx, lane_ty, res_lane_ty, x_lane, y_lane| {304 let res_lane = match lane_ty.kind() {305 ty::Float(_) => fx.bcx.ins().fcmp(flt_cc, x_lane, y_lane),306 _ => unreachable!("{:?}", lane_ty),307 };308 bool_to_zero_or_max_uint(fx, res_lane_ty, res_lane)309 });310 }311 "llvm.x86.ssse3.pshuf.b.128" | "llvm.x86.avx2.pshuf.b" => {312 let (a, b) = match args {313 [a, b] => (a, b),314 _ => bug!("wrong number of args for intrinsic {intrinsic}"),315 };316 let a = codegen_operand(fx, &a.node);317 let b = codegen_operand(fx, &b.node);318319 // Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332320 let zero = fx.bcx.ins().iconst(types::I8, 0);321 for i in 0..16 {322 let b_lane = b.value_lane(fx, i).load_scalar(fx);323 let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);324 let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf);325 let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);326 let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);327 let res = 
fx.bcx.ins().select(is_zero, zero, a_lane);328 ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());329 }330331 if intrinsic == "llvm.x86.avx2.pshuf.b" {332 for i in 16..32 {333 let b_lane = b.value_lane(fx, i).load_scalar(fx);334 let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);335 let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);336 let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);337 let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);338 let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);339 let res = fx.bcx.ins().select(is_zero, zero, a_lane);340 ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());341 }342 }343 }344 "llvm.x86.avx2.permd" => {345 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32346 intrinsic_args!(fx, args => (a, idx); intrinsic);347348 for j in 0..=7 {349 let index = idx.value_typed_lane(fx, fx.tcx.types.u32, j).load_scalar(fx);350 let index = fx.bcx.ins().uextend(fx.pointer_type, index);351 let value = a.value_lane_dyn(fx, index).load_scalar(fx);352 ret.place_typed_lane(fx, fx.tcx.types.u32, j).to_ptr().store(353 fx,354 value,355 MemFlags::trusted(),356 );357 }358 }359 "llvm.x86.avx2.vperm2i128"360 | "llvm.x86.avx.vperm2f128.ps.256"361 | "llvm.x86.avx.vperm2f128.pd.256" => {362 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256363 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps364 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd365 let (a, b, imm8) = match args {366 [a, b, imm8] => (a, b, imm8),367 _ => bug!("wrong number of args for intrinsic {intrinsic}"),368 };369 let a = codegen_operand(fx, &a.node);370 let b = codegen_operand(fx, &b.node);371 let imm8 = codegen_operand(fx, &imm8.node).load_scalar(fx);372373 let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 
0).load_scalar(fx);374 let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);375376 let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);377 let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);378379 fn select4(380 fx: &mut FunctionCx<'_, '_, '_>,381 a_high: Value,382 a_low: Value,383 b_high: Value,384 b_low: Value,385 control: Value,386 ) -> Value {387 let a_or_b = fx.bcx.ins().band_imm(control, 0b0010);388 let high_or_low = fx.bcx.ins().band_imm(control, 0b0001);389 let is_zero = fx.bcx.ins().band_imm(control, 0b1000);390391 let zero = fx.bcx.ins().iconst(types::I64, 0);392 let zero = fx.bcx.ins().iconcat(zero, zero);393394 let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low);395 let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low);396 let res = fx.bcx.ins().select(a_or_b, res_b, res_a);397 fx.bcx.ins().select(is_zero, zero, res)398 }399400 let control0 = imm8;401 let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);402403 let control1 = fx.bcx.ins().ushr_imm(imm8, 4);404 let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);405406 ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(407 fx,408 res_low,409 MemFlags::trusted(),410 );411 ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(412 fx,413 res_high,414 MemFlags::trusted(),415 );416 }417 "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {418 intrinsic_args!(fx, args => (a); intrinsic);419420 simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| {421 fx.bcx.ins().iabs(lane)422 });423 }424 "llvm.x86.sse2.cvttps2dq" => {425 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32&ig_expand=2429426 intrinsic_args!(fx, args => (a); intrinsic);427 let a = a.load_scalar(fx);428429 let value = fx.bcx.ins().x86_cvtt2dq(types::I32X4, a);430 let cvalue = CValue::by_val(value, ret.layout());431 
ret.write_cvalue(fx, cvalue);432 }433 "llvm.x86.sse2.cvtps2dq" => {434 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32435 intrinsic_args!(fx, args => (a); intrinsic);436 let a = a.load_scalar(fx);437438 // Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned439 // into 0x80000000 for which Cranelift doesn't have a native instruction.440 codegen_inline_asm_inner(441 fx,442 &[InlineAsmTemplatePiece::String("cvtps2dq xmm0, xmm0".into())],443 &[CInlineAsmOperand::InOut {444 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),445 _late: true,446 in_value: a,447 out_place: Some(ret),448 }],449 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,450 );451 }452 "llvm.x86.addcarry.32" | "llvm.x86.addcarry.64" => {453 intrinsic_args!(fx, args => (c_in, a, b); intrinsic);454 let c_in = c_in.load_scalar(fx);455456 let (cb_out, c) = llvm_add_sub(fx, BinOp::Add, c_in, a, b);457458 let layout = fx.layout_of(Ty::new_tup(fx.tcx, &[fx.tcx.types.u8, a.layout().ty]));459 let val = CValue::by_val_pair(cb_out, c, layout);460 ret.write_cvalue(fx, val);461 }462 "llvm.x86.addcarryx.u32" | "llvm.x86.addcarryx.u64" => {463 intrinsic_args!(fx, args => (c_in, a, b, out); intrinsic);464 let c_in = c_in.load_scalar(fx);465466 let (cb_out, c) = llvm_add_sub(fx, BinOp::Add, c_in, a, b);467468 Pointer::new(out.load_scalar(fx)).store(fx, c, MemFlags::trusted());469 ret.write_cvalue(fx, CValue::by_val(cb_out, fx.layout_of(fx.tcx.types.u8)));470 }471 "llvm.x86.subborrow.32" | "llvm.x86.subborrow.64" => {472 intrinsic_args!(fx, args => (b_in, a, b); intrinsic);473 let b_in = b_in.load_scalar(fx);474475 let (cb_out, c) = llvm_add_sub(fx, BinOp::Sub, b_in, a, b);476477 let layout = fx.layout_of(Ty::new_tup(fx.tcx, &[fx.tcx.types.u8, a.layout().ty]));478 let val = CValue::by_val_pair(cb_out, c, layout);479 ret.write_cvalue(fx, val);480 }481 "llvm.x86.sse2.pavg.b" | 
"llvm.x86.sse2.pavg.w" => {482 intrinsic_args!(fx, args => (a, b); intrinsic);483484 // FIXME use vector instructions when possible485 simd_pair_for_each_lane(486 fx,487 a,488 b,489 ret,490 &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| {491 // (a + b + 1) >> 1492 let lane_ty = fx.bcx.func.dfg.value_type(a_lane);493 let a_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), a_lane);494 let b_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), b_lane);495 let sum = fx.bcx.ins().iadd(a_lane, b_lane);496 let num_plus_one = fx.bcx.ins().iadd_imm(sum, 1);497 let res = fx.bcx.ins().ushr_imm(num_plus_one, 1);498 fx.bcx.ins().ireduce(lane_ty, res)499 },500 );501 }502 "llvm.x86.sse2.psra.w" => {503 intrinsic_args!(fx, args => (a, count); intrinsic);504505 let count_lane = count.force_stack(fx).0.load(fx, types::I64, MemFlags::trusted());506 let lane_ty = fx.clif_type(a.layout().ty.simd_size_and_type(fx.tcx).1).unwrap();507 let max_count = fx.bcx.ins().iconst(types::I64, i64::from(lane_ty.bits() - 1));508 let saturated_count = fx.bcx.ins().umin(count_lane, max_count);509510 // FIXME use vector instructions when possible511 simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, a_lane| {512 fx.bcx.ins().sshr(a_lane, saturated_count)513 });514 }515 "llvm.x86.sse2.psad.bw" | "llvm.x86.avx2.psad.bw" => {516 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8&ig_expand=5770517 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8&ig_expand=5771518 intrinsic_args!(fx, args => (a, b); intrinsic);519520 assert_eq!(a.layout(), b.layout());521 let layout = a.layout();522523 let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);524 let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);525 assert_eq!(lane_ty, fx.tcx.types.u8);526 assert_eq!(ret_lane_ty, fx.tcx.types.u64);527 assert_eq!(lane_count, ret_lane_count * 8);528529 let ret_lane_layout = 
fx.layout_of(fx.tcx.types.u64);530 for out_lane_idx in 0..lane_count / 8 {531 let mut lane_diff_acc = fx.bcx.ins().iconst(types::I64, 0);532533 for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 8 {534 let a_lane = a.value_lane(fx, lane_idx).load_scalar(fx);535 let a_lane = fx.bcx.ins().uextend(types::I16, a_lane);536 let b_lane = b.value_lane(fx, lane_idx).load_scalar(fx);537 let b_lane = fx.bcx.ins().uextend(types::I16, b_lane);538539 let lane_diff = fx.bcx.ins().isub(a_lane, b_lane);540 let abs_lane_diff = fx.bcx.ins().iabs(lane_diff);541 let abs_lane_diff = fx.bcx.ins().uextend(types::I64, abs_lane_diff);542 lane_diff_acc = fx.bcx.ins().iadd(lane_diff_acc, abs_lane_diff);543 }544545 let res_lane = CValue::by_val(lane_diff_acc, ret_lane_layout);546547 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);548 }549 }550 "llvm.x86.ssse3.pmadd.ub.sw.128" | "llvm.x86.avx2.pmadd.ub.sw" => {551 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16&ig_expand=4267552 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&ig_expand=4270553 intrinsic_args!(fx, args => (a, b); intrinsic);554555 let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);556 let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);557 assert_eq!(lane_ty, fx.tcx.types.u8);558 assert_eq!(ret_lane_ty, fx.tcx.types.i16);559 assert_eq!(lane_count, ret_lane_count * 2);560561 let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);562 for out_lane_idx in 0..lane_count / 2 {563 let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);564 let a_lane0 = fx.bcx.ins().uextend(types::I16, a_lane0);565 let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);566 let b_lane0 = fx.bcx.ins().sextend(types::I16, b_lane0);567568 let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);569 let a_lane1 = fx.bcx.ins().uextend(types::I16, a_lane1);570 let 
b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);571 let b_lane1 = fx.bcx.ins().sextend(types::I16, b_lane1);572573 let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0);574 let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1);575576 let (val, has_overflow) = fx.bcx.ins().sadd_overflow(mul0, mul1);577578 let rhs_ge_zero = fx.bcx.ins().icmp_imm(IntCC::SignedGreaterThanOrEqual, mul1, 0);579580 let min = fx.bcx.ins().iconst(types::I16, i64::from(i16::MIN as u16));581 let max = fx.bcx.ins().iconst(types::I16, i64::from(i16::MAX as u16));582583 let sat_val = fx.bcx.ins().select(rhs_ge_zero, max, min);584 let res_lane = fx.bcx.ins().select(has_overflow, sat_val, val);585586 let res_lane = CValue::by_val(res_lane, ret_lane_layout);587588 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);589 }590 }591 "llvm.x86.sse2.pmadd.wd" | "llvm.x86.avx2.pmadd.wd" => {592 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16&ig_expand=4231593 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16&ig_expand=4234594 intrinsic_args!(fx, args => (a, b); intrinsic);595596 assert_eq!(a.layout(), b.layout());597 let layout = a.layout();598599 let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);600 let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);601 assert_eq!(lane_ty, fx.tcx.types.i16);602 assert_eq!(ret_lane_ty, fx.tcx.types.i32);603 assert_eq!(lane_count, ret_lane_count * 2);604605 let ret_lane_layout = fx.layout_of(fx.tcx.types.i32);606 for out_lane_idx in 0..lane_count / 2 {607 let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);608 let a_lane0 = fx.bcx.ins().sextend(types::I32, a_lane0);609 let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);610 let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0);611612 let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);613 let a_lane1 = 
fx.bcx.ins().sextend(types::I32, a_lane1);614 let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);615 let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1);616617 let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0);618 let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1);619620 let res_lane = fx.bcx.ins().iadd(mul0, mul1);621 let res_lane = CValue::by_val(res_lane, ret_lane_layout);622623 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);624 }625 }626627 "llvm.x86.ssse3.pmul.hr.sw.128" => {628 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782629 intrinsic_args!(fx, args => (a, b); intrinsic);630631 assert_eq!(a.layout(), b.layout());632 let layout = a.layout();633634 let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);635 let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);636 assert_eq!(lane_ty, fx.tcx.types.i16);637 assert_eq!(ret_lane_ty, fx.tcx.types.i16);638 assert_eq!(lane_count, ret_lane_count);639640 let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);641 for out_lane_idx in 0..lane_count {642 let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx);643 let a_lane = fx.bcx.ins().sextend(types::I32, a_lane);644 let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx);645 let b_lane = fx.bcx.ins().sextend(types::I32, b_lane);646647 let mul: Value = fx.bcx.ins().imul(a_lane, b_lane);648 let shifted = fx.bcx.ins().ushr_imm(mul, 14);649 let incremented = fx.bcx.ins().iadd_imm(shifted, 1);650 let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1);651652 let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again);653 let res_lane = CValue::by_val(res_lane, ret_lane_layout);654655 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);656 }657 }658659 "llvm.x86.sse2.packuswb.128" => {660 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903661 intrinsic_args!(fx, 
args => (a, b); intrinsic);662663 pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Sse);664 }665666 "llvm.x86.sse2.packsswb.128" => {667 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16&ig_expand=4848668 intrinsic_args!(fx, args => (a, b); intrinsic);669670 pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Sse);671 }672673 "llvm.x86.avx2.packuswb" => {674 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906675 intrinsic_args!(fx, args => (a, b); intrinsic);676677 pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Avx);678 }679680 "llvm.x86.avx2.packsswb" => {681 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16&ig_expand=4851682 intrinsic_args!(fx, args => (a, b); intrinsic);683684 pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Avx);685 }686687 "llvm.x86.sse41.packusdw" => {688 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912689 intrinsic_args!(fx, args => (a, b); intrinsic);690691 pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Sse);692 }693694 "llvm.x86.sse2.packssdw.128" => {695 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889696 intrinsic_args!(fx, args => (a, b); intrinsic);697698 pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Sse);699 }700701 "llvm.x86.avx2.packusdw" => {702 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32&ig_expand=4883703 intrinsic_args!(fx, args => (a, b); intrinsic);704705 pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Avx);706 }707708 "llvm.x86.avx2.packssdw" => {709 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892710 intrinsic_args!(fx, args => (a, b); intrinsic);711712 
pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Avx);713 }714715 "llvm.x86.sse42.crc32.32.8"716 | "llvm.x86.sse42.crc32.32.16"717 | "llvm.x86.sse42.crc32.32.32"718 | "llvm.x86.sse42.crc32.64.64" => {719 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=1419&text=_mm_crc32_u32720 intrinsic_args!(fx, args => (crc, v); intrinsic);721722 let crc = crc.load_scalar(fx);723 let v = v.load_scalar(fx);724725 let asm = match intrinsic {726 "llvm.x86.sse42.crc32.32.8" => "crc32 eax, dl",727 "llvm.x86.sse42.crc32.32.16" => "crc32 eax, dx",728 "llvm.x86.sse42.crc32.32.32" => "crc32 eax, edx",729 "llvm.x86.sse42.crc32.64.64" => "crc32 rax, rdx",730 _ => unreachable!(),731 };732733 codegen_inline_asm_inner(734 fx,735 &[InlineAsmTemplatePiece::String(asm.into())],736 &[737 CInlineAsmOperand::InOut {738 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),739 _late: true,740 in_value: crc,741 out_place: Some(ret),742 },743 CInlineAsmOperand::In {744 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),745 value: v,746 },747 ],748 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,749 );750 }751752 "llvm.x86.sse42.pcmpestri128" => {753 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri&ig_expand=939754 intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);755756 let a = a.load_scalar(fx);757 let la = la.load_scalar(fx);758 let b = b.load_scalar(fx);759 let lb = lb.load_scalar(fx);760761 let imm8 =762 if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4].node) {763 imm8764 } else {765 fx.tcx766 .dcx()767 .span_fatal(span, "Index argument for `_mm_cmpestri` is not a constant");768 };769770 let imm8 = imm8.to_u8();771772 codegen_inline_asm_inner(773 fx,774 &[InlineAsmTemplatePiece::String(format!("pcmpestri xmm0, xmm1, {imm8}").into())],775 &[776 CInlineAsmOperand::In {777 reg: 
InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),778 value: a,779 },780 CInlineAsmOperand::In {781 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),782 value: b,783 },784 // Implicit argument to the pcmpestri intrinsic785 CInlineAsmOperand::In {786 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),787 value: la,788 },789 // Implicit argument to the pcmpestri intrinsic790 CInlineAsmOperand::In {791 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),792 value: lb,793 },794 // Implicit result of the pcmpestri intrinsic795 CInlineAsmOperand::Out {796 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),797 late: true,798 place: Some(ret),799 },800 ],801 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,802 );803 }804805 "llvm.x86.sse42.pcmpestrm128" => {806 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm&ig_expand=940807 intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);808809 let a = a.load_scalar(fx);810 let la = la.load_scalar(fx);811 let b = b.load_scalar(fx);812 let lb = lb.load_scalar(fx);813814 let imm8 =815 if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4].node) {816 imm8817 } else {818 fx.tcx819 .dcx()820 .span_fatal(span, "Index argument for `_mm_cmpestrm` is not a constant");821 };822823 let imm8 = imm8.to_u8();824825 codegen_inline_asm_inner(826 fx,827 &[InlineAsmTemplatePiece::String(format!("pcmpestrm xmm0, xmm1, {imm8}").into())],828 &[829 CInlineAsmOperand::InOut {830 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),831 _late: true,832 in_value: a,833 out_place: Some(ret),834 },835 CInlineAsmOperand::In {836 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),837 value: b,838 },839 // Implicit argument to the pcmpestri intrinsic840 CInlineAsmOperand::In {841 reg: 
InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),842 value: la,843 },844 // Implicit argument to the pcmpestri intrinsic845 CInlineAsmOperand::In {846 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),847 value: lb,848 },849 ],850 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,851 );852 }853854 "llvm.x86.pclmulqdq" => {855 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772856 intrinsic_args!(fx, args => (a, b, _imm8); intrinsic);857858 let a = a.load_scalar(fx);859 let b = b.load_scalar(fx);860861 let imm8 =862 if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[2].node) {863 imm8864 } else {865 fx.tcx.dcx().span_fatal(866 span,867 "Index argument for `_mm_clmulepi64_si128` is not a constant",868 );869 };870871 let imm8 = imm8.to_u8();872873 codegen_inline_asm_inner(874 fx,875 &[InlineAsmTemplatePiece::String(format!("pclmulqdq xmm0, xmm1, {imm8}").into())],876 &[877 CInlineAsmOperand::InOut {878 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),879 _late: true,880 in_value: a,881 out_place: Some(ret),882 },883 CInlineAsmOperand::In {884 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),885 value: b,886 },887 ],888 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,889 );890 }891892 "llvm.x86.aesni.aeskeygenassist" => {893 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128&ig_expand=261894 intrinsic_args!(fx, args => (a, _imm8); intrinsic);895896 let a = a.load_scalar(fx);897898 let imm8 =899 if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[1].node) {900 imm8901 } else {902 fx.tcx.dcx().span_fatal(903 span,904 "Index argument for `_mm_aeskeygenassist_si128` is not a constant",905 );906 };907908 let imm8 = imm8.to_u8();909910 codegen_inline_asm_inner(911 
fx,912 &[InlineAsmTemplatePiece::String(913 format!("aeskeygenassist xmm0, xmm0, {imm8}").into(),914 )],915 &[CInlineAsmOperand::InOut {916 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),917 _late: true,918 in_value: a,919 out_place: Some(ret),920 }],921 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,922 );923 }924925 "llvm.x86.aesni.aesimc" => {926 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128&ig_expand=260927 intrinsic_args!(fx, args => (a); intrinsic);928929 let a = a.load_scalar(fx);930931 codegen_inline_asm_inner(932 fx,933 &[InlineAsmTemplatePiece::String("aesimc xmm0, xmm0".into())],934 &[CInlineAsmOperand::InOut {935 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),936 _late: true,937 in_value: a,938 out_place: Some(ret),939 }],940 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,941 );942 }943944 "llvm.x86.aesni.aesenc" => {945 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128&ig_expand=252946 intrinsic_args!(fx, args => (a, round_key); intrinsic);947948 let a = a.load_scalar(fx);949 let round_key = round_key.load_scalar(fx);950951 codegen_inline_asm_inner(952 fx,953 &[InlineAsmTemplatePiece::String("aesenc xmm0, xmm1".into())],954 &[955 CInlineAsmOperand::InOut {956 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),957 _late: true,958 in_value: a,959 out_place: Some(ret),960 },961 CInlineAsmOperand::In {962 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),963 value: round_key,964 },965 ],966 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,967 );968 }969970 "llvm.x86.aesni.aesenclast" => {971 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128&ig_expand=257972 intrinsic_args!(fx, args => (a, round_key); intrinsic);973974 
let a = a.load_scalar(fx);975 let round_key = round_key.load_scalar(fx);976977 codegen_inline_asm_inner(978 fx,979 &[InlineAsmTemplatePiece::String("aesenclast xmm0, xmm1".into())],980 &[981 CInlineAsmOperand::InOut {982 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),983 _late: true,984 in_value: a,985 out_place: Some(ret),986 },987 CInlineAsmOperand::In {988 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),989 value: round_key,990 },991 ],992 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,993 );994 }995996 "llvm.x86.aesni.aesdec" => {997 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128&ig_expand=242998 intrinsic_args!(fx, args => (a, round_key); intrinsic);9991000 let a = a.load_scalar(fx);1001 let round_key = round_key.load_scalar(fx);10021003 codegen_inline_asm_inner(1004 fx,1005 &[InlineAsmTemplatePiece::String("aesdec xmm0, xmm1".into())],1006 &[1007 CInlineAsmOperand::InOut {1008 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),1009 _late: true,1010 in_value: a,1011 out_place: Some(ret),1012 },1013 CInlineAsmOperand::In {1014 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1015 value: round_key,1016 },1017 ],1018 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1019 );1020 }10211022 "llvm.x86.aesni.aesdeclast" => {1023 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128&ig_expand=2471024 intrinsic_args!(fx, args => (a, round_key); intrinsic);10251026 let a = a.load_scalar(fx);1027 let round_key = round_key.load_scalar(fx);10281029 codegen_inline_asm_inner(1030 fx,1031 &[InlineAsmTemplatePiece::String("aesdeclast xmm0, xmm1".into())],1032 &[1033 CInlineAsmOperand::InOut {1034 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),1035 _late: true,1036 in_value: a,1037 out_place: 
Some(ret),1038 },1039 CInlineAsmOperand::In {1040 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1041 value: round_key,1042 },1043 ],1044 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1045 );1046 }10471048 "llvm.x86.sha1rnds4" => {1049 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1rnds4_epu32&ig_expand=58771050 intrinsic_args!(fx, args => (a, b, _func); intrinsic);10511052 let a = a.load_scalar(fx);1053 let b = b.load_scalar(fx);10541055 let func = if let Some(func) =1056 crate::constant::mir_operand_get_const_val(fx, &args[2].node)1057 {1058 func1059 } else {1060 fx.tcx1061 .dcx()1062 .span_fatal(span, "Func argument for `_mm_sha1rnds4_epu32` is not a constant");1063 };10641065 let func = func.to_u8();10661067 codegen_inline_asm_inner(1068 fx,1069 &[InlineAsmTemplatePiece::String(format!("sha1rnds4 xmm1, xmm2, {func}").into())],1070 &[1071 CInlineAsmOperand::InOut {1072 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1073 _late: true,1074 in_value: a,1075 out_place: Some(ret),1076 },1077 CInlineAsmOperand::In {1078 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),1079 value: b,1080 },1081 ],1082 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1083 );1084 }10851086 "llvm.x86.sha1msg1" => {1087 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg1_epu32&ig_expand=58741088 intrinsic_args!(fx, args => (a, b); intrinsic);10891090 let a = a.load_scalar(fx);1091 let b = b.load_scalar(fx);10921093 codegen_inline_asm_inner(1094 fx,1095 &[InlineAsmTemplatePiece::String("sha1msg1 xmm1, xmm2".into())],1096 &[1097 CInlineAsmOperand::InOut {1098 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1099 _late: true,1100 in_value: a,1101 out_place: Some(ret),1102 },1103 CInlineAsmOperand::In {1104 reg: 
InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),1105 value: b,1106 },1107 ],1108 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1109 );1110 }11111112 "llvm.x86.sha1msg2" => {1113 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg2_epu32&ig_expand=58751114 intrinsic_args!(fx, args => (a, b); intrinsic);11151116 let a = a.load_scalar(fx);1117 let b = b.load_scalar(fx);11181119 codegen_inline_asm_inner(1120 fx,1121 &[InlineAsmTemplatePiece::String("sha1msg2 xmm1, xmm2".into())],1122 &[1123 CInlineAsmOperand::InOut {1124 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1125 _late: true,1126 in_value: a,1127 out_place: Some(ret),1128 },1129 CInlineAsmOperand::In {1130 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),1131 value: b,1132 },1133 ],1134 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1135 );1136 }11371138 "llvm.x86.sha1nexte" => {1139 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1nexte_epu32&ig_expand=58761140 intrinsic_args!(fx, args => (a, b); intrinsic);11411142 let a = a.load_scalar(fx);1143 let b = b.load_scalar(fx);11441145 codegen_inline_asm_inner(1146 fx,1147 &[InlineAsmTemplatePiece::String("sha1nexte xmm1, xmm2".into())],1148 &[1149 CInlineAsmOperand::InOut {1150 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1151 _late: true,1152 in_value: a,1153 out_place: Some(ret),1154 },1155 CInlineAsmOperand::In {1156 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),1157 value: b,1158 },1159 ],1160 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1161 );1162 }11631164 "llvm.x86.sha256rnds2" => {1165 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256rnds2_epu32&ig_expand=59771166 intrinsic_args!(fx, args => (a, b, k); 
intrinsic);11671168 let a = a.load_scalar(fx);1169 let b = b.load_scalar(fx);1170 let k = k.load_scalar(fx);11711172 codegen_inline_asm_inner(1173 fx,1174 &[InlineAsmTemplatePiece::String("sha256rnds2 xmm1, xmm2".into())],1175 &[1176 CInlineAsmOperand::InOut {1177 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1178 _late: true,1179 in_value: a,1180 out_place: Some(ret),1181 },1182 CInlineAsmOperand::In {1183 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),1184 value: b,1185 },1186 // Implicit argument to the sha256rnds2 instruction1187 CInlineAsmOperand::In {1188 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),1189 value: k,1190 },1191 ],1192 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1193 );1194 }11951196 "llvm.x86.sha256msg1" => {1197 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg1_epu32&ig_expand=59751198 intrinsic_args!(fx, args => (a, b); intrinsic);11991200 let a = a.load_scalar(fx);1201 let b = b.load_scalar(fx);12021203 codegen_inline_asm_inner(1204 fx,1205 &[InlineAsmTemplatePiece::String("sha256msg1 xmm1, xmm2".into())],1206 &[1207 CInlineAsmOperand::InOut {1208 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1209 _late: true,1210 in_value: a,1211 out_place: Some(ret),1212 },1213 CInlineAsmOperand::In {1214 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),1215 value: b,1216 },1217 ],1218 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1219 );1220 }12211222 "llvm.x86.sha256msg2" => {1223 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg2_epu32&ig_expand=59761224 intrinsic_args!(fx, args => (a, b); intrinsic);12251226 let a = a.load_scalar(fx);1227 let b = b.load_scalar(fx);12281229 codegen_inline_asm_inner(1230 fx,1231 &[InlineAsmTemplatePiece::String("sha256msg2 xmm1, 
xmm2".into())],1232 &[1233 CInlineAsmOperand::InOut {1234 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),1235 _late: true,1236 in_value: a,1237 out_place: Some(ret),1238 },1239 CInlineAsmOperand::In {1240 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),1241 value: b,1242 },1243 ],1244 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1245 );1246 }12471248 "llvm.x86.avx.ptestz.256" => {1249 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=69451250 intrinsic_args!(fx, args => (a, b); intrinsic);12511252 assert_eq!(a.layout(), b.layout());1253 let layout = a.layout();12541255 let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);1256 assert_eq!(lane_ty, fx.tcx.types.i64);1257 assert_eq!(ret.layout().ty, fx.tcx.types.i32);1258 assert_eq!(lane_count, 4);12591260 let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);1261 let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);1262 let a_lane2 = a.value_lane(fx, 2).load_scalar(fx);1263 let a_lane3 = a.value_lane(fx, 3).load_scalar(fx);1264 let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);1265 let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);1266 let b_lane2 = b.value_lane(fx, 2).load_scalar(fx);1267 let b_lane3 = b.value_lane(fx, 3).load_scalar(fx);12681269 let zero0 = fx.bcx.ins().band(a_lane0, b_lane0);1270 let zero1 = fx.bcx.ins().band(a_lane1, b_lane1);1271 let zero2 = fx.bcx.ins().band(a_lane2, b_lane2);1272 let zero3 = fx.bcx.ins().band(a_lane3, b_lane3);12731274 let all_zero0 = fx.bcx.ins().bor(zero0, zero1);1275 let all_zero1 = fx.bcx.ins().bor(zero2, zero3);1276 let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1);12771278 let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0);1279 let res = CValue::by_val(1280 fx.bcx.ins().uextend(types::I32, res),1281 fx.layout_of(fx.tcx.types.i32),1282 );1283 ret.write_cvalue(fx, res);1284 }12851286 "llvm.x86.rdtsc" => {1287 // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdtsc&ig_expand=527312881289 let res_place = CPlace::new_stack_slot(1290 fx,1291 fx.layout_of(Ty::new_tup(fx.tcx, &[fx.tcx.types.u32, fx.tcx.types.u32])),1292 );1293 let eax_place = res_place.place_field(fx, FieldIdx::new(0));1294 let edx_place = res_place.place_field(fx, FieldIdx::new(1));1295 codegen_inline_asm_inner(1296 fx,1297 &[InlineAsmTemplatePiece::String("rdtsc".into())],1298 &[1299 CInlineAsmOperand::Out {1300 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),1301 late: true,1302 place: Some(eax_place),1303 },1304 CInlineAsmOperand::Out {1305 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),1306 late: true,1307 place: Some(edx_place),1308 },1309 ],1310 InlineAsmOptions::NOSTACK | InlineAsmOptions::NOMEM,1311 );1312 let res = res_place.to_cvalue(fx);1313 ret.write_cvalue_transmute(fx, res);1314 }13151316 "llvm.x86.vcvtps2ph.128" => {1317 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_ph1318 intrinsic_args!(fx, args => (a, _imm8); intrinsic);1319 let a = a.load_scalar(fx);13201321 let imm8 =1322 if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[1].node) {1323 imm81324 } else {1325 fx.tcx1326 .dcx()1327 .span_fatal(span, "Index argument for `_mm_cvtps_ph` is not a constant");1328 };13291330 let imm8 = imm8.to_u32();13311332 codegen_inline_asm_inner(1333 fx,1334 &[InlineAsmTemplatePiece::String(format!("vcvtps2ph xmm0, xmm0, {imm8}").into())],1335 &[CInlineAsmOperand::InOut {1336 reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),1337 _late: true,1338 in_value: a,1339 out_place: Some(ret),1340 }],1341 InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,1342 );1343 }13441345 _ => {1346 fx.tcx1347 .dcx()1348 .warn(format!("unsupported x86 llvm intrinsic {}; replacing with trap", intrinsic));1349 let msg = format!(1350 
                // Continuation of the `format!` opened just above: a multi-line
                // string literal using `\` line-continuations.
                "{intrinsic} is not yet supported.\n\
                     See https://github.com/rust-lang/rustc_codegen_cranelift/issues/171\n\
                     Please open an issue at https://github.com/rust-lang/rustc_codegen_cranelift/issues"
            );
            crate::base::codegen_panic_nounwind(fx, &msg, span);
            // The generated panic diverges, so skip the fallthrough jump below.
            return;
        }
    }

    // Every supported intrinsic falls through to here: jump to the call
    // terminator's target block to continue normal control flow.
    let dest = target.expect("all llvm intrinsics used by stdlib should return");
    let ret_block = fx.get_block(dest);
    fx.bcx.ins().jump(ret_block, &[]);
}

// NOTE(review): the names below appear to be a reminder list of intrinsics
// that are not handled above — confirm against the issue tracker.
// llvm.x86.avx2.vperm2i128
// llvm.x86.ssse3.pshuf.b.128
// llvm.x86.avx2.pshuf.b

/// Combine an incoming carry/borrow bit with a checked add/sub of `a` and `b`.
///
/// `bin_op` selects addition (carry) or subtraction (borrow). `cb_in` is the
/// incoming carry/borrow flag, zero-extended to the operand width before the
/// second checked operation. Returns `(carry_or_borrow_out, result)`.
/// NOTE(review): presumably the shared helper for the `addcarry`/`subborrow`
/// intrinsic arms — the callers are outside this chunk, confirm.
fn llvm_add_sub<'tcx>(
    fx: &mut FunctionCx<'_, '_, 'tcx>,
    bin_op: BinOp,
    cb_in: Value,
    a: CValue<'tcx>,
    b: CValue<'tcx>,
) -> (Value, Value) {
    // Both operands must be the same integer type.
    assert_eq!(a.layout().ty, b.layout().ty);

    // c + carry -> c + first intermediate carry or borrow respectively
    let int0 = crate::num::codegen_checked_int_binop(fx, bin_op, a, b);
    let c = int0.value_field(fx, FieldIdx::ZERO);
    let cb0 = int0.value_field(fx, FieldIdx::new(1)).load_scalar(fx);

    // c + carry -> c + second intermediate carry or borrow respectively
    let clif_ty = fx.clif_type(a.layout().ty).unwrap();
    let cb_in_as_int = fx.bcx.ins().uextend(clif_ty, cb_in);
    let cb_in_as_int = CValue::by_val(cb_in_as_int, fx.layout_of(a.layout().ty));
    let int1 = crate::num::codegen_checked_int_binop(fx, bin_op, c, cb_in_as_int);
    let (c, cb1) = int1.load_scalar_pair(fx);

    // carry0 | carry1 -> carry or borrow respectively
    // (at most one of the two steps can overflow, so OR-ing is sufficient)
    let cb_out = fx.bcx.ins().bor(cb0, cb1);

    (cb_out, c)
}

/// Result-lane width and signedness selector for the x86 pack
/// instructions (`_mm{,256}_pack{us,s}_epi{16,32}`).
enum PackSize {
    U8,
    U16,
    S8,
    S16,
}

impl PackSize {
    /// Cranelift type of one *result* lane.
    fn ret_clif_type(&self) -> Type {
        match self {
            Self::U8 | Self::S8 => types::I8,
            Self::U16 | Self::S16 => types::I16,
        }
    }
    /// Cranelift type of one *source* lane (twice the result width).
    fn src_clif_type(&self) -> Type {
        match self {
            Self::U8 | Self::S8 => types::I16,
            Self::U16 | Self::S16 => types::I32,
        }
    }
    /// Rust-level source lane type. Sources are always signed, matching
    /// the `_epi16`/`_epi32` inputs of the intrinsics.
    fn src_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
        match self {
            Self::U8 | Self::S8 => tcx.types.i16,
            Self::U16 | Self::S16 => tcx.types.i32,
        }
    }
    /// Rust-level result lane type (signedness follows the variant).
    fn ret_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
        match self {
            Self::U8 => tcx.types.u8,
            Self::S8 => tcx.types.i8,
            Self::U16 => tcx.types.u16,
            Self::S16 => tcx.types.i16,
        }
    }
    /// Saturation upper bound, widened to `i64` for `iconst`.
    fn max(&self) -> i64 {
        match self {
            Self::U8 => u8::MAX as u64 as i64,
            Self::S8 => i8::MAX as u8 as u64 as i64,
            Self::U16 => u16::MAX as u64 as i64,
            // NOTE(review): the doubled `as u64 as u64` is value-preserving but
            // redundant; the S8 arm suggests `as u16 as u64` was intended — confirm.
            Self::S16 => i16::MAX as u64 as u64 as i64,
        }
    }
    /// Saturation lower bound, encoded as the *unsigned* bit pattern of the
    /// signed minimum at the source-lane width (e.g. -128 -> 0xFF80 for S8),
    /// which is the form the `iconst` immediate below expects.
    fn min(&self) -> i64 {
        match self {
            Self::U8 | Self::U16 => 0,
            Self::S8 => i16::from(i8::MIN) as u16 as i64,
            Self::S16 => i32::from(i16::MIN) as u32 as i64,
        }
    }
}

/// Vector width of the pack instruction: 128-bit (SSE) or 256-bit (AVX).
enum PackWidth {
    Sse = 1,
    Avx = 2,
}
impl PackWidth {
    /// Number of independent 128-bit halves the source is processed in:
    /// AVX packs operate per 128-bit half, SSE as a single unit.
    fn divisor(&self) -> u64 {
        match self {
            Self::Sse => 1,
            Self::Avx => 2,
        }
    }
}

/// Implement an x86 pack instruction with the intrinsic `_mm{,256}pack{us,s}_epi{16,32}`.
/// Validated for correctness against LLVM, see commit `c8f5d35508e062bd2d95e6c03429bfec831db6d3`.
fn pack_instruction<'tcx>(
    fx: &mut FunctionCx<'_, '_, 'tcx>,
    a: CValue<'tcx>,
    b: CValue<'tcx>,
    ret: CPlace<'tcx>,
    ret_size: PackSize,
    width: PackWidth,
) {
    assert_eq!(a.layout(), b.layout());
    let layout = a.layout();

    let (src_lane_count, src_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
    assert_eq!(src_lane_ty, ret_size.src_ty(fx.tcx));
    assert_eq!(ret_lane_ty, ret_size.ret_ty(fx.tcx));
    // Packing halves the lane width, so the result has twice as many lanes.
    assert_eq!(src_lane_count * 2, ret_lane_count);

    // Saturation bounds as constants of the *source* lane type.
    let min = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.min());
    let max = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.max());
    let ret_lane_layout = fx.layout_of(ret_size.ret_ty(fx.tcx));

    // Saturate-and-narrow one half of one source vector into the result.
    // `source_offset` / `dest_offset` are in units of `step_amount` lanes.
    let mut round = |source: CValue<'tcx>, source_offset: u64, dest_offset: u64| {
        let step_amount = src_lane_count / width.divisor();
        let dest_offset = step_amount * dest_offset;
        for idx in 0..step_amount {
            let lane = source.value_lane(fx, step_amount * source_offset + idx).load_scalar(fx);
            // Clamp into [min, max]: signed lower clamp first (sources are
            // signed), then an upper clamp whose signedness matches the
            // result lane type (unsigned for packus, signed for packs).
            let sat = fx.bcx.ins().smax(lane, min);
            let sat = match ret_size {
                PackSize::U8 | PackSize::U16 => fx.bcx.ins().umin(sat, max),
                PackSize::S8 | PackSize::S16 => fx.bcx.ins().smin(sat, max),
            };
            let res = fx.bcx.ins().ireduce(ret_size.ret_clif_type(), sat);
            let res_lane = CValue::by_val(res, ret_lane_layout);
            ret.place_lane(fx, dest_offset + idx).write_cvalue(fx, res_lane);
        }
    };

    // SSE layout (and low halves for AVX): ret = [pack(a), pack(b)].
    round(a, 0, 0);
    round(b, 0, 1);

    // AVX packs per 128-bit half: ret = [a.lo, b.lo, a.hi, b.hi].
    if let PackWidth::Avx = width {
        round(a, 1, 2);
        round(b, 1, 3);
    }
}