diff options
author | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2015-05-11 06:05:05 +0000 |
---|---|---|
committer | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2015-05-11 06:05:05 +0000 |
commit | 8189eb4d7e8344d18026bbb94a67787c73985830 (patch) | |
tree | 07135e86743ea5d31d9278dc034957509ea20296 | |
parent | 73f2a7bbb23581f68954ce13807fcdacef48a3bf (diff) |
AVX-512: Added SKX instructions and intrinsics:
1. {add/sub/mul/div} x {ps/pd} x {128/256} 2. max/min with sae
By Asaf Badouh (asaf.badouh@intel.com)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236971 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | include/llvm/IR/IntrinsicsX86.td | 72 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h | 2 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrAVX512.td | 132 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrFragmentsSIMD.td | 4 | ||||
-rw-r--r-- | lib/Target/X86/X86IntrinsicsInfo.h | 33 | ||||
-rw-r--r-- | test/CodeGen/X86/avx512-intrinsics.ll | 377 | ||||
-rw-r--r-- | test/CodeGen/X86/avx512er-intrinsics.ll | 24 | ||||
-rw-r--r-- | test/CodeGen/X86/avx512vl-intrinsics.ll | 264 | ||||
-rw-r--r-- | test/MC/X86/avx512-encodings.s | 583 |
9 files changed, 1391 insertions, 100 deletions
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 43aa89871e8..998fa7a6ce3 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3205,39 +3205,111 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_avx512_mask_add_ps_128 : GCCBuiltin<"__builtin_ia32_addps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_add_ps_256 : GCCBuiltin<"__builtin_ia32_addps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_add_pd_128 : GCCBuiltin<"__builtin_ia32_addpd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_add_pd_256 : GCCBuiltin<"__builtin_ia32_addpd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_sub_ps_128 : GCCBuiltin<"__builtin_ia32_subps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_sub_ps_256 : GCCBuiltin<"__builtin_ia32_subps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512_mask">, 
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_sub_pd_128 : GCCBuiltin<"__builtin_ia32_subpd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_sub_pd_256 : GCCBuiltin<"__builtin_ia32_subpd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_mul_ps_128 : GCCBuiltin<"__builtin_ia32_mulps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_mul_ps_256 : GCCBuiltin<"__builtin_ia32_mulps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_mul_pd_128 : GCCBuiltin<"__builtin_ia32_mulpd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_mul_pd_256 : GCCBuiltin<"__builtin_ia32_mulpd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_div_ps_128 : GCCBuiltin<"__builtin_ia32_divps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i8_ty], 
[IntrNoMem]>; + def int_x86_avx512_mask_div_ps_256 : GCCBuiltin<"__builtin_ia32_divps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_div_pd_128 : GCCBuiltin<"__builtin_ia32_divpd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_div_pd_256 : GCCBuiltin<"__builtin_ia32_divpd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_max_ps_128 : GCCBuiltin<"__builtin_ia32_maxps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_max_pd_128 : GCCBuiltin<"__builtin_ia32_maxpd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">, 
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_min_ps_128 : GCCBuiltin<"__builtin_ia32_minps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_min_pd_128 : GCCBuiltin<"__builtin_ia32_minpd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index bc6138c67ec..90b49328f22 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -205,6 +205,8 @@ namespace llvm { FSUB_RND, FMUL_RND, FDIV_RND, + FMAX_RND, + FMIN_RND, // Integer add/sub with unsigned saturation. 
ADDUS, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index bc8ab83fe31..4f9b467d3b2 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -180,21 +180,20 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, list<dag> Pattern, list<dag> MaskingPattern, list<dag> ZeroMaskingPattern, - string Round = "", string MaskingConstraint = "", InstrItinClass itin = NoItinerary, bit IsCommutable = 0> { let isCommutable = IsCommutable in def NAME: AVX512<O, F, Outs, Ins, - OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# - "$dst "#Round#", "#IntelSrcAsm#"}", + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst , "#IntelSrcAsm#"}", Pattern, itin>; // Prefer over VMOV*rrk Pat<> let AddedComplexity = 20 in def NAME#k: AVX512<O, F, Outs, MaskingIns, - OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"# - "$dst {${mask}}"#Round#", "#IntelSrcAsm#"}", + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", MaskingPattern, itin>, EVEX_K { // In case of the 3src subclass this is overridden with a let. 
@@ -202,8 +201,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, } let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, - OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"# - "$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}", + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, "#IntelSrcAsm#"}", ZeroMaskingPattern, itin>, EVEX_KZ; @@ -217,7 +216,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, - SDNode Select = vselect, string Round = "", + SDNode Select = vselect, string MaskingConstraint = "", InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : @@ -227,7 +226,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, [(set _.RC:$dst, MaskingRHS)], [(set _.RC:$dst, (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], - Round, MaskingConstraint, NoItinerary, IsCommutable>; + MaskingConstraint, NoItinerary, IsCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. 
In the masking case, the @@ -235,7 +234,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", + dag RHS, InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : AVX512_maskable_common<O, F, _, Outs, Ins, @@ -243,14 +242,14 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect, - Round, "$src0 = $dst", itin, IsCommutable>; + "$src0 = $dst", itin, IsCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the scalar instruction. multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", + dag RHS, InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : AVX512_maskable_common<O, F, _, Outs, Ins, @@ -258,7 +257,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select, - Round, "$src0 = $dst", itin, IsCommutable>; + "$src0 = $dst", itin, IsCommutable>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -284,7 +283,7 @@ multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, AVX512_maskable_custom<O, F, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), - OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "", + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "$src0 = $dst">; @@ -2963,7 +2962,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, 
SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), - "", itins.rr, IsCommutable>, + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in @@ -2972,7 +2971,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), - "", itins.rm>, + itins.rm>, AVX512BIBase, EVEX_4V; } @@ -2988,7 +2987,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), - "", itins.rm>, + itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; } @@ -3090,7 +3089,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), - "",itins.rr, IsCommutable>, + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), @@ -3098,7 +3097,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Src.LdFrag addr:$src2)))), - "", itins.rm>, + itins.rm>, AVX512BIBase, EVEX_4V; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), @@ -3109,7 +3108,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Dst.VT (X86VBroadcast (_Dst.ScalarLdFrag addr:$src2)))))), - "", itins.rm>, + itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; } } @@ -3165,8 +3164,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Src.VT (X86VBroadcast - (_Src.ScalarLdFrag addr:$src2)))))), - "">, + (_Src.ScalarLdFrag addr:$src2))))))>, EVEX_4V, EVEX_B, 
EVEX_CD8<_Src.EltSize, CD8VF>; } } @@ -3179,15 +3177,15 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, "$src2, $src1","$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), - (_Src.VT _Src.RC:$src2))), - "">, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; + (_Src.VT _Src.RC:$src2)))>, + EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; let mayLoad = 1 in { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), - (bitconvert (_Src.LdFrag addr:$src2)))), - "">, EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>; + (bitconvert (_Src.LdFrag addr:$src2))))>, + EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>; } } @@ -3390,7 +3388,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, "$src2, $src1", "$src1, $src2", (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 FROUND_CURRENT)), - "", itins.rr, IsCommutable>; + itins.rr, IsCommutable>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, @@ -3398,7 +3396,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (VecNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT)), - "", itins.rm, IsCommutable>; + itins.rm, IsCommutable>; let isCodeGenOnly = 1, isCommutable = IsCommutable, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), @@ -3421,7 +3419,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$rc)), "", itins.rr, IsCommutable>, + (i32 imm:$rc)), itins.rr, IsCommutable>, EVEX_B, EVEX_RC; } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, @@ -3429,9 +3427,9 @@ multiclass avx512_fp_scalar_sae<bits<8> 
opc, string OpcodeStr,X86VectorVTInfo _, defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, - "$src2, $src1", "$src1, $src2", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; + (i32 FROUND_NO_EXC))>, EVEX_B; } multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3500,6 +3498,16 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRn EVEX_4V, EVEX_B, EVEX_RC; } + +multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _, bit IsCommutable> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>, + EVEX_4V, EVEX_B; +} + multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, bit IsCommutable = 0> { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, @@ -3533,6 +3541,13 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } +multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, @@ -3541,33 +3556,17 @@ defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; -defm VMIN : 
avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>; -defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>, + avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>, + avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>; let Predicates = [HasDQI] in { defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>; defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>; defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>; defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>; } -def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), - (i16 -1), FROUND_CURRENT)), - (VMAXPSZrr VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_max_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)), - (i8 -1), FROUND_CURRENT)), - (VMAXPDZrr VR512:$src1, VR512:$src2)>; - -def : Pat<(v16f32 (int_x86_avx512_mask_min_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), - (i16 -1), FROUND_CURRENT)), - (VMINPSZrr VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)), - (i8 -1), FROUND_CURRENT)), - (VMINPDZrr VR512:$src1, VR512:$src2)>; + //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// @@ -3667,14 +3666,14 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, (ins _.RC:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; + SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; let mayLoad = 1 in defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins 
_.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i8 imm:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; } multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, @@ -3684,7 +3683,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B; + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B; } multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3694,12 +3693,12 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, VR128X:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; + SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, i128mem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V; } @@ -3798,13 +3797,13 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))), - " ", 
SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; } @@ -3817,7 +3816,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2))))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; } multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -4775,9 +4774,9 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, - "$src2, $src1", "$src1, $src2", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; + (i32 FROUND_NO_EXC))>, EVEX_B; defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, @@ -4809,9 +4808,8 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, - "$src", "$src", - (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), - "{sae}">, EVEX_B; + "{sae}, $src", "$src, {sae}", + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B; defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", @@ -5051,9 +5049,9 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, - "$src3, $src2, $src1", "$src1, $src2, $src3", + "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}", (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), (i32 FROUND_NO_EXC))), 
"{sae}">, EVEX_B; + (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; let mayLoad = 1 in defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 497bdf65315..d9eebc5ddfd 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -293,8 +293,8 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN", SDTFPBinOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 648769e7069..e4d82335ece 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -243,8 +243,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD, 
X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), @@ -322,8 +326,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV, X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, @@ -350,9 +358,28 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), - + X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + 
X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), @@ -512,8 +539,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::RNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index d4129e14c95..d99764e4007 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -434,15 +434,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8) ; fp min - max -define <16 x float> @test_vmaxps(<16 x float> %a0, <16 x float> %a1) { - ; CHECK: vmaxps - %res = call <16 x float> 
@llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, - <16 x float>, i16, i32) - define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) { ; CHECK: vmaxpd %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1, @@ -452,15 +443,6 @@ define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) { declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) -define <16 x float> @test_vminps(<16 x float> %a0, <16 x float> %a1) { - ; CHECK: vminps - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, - <16 x float>, i16, i32) - define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) { ; CHECK: vminpd %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1, @@ -2240,3 +2222,362 @@ define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i } declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae + ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae + ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 
x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae + ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae + ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_add_round_ps_current + ;CHECK: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae + ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae + ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> 
@llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae + ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae + ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_add_round_ps_current + ;CHECK: vaddps %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_add_round_ps_rn_sae + ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_add_round_ps_rd_sae + ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, 
i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_add_round_ps_ru_sae + ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_add_round_ps_rz_sae + ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_add_round_ps_current + ;CHECK: vaddps %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae + ;CHECK: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae + ;CHECK: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) + ret <16 x 
float> %res +} +define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae + ;CHECK: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae + ;CHECK: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_sub_round_ps_current + ;CHECK: vsubps %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_sub_round_ps_rn_sae + ;CHECK: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_sub_round_ps_rd_sae + ;CHECK: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> 
%a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_sub_round_ps_ru_sae + ;CHECK: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_sub_round_ps_rz_sae + ;CHECK: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_sub_round_ps_current + ;CHECK: vsubps %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae + ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae + ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae + ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = 
call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae + ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_div_round_ps_current + ;CHECK: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae + ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae + ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae + ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> 
@llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae + ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_div_round_ps_current + ;CHECK: vdivps %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_div_round_ps_rn_sae + ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_div_round_ps_rd_sae + ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_div_round_ps_ru_sae + ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) + ret <16 x float> 
%res +} + +define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_div_round_ps_rz_sae + ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_div_round_ps_current + ;CHECK: vdivps %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_min_round_ps_sae + ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_min_round_ps_current + ;CHECK: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_min_round_ps_sae + ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x 
float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_min_round_ps_current + ;CHECK: vminps %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_min_round_ps_sae + ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_min_round_ps_current + ;CHECK: vminps %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_max_round_ps_sae + ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_maskz_max_round_ps_current + ;CHECK: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> 
@test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_max_round_ps_sae + ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { + ;CHECK-LABEL: test_mm512_mask_max_round_ps_current + ;CHECK: vmaxps %zmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_max_round_ps_sae + ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { + ;CHECK-LABEL: test_mm512_max_round_ps_current + ;CHECK: vmaxps %zmm1, %zmm0, %zmm0 + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll index dcde9c4153e..827a56d76ae 100644 --- a/test/CodeGen/X86/avx512er-intrinsics.ll +++ b/test/CodeGen/X86/avx512er-intrinsics.ll @@ -1,14 +1,14 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) { - ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # 
encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0] + ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0] %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ret <16 x float> %res } define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) { ; CHECK: kmovw - ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8] + ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8] %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8) ret <16 x float> %res } @@ -27,7 +27,7 @@ define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) { } define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) { - ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0] + ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0] %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8) ret <16 x float> %res } @@ -36,61 +36,61 @@ define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) { declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) { - ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0] + ; CHECK: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0] %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ret <16 x float> %res } declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) { - ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0] + ; CHECK: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0] 
%res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_exp2_ps_512(<16 x float> %a0) { - ; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0] + ; CHECK: vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0] %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ret <16 x float> %res } declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_exp2_pd_512(<8 x double> %a0) { - ; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0] + ; CHECK: vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0] %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) { - ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] + ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] ret <4 x float> %res } declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone define <4 x float> @test_rcp28_ss(<4 x float> %a0) { - ; CHECK: vrcp28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0] + ; CHECK: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0] %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x 
float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] ret <4 x float> %res } declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) { - ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] + ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ; ret <4 x float> %res } define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) { - ; CHECK: vrsqrt28ss %xmm1, %xmm0, %xmm2 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1] + ; CHECK: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1] %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ; ret <4 x float> %res } define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) { - ; CHECK: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] + ; CHECK: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ; ret <2 x double> %res } diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 64ec8166d50..4a99a9699e3 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -2289,3 +2289,267 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) { ret i8 %res } declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8) + +define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + 
;CHECK-LABEL: test_mm512_maskz_add_ps_256 + ;CHECK: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_add_ps_256 + ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1} + %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_add_ps_256 + ;CHECK: vaddps %ymm1, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_add_ps_128 + ;CHECK: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_add_ps_128 + ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_add_ps_128 + ;CHECK: vaddps %xmm1, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) + 
ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_sub_ps_256 + ;CHECK: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_sub_ps_256 + ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1} + %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_sub_ps_256 + ;CHECK: vsubps %ymm1, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_sub_ps_128 + ;CHECK: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_sub_ps_128 + ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> 
%a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_sub_ps_128 + ;CHECK: vsubps %xmm1, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_mul_ps_256 + ;CHECK: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_mul_ps_256 + ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1} + %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_mul_ps_256 + ;CHECK: vmulps %ymm1, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_mul_ps_128 + ;CHECK: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_mul_ps_128 + ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1} + 
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_mul_ps_128 + ;CHECK: vmulps %xmm1, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_div_ps_256 + ;CHECK: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_div_ps_256 + ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1} + %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_div_ps_256 + ;CHECK: vdivps %ymm1, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_div_ps_128 + ;CHECK: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) + ret <4 
x float> %res +} + +define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_div_ps_128 + ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_div_ps_128 + ;CHECK: vdivps %xmm1, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_max_ps_256 + ;CHECK: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_max_ps_256 + ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1} + %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_max_ps_256 + ;CHECK: vmaxps %ymm1, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + 
;CHECK-LABEL: test_mm512_maskz_max_ps_128 + ;CHECK: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_max_ps_128 + ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_max_ps_128 + ;CHECK: vmaxps %xmm1, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_min_ps_256 + ;CHECK: vminps %ymm1, %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_min_ps_256 + ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1} + %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_min_ps_256 + ;CHECK: vminps %ymm1, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) + 
ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_maskz_min_ps_128 + ;CHECK: vminps %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { + ;CHECK-LABEL: test_mm512_mask_min_ps_128 + ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { + ;CHECK-LABEL: test_mm512_min_ps_128 + ;CHECK: vminps %xmm1, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
\ No newline at end of file diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s index 91107cc6fee..04b736348dd 100644 --- a/test/MC/X86/avx512-encodings.s +++ b/test/MC/X86/avx512-encodings.s @@ -6112,3 +6112,586 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2 // CHECK: encoding: [0x62,0xf1,0x74,0x50,0xc2,0x92,0xfc,0xfd,0xff,0xff,0x7b] vcmpps $0x7b, -516(%rdx){1to16}, %zmm17, %k2 +// CHECK: vaddpd {rn-sae}, %zmm6, %zmm27, %zmm8 +// CHECK: encoding: [0x62,0x71,0xa5,0x10,0x58,0xc6] + vaddpd {rn-sae}, %zmm6, %zmm27, %zmm8 + +// CHECK: vaddpd {ru-sae}, %zmm6, %zmm27, %zmm8 +// CHECK: encoding: [0x62,0x71,0xa5,0x50,0x58,0xc6] + vaddpd {ru-sae}, %zmm6, %zmm27, %zmm8 + +// CHECK: vaddpd {rd-sae}, %zmm6, %zmm27, %zmm8 +// CHECK: encoding: [0x62,0x71,0xa5,0x30,0x58,0xc6] + vaddpd {rd-sae}, %zmm6, %zmm27, %zmm8 + +// CHECK: vaddpd {rz-sae}, %zmm6, %zmm27, %zmm8 +// CHECK: encoding: [0x62,0x71,0xa5,0x70,0x58,0xc6] + vaddpd {rz-sae}, %zmm6, %zmm27, %zmm8 + +// CHECK: vaddps {rn-sae}, %zmm2, %zmm13, %zmm18 +// CHECK: encoding: [0x62,0xe1,0x14,0x18,0x58,0xd2] + vaddps {rn-sae}, %zmm2, %zmm13, %zmm18 + +// CHECK: vaddps {ru-sae}, %zmm2, %zmm13, %zmm18 +// CHECK: encoding: [0x62,0xe1,0x14,0x58,0x58,0xd2] + vaddps {ru-sae}, %zmm2, %zmm13, %zmm18 + +// CHECK: vaddps {rd-sae}, %zmm2, %zmm13, %zmm18 +// CHECK: encoding: [0x62,0xe1,0x14,0x38,0x58,0xd2] + vaddps {rd-sae}, %zmm2, %zmm13, %zmm18 + +// CHECK: vaddps {rz-sae}, %zmm2, %zmm13, %zmm18 +// CHECK: encoding: [0x62,0xe1,0x14,0x78,0x58,0xd2] + vaddps {rz-sae}, %zmm2, %zmm13, %zmm18 + +// CHECK: vaddsd %xmm8, %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xd1,0xf7,0x00,0x58,0xd8] + vaddsd %xmm8, %xmm17, %xmm3 + +// CHECK: vaddsd %xmm8, %xmm17, %xmm3 {%k3} +// CHECK: encoding: [0x62,0xd1,0xf7,0x03,0x58,0xd8] + vaddsd %xmm8, %xmm17, %xmm3 {%k3} + +// CHECK: vaddsd %xmm8, %xmm17, %xmm3 {%k3} {z} +// CHECK: encoding: [0x62,0xd1,0xf7,0x83,0x58,0xd8] + vaddsd %xmm8, %xmm17, %xmm3 {%k3} {z} + +// CHECK: vaddsd {rn-sae}, %xmm8, 
%xmm17, %xmm3 +// CHECK: encoding: [0x62,0xd1,0xf7,0x10,0x58,0xd8] + vaddsd {rn-sae}, %xmm8, %xmm17, %xmm3 + +// CHECK: vaddsd {ru-sae}, %xmm8, %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xd1,0xf7,0x50,0x58,0xd8] + vaddsd {ru-sae}, %xmm8, %xmm17, %xmm3 + +// CHECK: vaddsd {rd-sae}, %xmm8, %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xd1,0xf7,0x30,0x58,0xd8] + vaddsd {rd-sae}, %xmm8, %xmm17, %xmm3 + +// CHECK: vaddsd {rz-sae}, %xmm8, %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xd1,0xf7,0x70,0x58,0xd8] + vaddsd {rz-sae}, %xmm8, %xmm17, %xmm3 + +// CHECK: vaddsd (%rcx), %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x19] + vaddsd (%rcx), %xmm17, %xmm3 + +// CHECK: vaddsd 291(%rax,%r14,8), %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xb1,0xf7,0x00,0x58,0x9c,0xf0,0x23,0x01,0x00,0x00] + vaddsd 291(%rax,%r14,8), %xmm17, %xmm3 + +// CHECK: vaddsd 1016(%rdx), %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x5a,0x7f] + vaddsd 1016(%rdx), %xmm17, %xmm3 + +// CHECK: vaddsd 1024(%rdx), %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x9a,0x00,0x04,0x00,0x00] + vaddsd 1024(%rdx), %xmm17, %xmm3 + +// CHECK: vaddsd -1024(%rdx), %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x5a,0x80] + vaddsd -1024(%rdx), %xmm17, %xmm3 + +// CHECK: vaddsd -1032(%rdx), %xmm17, %xmm3 +// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x9a,0xf8,0xfb,0xff,0xff] + vaddsd -1032(%rdx), %xmm17, %xmm3 + +// CHECK: vaddss %xmm19, %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xb1,0x56,0x08,0x58,0xfb] + vaddss %xmm19, %xmm5, %xmm7 + +// CHECK: vaddss %xmm19, %xmm5, %xmm7 {%k2} +// CHECK: encoding: [0x62,0xb1,0x56,0x0a,0x58,0xfb] + vaddss %xmm19, %xmm5, %xmm7 {%k2} + +// CHECK: vaddss %xmm19, %xmm5, %xmm7 {%k2} {z} +// CHECK: encoding: [0x62,0xb1,0x56,0x8a,0x58,0xfb] + vaddss %xmm19, %xmm5, %xmm7 {%k2} {z} + +// CHECK: vaddss {rn-sae}, %xmm19, %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xb1,0x56,0x18,0x58,0xfb] + vaddss {rn-sae}, %xmm19, %xmm5, %xmm7 + +// CHECK: vaddss 
{ru-sae}, %xmm19, %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xb1,0x56,0x58,0x58,0xfb] + vaddss {ru-sae}, %xmm19, %xmm5, %xmm7 + +// CHECK: vaddss {rd-sae}, %xmm19, %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xb1,0x56,0x38,0x58,0xfb] + vaddss {rd-sae}, %xmm19, %xmm5, %xmm7 + +// CHECK: vaddss {rz-sae}, %xmm19, %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xb1,0x56,0x78,0x58,0xfb] + vaddss {rz-sae}, %xmm19, %xmm5, %xmm7 + +// CHECK: vaddss (%rcx), %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x39] + vaddss (%rcx), %xmm5, %xmm7 + +// CHECK: vaddss 291(%rax,%r14,8), %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xb1,0x56,0x08,0x58,0xbc,0xf0,0x23,0x01,0x00,0x00] + vaddss 291(%rax,%r14,8), %xmm5, %xmm7 + +// CHECK: vaddss 508(%rdx), %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x7a,0x7f] + vaddss 508(%rdx), %xmm5, %xmm7 + +// CHECK: vaddss 512(%rdx), %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0xba,0x00,0x02,0x00,0x00] + vaddss 512(%rdx), %xmm5, %xmm7 + +// CHECK: vaddss -512(%rdx), %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x7a,0x80] + vaddss -512(%rdx), %xmm5, %xmm7 + +// CHECK: vaddss -516(%rdx), %xmm5, %xmm7 +// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0xba,0xfc,0xfd,0xff,0xff] + vaddss -516(%rdx), %xmm5, %xmm7 + +// CHECK: vdivpd {rn-sae}, %zmm11, %zmm6, %zmm18 +// CHECK: encoding: [0x62,0xc1,0xcd,0x18,0x5e,0xd3] + vdivpd {rn-sae}, %zmm11, %zmm6, %zmm18 + +// CHECK: vdivpd {ru-sae}, %zmm11, %zmm6, %zmm18 +// CHECK: encoding: [0x62,0xc1,0xcd,0x58,0x5e,0xd3] + vdivpd {ru-sae}, %zmm11, %zmm6, %zmm18 + +// CHECK: vdivpd {rd-sae}, %zmm11, %zmm6, %zmm18 +// CHECK: encoding: [0x62,0xc1,0xcd,0x38,0x5e,0xd3] + vdivpd {rd-sae}, %zmm11, %zmm6, %zmm18 + +// CHECK: vdivpd {rz-sae}, %zmm11, %zmm6, %zmm18 +// CHECK: encoding: [0x62,0xc1,0xcd,0x78,0x5e,0xd3] + vdivpd {rz-sae}, %zmm11, %zmm6, %zmm18 + +// CHECK: vdivps {rn-sae}, %zmm28, %zmm23, %zmm23 +// CHECK: encoding: [0x62,0x81,0x44,0x10,0x5e,0xfc] + vdivps {rn-sae}, %zmm28, 
%zmm23, %zmm23 + +// CHECK: vdivps {ru-sae}, %zmm28, %zmm23, %zmm23 +// CHECK: encoding: [0x62,0x81,0x44,0x50,0x5e,0xfc] + vdivps {ru-sae}, %zmm28, %zmm23, %zmm23 + +// CHECK: vdivps {rd-sae}, %zmm28, %zmm23, %zmm23 +// CHECK: encoding: [0x62,0x81,0x44,0x30,0x5e,0xfc] + vdivps {rd-sae}, %zmm28, %zmm23, %zmm23 + +// CHECK: vdivps {rz-sae}, %zmm28, %zmm23, %zmm23 +// CHECK: encoding: [0x62,0x81,0x44,0x70,0x5e,0xfc] + vdivps {rz-sae}, %zmm28, %zmm23, %zmm23 + +// CHECK: vdivsd %xmm22, %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x21,0x97,0x08,0x5e,0xee] + vdivsd %xmm22, %xmm13, %xmm29 + +// CHECK: vdivsd %xmm22, %xmm13, %xmm29 {%k3} +// CHECK: encoding: [0x62,0x21,0x97,0x0b,0x5e,0xee] + vdivsd %xmm22, %xmm13, %xmm29 {%k3} + +// CHECK: vdivsd %xmm22, %xmm13, %xmm29 {%k3} {z} +// CHECK: encoding: [0x62,0x21,0x97,0x8b,0x5e,0xee] + vdivsd %xmm22, %xmm13, %xmm29 {%k3} {z} + +// CHECK: vdivsd {rn-sae}, %xmm22, %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x21,0x97,0x18,0x5e,0xee] + vdivsd {rn-sae}, %xmm22, %xmm13, %xmm29 + +// CHECK: vdivsd {ru-sae}, %xmm22, %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x21,0x97,0x58,0x5e,0xee] + vdivsd {ru-sae}, %xmm22, %xmm13, %xmm29 + +// CHECK: vdivsd {rd-sae}, %xmm22, %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x21,0x97,0x38,0x5e,0xee] + vdivsd {rd-sae}, %xmm22, %xmm13, %xmm29 + +// CHECK: vdivsd {rz-sae}, %xmm22, %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x21,0x97,0x78,0x5e,0xee] + vdivsd {rz-sae}, %xmm22, %xmm13, %xmm29 + +// CHECK: vdivsd (%rcx), %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x29] + vdivsd (%rcx), %xmm13, %xmm29 + +// CHECK: vdivsd 291(%rax,%r14,8), %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x21,0x97,0x08,0x5e,0xac,0xf0,0x23,0x01,0x00,0x00] + vdivsd 291(%rax,%r14,8), %xmm13, %xmm29 + +// CHECK: vdivsd 1016(%rdx), %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x6a,0x7f] + vdivsd 1016(%rdx), %xmm13, %xmm29 + +// CHECK: vdivsd 1024(%rdx), %xmm13, %xmm29 +// CHECK: encoding: 
[0x62,0x61,0x97,0x08,0x5e,0xaa,0x00,0x04,0x00,0x00] + vdivsd 1024(%rdx), %xmm13, %xmm29 + +// CHECK: vdivsd -1024(%rdx), %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x6a,0x80] + vdivsd -1024(%rdx), %xmm13, %xmm29 + +// CHECK: vdivsd -1032(%rdx), %xmm13, %xmm29 +// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0xaa,0xf8,0xfb,0xff,0xff] + vdivsd -1032(%rdx), %xmm13, %xmm29 + +// CHECK: vdivss %xmm17, %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xa1,0x4e,0x08,0x5e,0xe9] + vdivss %xmm17, %xmm6, %xmm21 + +// CHECK: vdivss %xmm17, %xmm6, %xmm21 {%k5} +// CHECK: encoding: [0x62,0xa1,0x4e,0x0d,0x5e,0xe9] + vdivss %xmm17, %xmm6, %xmm21 {%k5} + +// CHECK: vdivss %xmm17, %xmm6, %xmm21 {%k5} {z} +// CHECK: encoding: [0x62,0xa1,0x4e,0x8d,0x5e,0xe9] + vdivss %xmm17, %xmm6, %xmm21 {%k5} {z} + +// CHECK: vdivss {rn-sae}, %xmm17, %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xa1,0x4e,0x18,0x5e,0xe9] + vdivss {rn-sae}, %xmm17, %xmm6, %xmm21 + +// CHECK: vdivss {ru-sae}, %xmm17, %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xa1,0x4e,0x58,0x5e,0xe9] + vdivss {ru-sae}, %xmm17, %xmm6, %xmm21 + +// CHECK: vdivss {rd-sae}, %xmm17, %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xa1,0x4e,0x38,0x5e,0xe9] + vdivss {rd-sae}, %xmm17, %xmm6, %xmm21 + +// CHECK: vdivss {rz-sae}, %xmm17, %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xa1,0x4e,0x78,0x5e,0xe9] + vdivss {rz-sae}, %xmm17, %xmm6, %xmm21 + +// CHECK: vdivss (%rcx), %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x29] + vdivss (%rcx), %xmm6, %xmm21 + +// CHECK: vdivss 291(%rax,%r14,8), %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xa1,0x4e,0x08,0x5e,0xac,0xf0,0x23,0x01,0x00,0x00] + vdivss 291(%rax,%r14,8), %xmm6, %xmm21 + +// CHECK: vdivss 508(%rdx), %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x6a,0x7f] + vdivss 508(%rdx), %xmm6, %xmm21 + +// CHECK: vdivss 512(%rdx), %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0xaa,0x00,0x02,0x00,0x00] + vdivss 512(%rdx), %xmm6, %xmm21 + +// CHECK: vdivss 
-512(%rdx), %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x6a,0x80] + vdivss -512(%rdx), %xmm6, %xmm21 + +// CHECK: vdivss -516(%rdx), %xmm6, %xmm21 +// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0xaa,0xfc,0xfd,0xff,0xff] + vdivss -516(%rdx), %xmm6, %xmm21 + +// CHECK: vmaxpd {sae}, %zmm20, %zmm28, %zmm30 +// CHECK: encoding: [0x62,0x21,0x9d,0x10,0x5f,0xf4] + vmaxpd {sae}, %zmm20, %zmm28, %zmm30 + +// CHECK: vmaxps {sae}, %zmm20, %zmm6, %zmm25 +// CHECK: encoding: [0x62,0x21,0x4c,0x18,0x5f,0xcc] + vmaxps {sae}, %zmm20, %zmm6, %zmm25 + +// CHECK: vmaxsd %xmm25, %xmm19, %xmm20 +// CHECK: encoding: [0x62,0x81,0xe7,0x00,0x5f,0xe1] + vmaxsd %xmm25, %xmm19, %xmm20 + +// CHECK: vmaxsd %xmm25, %xmm19, %xmm20 {%k3} +// CHECK: encoding: [0x62,0x81,0xe7,0x03,0x5f,0xe1] + vmaxsd %xmm25, %xmm19, %xmm20 {%k3} + +// CHECK: vmaxsd %xmm25, %xmm19, %xmm20 {%k3} {z} +// CHECK: encoding: [0x62,0x81,0xe7,0x83,0x5f,0xe1] + vmaxsd %xmm25, %xmm19, %xmm20 {%k3} {z} + +// CHECK: vmaxsd {sae}, %xmm25, %xmm19, %xmm20 +// CHECK: encoding: [0x62,0x81,0xe7,0x10,0x5f,0xe1] + vmaxsd {sae}, %xmm25, %xmm19, %xmm20 + +// CHECK: vmaxsd (%rcx), %xmm19, %xmm20 +// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x21] + vmaxsd (%rcx), %xmm19, %xmm20 + +// CHECK: vmaxsd 291(%rax,%r14,8), %xmm19, %xmm20 +// CHECK: encoding: [0x62,0xa1,0xe7,0x00,0x5f,0xa4,0xf0,0x23,0x01,0x00,0x00] + vmaxsd 291(%rax,%r14,8), %xmm19, %xmm20 + +// CHECK: vmaxsd 1016(%rdx), %xmm19, %xmm20 +// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x62,0x7f] + vmaxsd 1016(%rdx), %xmm19, %xmm20 + +// CHECK: vmaxsd 1024(%rdx), %xmm19, %xmm20 +// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0xa2,0x00,0x04,0x00,0x00] + vmaxsd 1024(%rdx), %xmm19, %xmm20 + +// CHECK: vmaxsd -1024(%rdx), %xmm19, %xmm20 +// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x62,0x80] + vmaxsd -1024(%rdx), %xmm19, %xmm20 + +// CHECK: vmaxsd -1032(%rdx), %xmm19, %xmm20 +// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0xa2,0xf8,0xfb,0xff,0xff] + vmaxsd -1032(%rdx), 
%xmm19, %xmm20 + +// CHECK: vmaxss %xmm6, %xmm4, %xmm8 +// CHECK: encoding: [0xc5,0x5a,0x5f,0xc6] + vmaxss %xmm6, %xmm4, %xmm8 + +// CHECK: vmaxss %xmm6, %xmm4, %xmm8 {%k4} +// CHECK: encoding: [0x62,0x71,0x5e,0x0c,0x5f,0xc6] + vmaxss %xmm6, %xmm4, %xmm8 {%k4} + +// CHECK: vmaxss %xmm6, %xmm4, %xmm8 {%k4} {z} +// CHECK: encoding: [0x62,0x71,0x5e,0x8c,0x5f,0xc6] + vmaxss %xmm6, %xmm4, %xmm8 {%k4} {z} + +// CHECK: vmaxss {sae}, %xmm6, %xmm4, %xmm8 +// CHECK: encoding: [0x62,0x71,0x5e,0x18,0x5f,0xc6] + vmaxss {sae}, %xmm6, %xmm4, %xmm8 + +// CHECK: vmaxss (%rcx), %xmm4, %xmm8 +// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x01] + vmaxss (%rcx), %xmm4, %xmm8 + +// CHECK: vmaxss 291(%rax,%r14,8), %xmm4, %xmm8 +// CHECK: encoding: [0x62,0x31,0x5e,0x08,0x5f,0x84,0xf0,0x23,0x01,0x00,0x00] + vmaxss 291(%rax,%r14,8), %xmm4, %xmm8 + +// CHECK: vmaxss 508(%rdx), %xmm4, %xmm8 +// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x42,0x7f] + vmaxss 508(%rdx), %xmm4, %xmm8 + +// CHECK: vmaxss 512(%rdx), %xmm4, %xmm8 +// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x82,0x00,0x02,0x00,0x00] + vmaxss 512(%rdx), %xmm4, %xmm8 + +// CHECK: vmaxss -512(%rdx), %xmm4, %xmm8 +// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x42,0x80] + vmaxss -512(%rdx), %xmm4, %xmm8 + +// CHECK: vmaxss -516(%rdx), %xmm4, %xmm8 +// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x82,0xfc,0xfd,0xff,0xff] + vmaxss -516(%rdx), %xmm4, %xmm8 + +// CHECK: vminpd {sae}, %zmm22, %zmm6, %zmm6 +// CHECK: encoding: [0x62,0xb1,0xcd,0x18,0x5d,0xf6] + vminpd {sae}, %zmm22, %zmm6, %zmm6 + +// CHECK: vminps {sae}, %zmm7, %zmm3, %zmm3 +// CHECK: encoding: [0x62,0xf1,0x64,0x18,0x5d,0xdf] + vminps {sae}, %zmm7, %zmm3, %zmm3 + +// CHECK: vminsd %xmm26, %xmm25, %xmm5 +// CHECK: encoding: [0x62,0x91,0xb7,0x00,0x5d,0xea] + vminsd %xmm26, %xmm25, %xmm5 + +// CHECK: vminsd %xmm26, %xmm25, %xmm5 {%k3} +// CHECK: encoding: [0x62,0x91,0xb7,0x03,0x5d,0xea] + vminsd %xmm26, %xmm25, %xmm5 {%k3} + +// CHECK: vminsd %xmm26, %xmm25, %xmm5 {%k3} {z} +// 
CHECK: encoding: [0x62,0x91,0xb7,0x83,0x5d,0xea] + vminsd %xmm26, %xmm25, %xmm5 {%k3} {z} + +// CHECK: vminsd {sae}, %xmm26, %xmm25, %xmm5 +// CHECK: encoding: [0x62,0x91,0xb7,0x10,0x5d,0xea] + vminsd {sae}, %xmm26, %xmm25, %xmm5 + +// CHECK: vminsd (%rcx), %xmm25, %xmm5 +// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x29] + vminsd (%rcx), %xmm25, %xmm5 + +// CHECK: vminsd 291(%rax,%r14,8), %xmm25, %xmm5 +// CHECK: encoding: [0x62,0xb1,0xb7,0x00,0x5d,0xac,0xf0,0x23,0x01,0x00,0x00] + vminsd 291(%rax,%r14,8), %xmm25, %xmm5 + +// CHECK: vminsd 1016(%rdx), %xmm25, %xmm5 +// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x6a,0x7f] + vminsd 1016(%rdx), %xmm25, %xmm5 + +// CHECK: vminsd 1024(%rdx), %xmm25, %xmm5 +// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0xaa,0x00,0x04,0x00,0x00] + vminsd 1024(%rdx), %xmm25, %xmm5 + +// CHECK: vminsd -1024(%rdx), %xmm25, %xmm5 +// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x6a,0x80] + vminsd -1024(%rdx), %xmm25, %xmm5 + +// CHECK: vminsd -1032(%rdx), %xmm25, %xmm5 +// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0xaa,0xf8,0xfb,0xff,0xff] + vminsd -1032(%rdx), %xmm25, %xmm5 + +// CHECK: vminss %xmm19, %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x31,0x76,0x00,0x5d,0xd3] + vminss %xmm19, %xmm17, %xmm10 + +// CHECK: vminss %xmm19, %xmm17, %xmm10 {%k5} +// CHECK: encoding: [0x62,0x31,0x76,0x05,0x5d,0xd3] + vminss %xmm19, %xmm17, %xmm10 {%k5} + +// CHECK: vminss %xmm19, %xmm17, %xmm10 {%k5} {z} +// CHECK: encoding: [0x62,0x31,0x76,0x85,0x5d,0xd3] + vminss %xmm19, %xmm17, %xmm10 {%k5} {z} + +// CHECK: vminss {sae}, %xmm19, %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x31,0x76,0x10,0x5d,0xd3] + vminss {sae}, %xmm19, %xmm17, %xmm10 + +// CHECK: vminss (%rcx), %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x11] + vminss (%rcx), %xmm17, %xmm10 + +// CHECK: vminss 291(%rax,%r14,8), %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x31,0x76,0x00,0x5d,0x94,0xf0,0x23,0x01,0x00,0x00] + vminss 291(%rax,%r14,8), %xmm17, %xmm10 + +// CHECK: vminss 
508(%rdx), %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x52,0x7f] + vminss 508(%rdx), %xmm17, %xmm10 + +// CHECK: vminss 512(%rdx), %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x92,0x00,0x02,0x00,0x00] + vminss 512(%rdx), %xmm17, %xmm10 + +// CHECK: vminss -512(%rdx), %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x52,0x80] + vminss -512(%rdx), %xmm17, %xmm10 + +// CHECK: vminss -516(%rdx), %xmm17, %xmm10 +// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x92,0xfc,0xfd,0xff,0xff] + vminss -516(%rdx), %xmm17, %xmm10 + +// CHECK: vmulpd {rn-sae}, %zmm23, %zmm4, %zmm24 +// CHECK: encoding: [0x62,0x21,0xdd,0x18,0x59,0xc7] + vmulpd {rn-sae}, %zmm23, %zmm4, %zmm24 + +// CHECK: vmulpd {ru-sae}, %zmm23, %zmm4, %zmm24 +// CHECK: encoding: [0x62,0x21,0xdd,0x58,0x59,0xc7] + vmulpd {ru-sae}, %zmm23, %zmm4, %zmm24 + +// CHECK: vmulpd {rd-sae}, %zmm23, %zmm4, %zmm24 +// CHECK: encoding: [0x62,0x21,0xdd,0x38,0x59,0xc7] + vmulpd {rd-sae}, %zmm23, %zmm4, %zmm24 + +// CHECK: vmulpd {rz-sae}, %zmm23, %zmm4, %zmm24 +// CHECK: encoding: [0x62,0x21,0xdd,0x78,0x59,0xc7] + vmulpd {rz-sae}, %zmm23, %zmm4, %zmm24 + +// CHECK: vmulps {rn-sae}, %zmm24, %zmm6, %zmm3 +// CHECK: encoding: [0x62,0x91,0x4c,0x18,0x59,0xd8] + vmulps {rn-sae}, %zmm24, %zmm6, %zmm3 + +// CHECK: vmulps {ru-sae}, %zmm24, %zmm6, %zmm3 +// CHECK: encoding: [0x62,0x91,0x4c,0x58,0x59,0xd8] + vmulps {ru-sae}, %zmm24, %zmm6, %zmm3 + +// CHECK: vmulps {rd-sae}, %zmm24, %zmm6, %zmm3 +// CHECK: encoding: [0x62,0x91,0x4c,0x38,0x59,0xd8] + vmulps {rd-sae}, %zmm24, %zmm6, %zmm3 + +// CHECK: vmulps {rz-sae}, %zmm24, %zmm6, %zmm3 +// CHECK: encoding: [0x62,0x91,0x4c,0x78,0x59,0xd8] + vmulps {rz-sae}, %zmm24, %zmm6, %zmm3 + +// CHECK: vmulsd %xmm18, %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x31,0xdf,0x08,0x59,0xea] + vmulsd %xmm18, %xmm4, %xmm13 + +// CHECK: vmulsd %xmm18, %xmm4, %xmm13 {%k2} +// CHECK: encoding: [0x62,0x31,0xdf,0x0a,0x59,0xea] + vmulsd %xmm18, %xmm4, %xmm13 {%k2} + +// 
CHECK: vmulsd %xmm18, %xmm4, %xmm13 {%k2} {z} +// CHECK: encoding: [0x62,0x31,0xdf,0x8a,0x59,0xea] + vmulsd %xmm18, %xmm4, %xmm13 {%k2} {z} + +// CHECK: vmulsd {rn-sae}, %xmm18, %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x31,0xdf,0x18,0x59,0xea] + vmulsd {rn-sae}, %xmm18, %xmm4, %xmm13 + +// CHECK: vmulsd {ru-sae}, %xmm18, %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x31,0xdf,0x58,0x59,0xea] + vmulsd {ru-sae}, %xmm18, %xmm4, %xmm13 + +// CHECK: vmulsd {rd-sae}, %xmm18, %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x31,0xdf,0x38,0x59,0xea] + vmulsd {rd-sae}, %xmm18, %xmm4, %xmm13 + +// CHECK: vmulsd {rz-sae}, %xmm18, %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x31,0xdf,0x78,0x59,0xea] + vmulsd {rz-sae}, %xmm18, %xmm4, %xmm13 + +// CHECK: vmulsd (%rcx), %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x29] + vmulsd (%rcx), %xmm4, %xmm13 + +// CHECK: vmulsd 291(%rax,%r14,8), %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x31,0xdf,0x08,0x59,0xac,0xf0,0x23,0x01,0x00,0x00] + vmulsd 291(%rax,%r14,8), %xmm4, %xmm13 + +// CHECK: vmulsd 1016(%rdx), %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x6a,0x7f] + vmulsd 1016(%rdx), %xmm4, %xmm13 + +// CHECK: vmulsd 1024(%rdx), %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0xaa,0x00,0x04,0x00,0x00] + vmulsd 1024(%rdx), %xmm4, %xmm13 + +// CHECK: vmulsd -1024(%rdx), %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x6a,0x80] + vmulsd -1024(%rdx), %xmm4, %xmm13 + +// CHECK: vmulsd -1032(%rdx), %xmm4, %xmm13 +// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0xaa,0xf8,0xfb,0xff,0xff] + vmulsd -1032(%rdx), %xmm4, %xmm13 + +// CHECK: vmulss %xmm14, %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x2e,0x08,0x59,0xf6] + vmulss %xmm14, %xmm10, %xmm22 + +// CHECK: vmulss %xmm14, %xmm10, %xmm22 {%k4} +// CHECK: encoding: [0x62,0xc1,0x2e,0x0c,0x59,0xf6] + vmulss %xmm14, %xmm10, %xmm22 {%k4} + +// CHECK: vmulss %xmm14, %xmm10, %xmm22 {%k4} {z} +// CHECK: encoding: [0x62,0xc1,0x2e,0x8c,0x59,0xf6] + vmulss 
%xmm14, %xmm10, %xmm22 {%k4} {z} + +// CHECK: vmulss {rn-sae}, %xmm14, %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x2e,0x18,0x59,0xf6] + vmulss {rn-sae}, %xmm14, %xmm10, %xmm22 + +// CHECK: vmulss {ru-sae}, %xmm14, %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x2e,0x58,0x59,0xf6] + vmulss {ru-sae}, %xmm14, %xmm10, %xmm22 + +// CHECK: vmulss {rd-sae}, %xmm14, %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x2e,0x38,0x59,0xf6] + vmulss {rd-sae}, %xmm14, %xmm10, %xmm22 + +// CHECK: vmulss {rz-sae}, %xmm14, %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x2e,0x78,0x59,0xf6] + vmulss {rz-sae}, %xmm14, %xmm10, %xmm22 + +// CHECK: vmulss (%rcx), %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x31] + vmulss (%rcx), %xmm10, %xmm22 + +// CHECK: vmulss 291(%rax,%r14,8), %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xa1,0x2e,0x08,0x59,0xb4,0xf0,0x23,0x01,0x00,0x00] + vmulss 291(%rax,%r14,8), %xmm10, %xmm22 + +// CHECK: vmulss 508(%rdx), %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x72,0x7f] + vmulss 508(%rdx), %xmm10, %xmm22 + +// CHECK: vmulss 512(%rdx), %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0xb2,0x00,0x02,0x00,0x00] + vmulss 512(%rdx), %xmm10, %xmm22 + +// CHECK: vmulss -512(%rdx), %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x72,0x80] + vmulss -512(%rdx), %xmm10, %xmm22 + +// CHECK: vmulss -516(%rdx), %xmm10, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0xb2,0xfc,0xfd,0xff,0xff] + vmulss -516(%rdx), %xmm10, %xmm22 |