summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/llvm/IR/IntrinsicsX86.td72
-rw-r--r--lib/Target/X86/X86ISelLowering.h2
-rw-r--r--lib/Target/X86/X86InstrAVX512.td132
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td4
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h33
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll377
-rw-r--r--test/CodeGen/X86/avx512er-intrinsics.ll24
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll264
-rw-r--r--test/MC/X86/avx512-encodings.s583
9 files changed, 1391 insertions, 100 deletions
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 43aa89871e8..998fa7a6ce3 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3205,39 +3205,111 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Arithmetic ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_avx512_mask_add_ps_128 : GCCBuiltin<"__builtin_ia32_addps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_add_ps_256 : GCCBuiltin<"__builtin_ia32_addps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_add_pd_128 : GCCBuiltin<"__builtin_ia32_addpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_add_pd_256 : GCCBuiltin<"__builtin_ia32_addpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_ps_128 : GCCBuiltin<"__builtin_ia32_subps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_ps_256 : GCCBuiltin<"__builtin_ia32_subps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_pd_128 : GCCBuiltin<"__builtin_ia32_subpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_pd_256 : GCCBuiltin<"__builtin_ia32_subpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_ps_128 : GCCBuiltin<"__builtin_ia32_mulps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_ps_256 : GCCBuiltin<"__builtin_ia32_mulps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_pd_128 : GCCBuiltin<"__builtin_ia32_mulpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_pd_256 : GCCBuiltin<"__builtin_ia32_mulpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_ps_128 : GCCBuiltin<"__builtin_ia32_divps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_ps_256 : GCCBuiltin<"__builtin_ia32_divps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_pd_128 : GCCBuiltin<"__builtin_ia32_divpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_pd_256 : GCCBuiltin<"__builtin_ia32_divpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_ps_128 : GCCBuiltin<"__builtin_ia32_maxps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_pd_128 : GCCBuiltin<"__builtin_ia32_maxpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_ps_128 : GCCBuiltin<"__builtin_ia32_minps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_pd_128 : GCCBuiltin<"__builtin_ia32_minpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index bc6138c67ec..90b49328f22 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -205,6 +205,8 @@ namespace llvm {
FSUB_RND,
FMUL_RND,
FDIV_RND,
+ FMAX_RND,
+ FMIN_RND,
// Integer add/sub with unsigned saturation.
ADDUS,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index bc8ab83fe31..4f9b467d3b2 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -180,21 +180,20 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
list<dag> Pattern,
list<dag> MaskingPattern,
list<dag> ZeroMaskingPattern,
- string Round = "",
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
- OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"#
- "$dst "#Round#", "#IntelSrcAsm#"}",
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+ "$dst , "#IntelSrcAsm#"}",
Pattern, itin>;
// Prefer over VMOV*rrk Pat<>
let AddedComplexity = 20 in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
- OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"#
- "$dst {${mask}}"#Round#", "#IntelSrcAsm#"}",
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#"}",
MaskingPattern, itin>,
EVEX_K {
// In case of the 3src subclass this is overridden with a let.
@@ -202,8 +201,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
}
let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
- OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"#
- "$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}",
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
ZeroMaskingPattern,
itin>,
EVEX_KZ;
@@ -217,7 +216,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- SDNode Select = vselect, string Round = "",
+ SDNode Select = vselect,
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :
@@ -227,7 +226,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
- Round, MaskingConstraint, NoItinerary, IsCommutable>;
+ MaskingConstraint, NoItinerary, IsCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -235,7 +234,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, string Round = "",
+ dag RHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs, Ins,
@@ -243,14 +242,14 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect,
- Round, "$src0 = $dst", itin, IsCommutable>;
+ "$src0 = $dst", itin, IsCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, string Round = "",
+ dag RHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs, Ins,
@@ -258,7 +257,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select,
- Round, "$src0 = $dst", itin, IsCommutable>;
+ "$src0 = $dst", itin, IsCommutable>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -284,7 +283,7 @@ multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
- OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "",
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
"$src0 = $dst">;
@@ -2963,7 +2962,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- "", itins.rr, IsCommutable>,
+ itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
let mayLoad = 1 in
@@ -2972,7 +2971,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V;
}
@@ -2988,7 +2987,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src1,
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
@@ -3090,7 +3089,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- "",itins.rr, IsCommutable>,
+ itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
let mayLoad = 1 in {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
@@ -3098,7 +3097,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(bitconvert (_Src.LdFrag addr:$src2)))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
@@ -3109,7 +3108,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Dst.VT (X86VBroadcast
(_Dst.ScalarLdFrag addr:$src2)))))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
}
@@ -3165,8 +3164,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2)))))),
- "">,
+ (_Src.ScalarLdFrag addr:$src2))))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
}
}
@@ -3179,15 +3177,15 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
"$src2, $src1","$src1, $src2",
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
- (_Src.VT _Src.RC:$src2))),
- "">, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
let mayLoad = 1 in {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2)))),
- "">, EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
}
}
@@ -3390,7 +3388,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
"$src2, $src1", "$src1, $src2",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_CURRENT)),
- "", itins.rr, IsCommutable>;
+ itins.rr, IsCommutable>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
@@ -3398,7 +3396,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(VecNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT)),
- "", itins.rm, IsCommutable>;
+ itins.rm, IsCommutable>;
let isCodeGenOnly = 1, isCommutable = IsCommutable,
Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -3421,7 +3419,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$rc)), "", itins.rr, IsCommutable>,
+ (i32 imm:$rc)), itins.rr, IsCommutable>,
EVEX_B, EVEX_RC;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -3429,9 +3427,9 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
+ (i32 FROUND_NO_EXC))>, EVEX_B;
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3500,6 +3498,16 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRn
EVEX_4V, EVEX_B, EVEX_RC;
}
+
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86VectorVTInfo _, bit IsCommutable> {
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, EVEX_B;
+}
+
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable = 0> {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
@@ -3533,6 +3541,13 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
+multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>,
@@ -3541,33 +3556,17 @@ defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
-defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>;
-defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>;
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>,
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>,
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
let Predicates = [HasDQI] in {
defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>;
defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>;
defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>;
defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>;
}
-def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1),
- (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
- (i16 -1), FROUND_CURRENT)),
- (VMAXPSZrr VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_max_pd_512 (v8f64 VR512:$src1),
- (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
- (i8 -1), FROUND_CURRENT)),
- (VMAXPDZrr VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v16f32 (int_x86_avx512_mask_min_ps_512 (v16f32 VR512:$src1),
- (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
- (i16 -1), FROUND_CURRENT)),
- (VMINPSZrr VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1),
- (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
- (i8 -1), FROUND_CURRENT)),
- (VMINPDZrr VR512:$src1, VR512:$src2)>;
+
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
@@ -3667,14 +3666,14 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V;
let mayLoad = 1 in
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i8 imm:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V;
}
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
@@ -3684,7 +3683,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B;
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B;
}
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3694,12 +3693,12 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
EVEX_4V;
}
@@ -3798,13 +3797,13 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
let mayLoad = 1 in
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
EVEX_CD8<_.EltSize, CD8VF>;
}
@@ -3817,7 +3816,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4775,9 +4774,9 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
+ (i32 FROUND_NO_EXC))>, EVEX_B;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
@@ -4809,9 +4808,8 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
- "$src", "$src",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)),
- "{sae}">, EVEX_B;
+ "{sae}, $src", "$src, {sae}",
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
@@ -5051,9 +5049,9 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
- "$src3, $src2, $src1", "$src1, $src2, $src3",
+ "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}",
(_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_NO_EXC))), "{sae}">, EVEX_B;
+ (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
let mayLoad = 1 in
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 497bdf65315..d9eebc5ddfd 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -293,8 +293,8 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN", SDTFPBinOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 648769e7069..e4d82335ece 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -243,8 +243,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
@@ -322,8 +326,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
@@ -350,9 +358,28 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
-
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
@@ -512,8 +539,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::RNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::RNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index d4129e14c95..d99764e4007 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -434,15 +434,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
; fp min - max
-define <16 x float> @test_vmaxps(<16 x float> %a0, <16 x float> %a1) {
- ; CHECK: vmaxps
- %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float>zeroinitializer, i16 -1, i32 4)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>,
- <16 x float>, i16, i32)
-
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK: vmaxpd
%res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -452,15 +443,6 @@ define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
<8 x double>, i8, i32)
-define <16 x float> @test_vminps(<16 x float> %a0, <16 x float> %a1) {
- ; CHECK: vminps
- %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float>zeroinitializer, i16 -1, i32 4)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>,
- <16 x float>, i16, i32)
-
define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK: vminpd
%res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2240,3 +2222,362 @@ define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i
}
declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae
+ ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae
+ ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae
+ ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae
+ ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_current
+ ;CHECK: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae
+ ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae
+ ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae
+ ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae
+ ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_current
+ ;CHECK: vaddps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_rn_sae
+ ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_rd_sae
+ ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_ru_sae
+ ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_rz_sae
+ ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_current
+ ;CHECK: vaddps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae
+ ;CHECK: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae
+ ;CHECK: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae
+ ;CHECK: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae
+ ;CHECK: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_current
+ ;CHECK: vsubps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_rn_sae
+ ;CHECK: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_rd_sae
+ ;CHECK: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_ru_sae
+ ;CHECK: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_rz_sae
+ ;CHECK: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_current
+ ;CHECK: vsubps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae
+ ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae
+ ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae
+ ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae
+ ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_current
+ ;CHECK: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae
+ ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae
+ ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae
+ ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae
+ ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_current
+ ;CHECK: vdivps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_rn_sae
+ ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_rd_sae
+ ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_ru_sae
+ ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_rz_sae
+ ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_current
+ ;CHECK: vdivps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_round_ps_sae
+ ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_round_ps_current
+ ;CHECK: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_round_ps_sae
+ ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_round_ps_current
+ ;CHECK: vminps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_min_round_ps_sae
+ ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_min_round_ps_current
+ ;CHECK: vminps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_round_ps_sae
+ ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_round_ps_current
+ ;CHECK: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_round_ps_sae
+ ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_round_ps_current
+ ;CHECK: vmaxps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_max_round_ps_sae
+ ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_max_round_ps_current
+ ;CHECK: vmaxps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index dcde9c4153e..827a56d76ae 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -1,14 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s
define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
- ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+ ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
}
define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK: kmovw
- ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+ ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
ret <16 x float> %res
}
@@ -27,7 +27,7 @@ define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
}
define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
- ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
+ ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
ret <16 x float> %res
}
@@ -36,61 +36,61 @@ define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
- ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+ ; CHECK: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
- ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+ ; CHECK: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
%res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
- ; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+ ; CHECK: vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
%res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
- ; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+ ; CHECK: vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
%res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
- ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+ ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
- ; CHECK: vrcp28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+ ; CHECK: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
- ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
+ ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
ret <4 x float> %res
}
define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
- ; CHECK: vrsqrt28ss %xmm1, %xmm0, %xmm2 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
+ ; CHECK: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
ret <4 x float> %res
}
define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
- ; CHECK: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
+ ; CHECK: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
%res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
ret <2 x double> %res
}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 64ec8166d50..4a99a9699e3 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2289,3 +2289,267 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8)
+
+define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_ps_256
+ ;CHECK: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_ps_256
+ ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_add_ps_256
+ ;CHECK: vaddps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_ps_128
+ ;CHECK: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_ps_128
+ ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_add_ps_128
+ ;CHECK: vaddps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_sub_ps_256
+ ;CHECK: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_ps_256
+ ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_ps_256
+ ;CHECK: vsubps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_sub_ps_128
+ ;CHECK: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_ps_128
+ ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_ps_128
+ ;CHECK: vsubps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_mul_ps_256
+ ;CHECK: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_mul_ps_256
+ ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mul_ps_256
+ ;CHECK: vmulps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_mul_ps_128
+ ;CHECK: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_mul_ps_128
+ ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mul_ps_128
+ ;CHECK: vmulps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_ps_256
+ ;CHECK: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_ps_256
+ ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_div_ps_256
+ ;CHECK: vdivps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_ps_128
+ ;CHECK: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_ps_128
+ ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_div_ps_128
+ ;CHECK: vdivps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_ps_256
+ ;CHECK: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_ps_256
+ ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_max_ps_256
+ ;CHECK: vmaxps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_ps_128
+ ;CHECK: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_ps_128
+ ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_max_ps_128
+ ;CHECK: vmaxps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_ps_256
+ ;CHECK: vminps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_ps_256
+ ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_min_ps_256
+ ;CHECK: vminps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_ps_128
+ ;CHECK: vminps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_ps_128
+ ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_min_ps_128
+ ;CHECK: vminps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) \ No newline at end of file
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 91107cc6fee..04b736348dd 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -6112,3 +6112,586 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2
// CHECK: encoding: [0x62,0xf1,0x74,0x50,0xc2,0x92,0xfc,0xfd,0xff,0xff,0x7b]
vcmpps $0x7b, -516(%rdx){1to16}, %zmm17, %k2
+// CHECK: vaddpd {rn-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x10,0x58,0xc6]
+ vaddpd {rn-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddpd {ru-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x50,0x58,0xc6]
+ vaddpd {ru-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddpd {rd-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x30,0x58,0xc6]
+ vaddpd {rd-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddpd {rz-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x70,0x58,0xc6]
+ vaddpd {rz-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddps {rn-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x18,0x58,0xd2]
+ vaddps {rn-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddps {ru-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x58,0x58,0xd2]
+ vaddps {ru-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddps {rd-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x38,0x58,0xd2]
+ vaddps {rd-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddps {rz-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x78,0x58,0xd2]
+ vaddps {rz-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddsd %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x00,0x58,0xd8]
+ vaddsd %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd %xmm8, %xmm17, %xmm3 {%k3}
+// CHECK: encoding: [0x62,0xd1,0xf7,0x03,0x58,0xd8]
+ vaddsd %xmm8, %xmm17, %xmm3 {%k3}
+
+// CHECK: vaddsd %xmm8, %xmm17, %xmm3 {%k3} {z}
+// CHECK: encoding: [0x62,0xd1,0xf7,0x83,0x58,0xd8]
+ vaddsd %xmm8, %xmm17, %xmm3 {%k3} {z}
+
+// CHECK: vaddsd {rn-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x10,0x58,0xd8]
+ vaddsd {rn-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd {ru-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x50,0x58,0xd8]
+ vaddsd {ru-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd {rd-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x30,0x58,0xd8]
+ vaddsd {rd-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd {rz-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x70,0x58,0xd8]
+ vaddsd {rz-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd (%rcx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x19]
+ vaddsd (%rcx), %xmm17, %xmm3
+
+// CHECK: vaddsd 291(%rax,%r14,8), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xb1,0xf7,0x00,0x58,0x9c,0xf0,0x23,0x01,0x00,0x00]
+ vaddsd 291(%rax,%r14,8), %xmm17, %xmm3
+
+// CHECK: vaddsd 1016(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x5a,0x7f]
+ vaddsd 1016(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddsd 1024(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x9a,0x00,0x04,0x00,0x00]
+ vaddsd 1024(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddsd -1024(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x5a,0x80]
+ vaddsd -1024(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddsd -1032(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x9a,0xf8,0xfb,0xff,0xff]
+ vaddsd -1032(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddss %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x08,0x58,0xfb]
+ vaddss %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss %xmm19, %xmm5, %xmm7 {%k2}
+// CHECK: encoding: [0x62,0xb1,0x56,0x0a,0x58,0xfb]
+ vaddss %xmm19, %xmm5, %xmm7 {%k2}
+
+// CHECK: vaddss %xmm19, %xmm5, %xmm7 {%k2} {z}
+// CHECK: encoding: [0x62,0xb1,0x56,0x8a,0x58,0xfb]
+ vaddss %xmm19, %xmm5, %xmm7 {%k2} {z}
+
+// CHECK: vaddss {rn-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x18,0x58,0xfb]
+ vaddss {rn-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss {ru-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x58,0x58,0xfb]
+ vaddss {ru-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss {rd-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x38,0x58,0xfb]
+ vaddss {rd-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss {rz-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x78,0x58,0xfb]
+ vaddss {rz-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss (%rcx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x39]
+ vaddss (%rcx), %xmm5, %xmm7
+
+// CHECK: vaddss 291(%rax,%r14,8), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x08,0x58,0xbc,0xf0,0x23,0x01,0x00,0x00]
+ vaddss 291(%rax,%r14,8), %xmm5, %xmm7
+
+// CHECK: vaddss 508(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x7a,0x7f]
+ vaddss 508(%rdx), %xmm5, %xmm7
+
+// CHECK: vaddss 512(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0xba,0x00,0x02,0x00,0x00]
+ vaddss 512(%rdx), %xmm5, %xmm7
+
+// CHECK: vaddss -512(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x7a,0x80]
+ vaddss -512(%rdx), %xmm5, %xmm7
+
+// CHECK: vaddss -516(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0xba,0xfc,0xfd,0xff,0xff]
+ vaddss -516(%rdx), %xmm5, %xmm7
+
+// CHECK: vdivpd {rn-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x18,0x5e,0xd3]
+ vdivpd {rn-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivpd {ru-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x58,0x5e,0xd3]
+ vdivpd {ru-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivpd {rd-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x38,0x5e,0xd3]
+ vdivpd {rd-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivpd {rz-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x78,0x5e,0xd3]
+ vdivpd {rz-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivps {rn-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x10,0x5e,0xfc]
+ vdivps {rn-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivps {ru-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x50,0x5e,0xfc]
+ vdivps {ru-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivps {rd-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x30,0x5e,0xfc]
+ vdivps {rd-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivps {rz-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x70,0x5e,0xfc]
+ vdivps {rz-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivsd %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x08,0x5e,0xee]
+ vdivsd %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd %xmm22, %xmm13, %xmm29 {%k3}
+// CHECK: encoding: [0x62,0x21,0x97,0x0b,0x5e,0xee]
+ vdivsd %xmm22, %xmm13, %xmm29 {%k3}
+
+// CHECK: vdivsd %xmm22, %xmm13, %xmm29 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0x97,0x8b,0x5e,0xee]
+ vdivsd %xmm22, %xmm13, %xmm29 {%k3} {z}
+
+// CHECK: vdivsd {rn-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x18,0x5e,0xee]
+ vdivsd {rn-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd {ru-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x58,0x5e,0xee]
+ vdivsd {ru-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd {rd-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x38,0x5e,0xee]
+ vdivsd {rd-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd {rz-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x78,0x5e,0xee]
+ vdivsd {rz-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd (%rcx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x29]
+ vdivsd (%rcx), %xmm13, %xmm29
+
+// CHECK: vdivsd 291(%rax,%r14,8), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x08,0x5e,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vdivsd 291(%rax,%r14,8), %xmm13, %xmm29
+
+// CHECK: vdivsd 1016(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x6a,0x7f]
+ vdivsd 1016(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivsd 1024(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0xaa,0x00,0x04,0x00,0x00]
+ vdivsd 1024(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivsd -1024(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x6a,0x80]
+ vdivsd -1024(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivsd -1032(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0xaa,0xf8,0xfb,0xff,0xff]
+ vdivsd -1032(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivss %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x08,0x5e,0xe9]
+ vdivss %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss %xmm17, %xmm6, %xmm21 {%k5}
+// CHECK: encoding: [0x62,0xa1,0x4e,0x0d,0x5e,0xe9]
+ vdivss %xmm17, %xmm6, %xmm21 {%k5}
+
+// CHECK: vdivss %xmm17, %xmm6, %xmm21 {%k5} {z}
+// CHECK: encoding: [0x62,0xa1,0x4e,0x8d,0x5e,0xe9]
+ vdivss %xmm17, %xmm6, %xmm21 {%k5} {z}
+
+// CHECK: vdivss {rn-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x18,0x5e,0xe9]
+ vdivss {rn-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss {ru-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x58,0x5e,0xe9]
+ vdivss {ru-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss {rd-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x38,0x5e,0xe9]
+ vdivss {rd-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss {rz-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x78,0x5e,0xe9]
+ vdivss {rz-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss (%rcx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x29]
+ vdivss (%rcx), %xmm6, %xmm21
+
+// CHECK: vdivss 291(%rax,%r14,8), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x08,0x5e,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vdivss 291(%rax,%r14,8), %xmm6, %xmm21
+
+// CHECK: vdivss 508(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x6a,0x7f]
+ vdivss 508(%rdx), %xmm6, %xmm21
+
+// CHECK: vdivss 512(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0xaa,0x00,0x02,0x00,0x00]
+ vdivss 512(%rdx), %xmm6, %xmm21
+
+// CHECK: vdivss -512(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x6a,0x80]
+ vdivss -512(%rdx), %xmm6, %xmm21
+
+// CHECK: vdivss -516(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0xaa,0xfc,0xfd,0xff,0xff]
+ vdivss -516(%rdx), %xmm6, %xmm21
+
+// CHECK: vmaxpd {sae}, %zmm20, %zmm28, %zmm30
+// CHECK: encoding: [0x62,0x21,0x9d,0x10,0x5f,0xf4]
+ vmaxpd {sae}, %zmm20, %zmm28, %zmm30
+
+// CHECK: vmaxps {sae}, %zmm20, %zmm6, %zmm25
+// CHECK: encoding: [0x62,0x21,0x4c,0x18,0x5f,0xcc]
+ vmaxps {sae}, %zmm20, %zmm6, %zmm25
+
+// CHECK: vmaxsd %xmm25, %xmm19, %xmm20
+// CHECK: encoding: [0x62,0x81,0xe7,0x00,0x5f,0xe1]
+ vmaxsd %xmm25, %xmm19, %xmm20
+
+// CHECK: vmaxsd %xmm25, %xmm19, %xmm20 {%k3}
+// CHECK: encoding: [0x62,0x81,0xe7,0x03,0x5f,0xe1]
+ vmaxsd %xmm25, %xmm19, %xmm20 {%k3}
+
+// CHECK: vmaxsd %xmm25, %xmm19, %xmm20 {%k3} {z}
+// CHECK: encoding: [0x62,0x81,0xe7,0x83,0x5f,0xe1]
+ vmaxsd %xmm25, %xmm19, %xmm20 {%k3} {z}
+
+// CHECK: vmaxsd {sae}, %xmm25, %xmm19, %xmm20
+// CHECK: encoding: [0x62,0x81,0xe7,0x10,0x5f,0xe1]
+ vmaxsd {sae}, %xmm25, %xmm19, %xmm20
+
+// CHECK: vmaxsd (%rcx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x21]
+ vmaxsd (%rcx), %xmm19, %xmm20
+
+// CHECK: vmaxsd 291(%rax,%r14,8), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xa1,0xe7,0x00,0x5f,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vmaxsd 291(%rax,%r14,8), %xmm19, %xmm20
+
+// CHECK: vmaxsd 1016(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x62,0x7f]
+ vmaxsd 1016(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxsd 1024(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0xa2,0x00,0x04,0x00,0x00]
+ vmaxsd 1024(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxsd -1024(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x62,0x80]
+ vmaxsd -1024(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxsd -1032(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0xa2,0xf8,0xfb,0xff,0xff]
+ vmaxsd -1032(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxss %xmm6, %xmm4, %xmm8
+// CHECK: encoding: [0xc5,0x5a,0x5f,0xc6]
+ vmaxss %xmm6, %xmm4, %xmm8
+
+// CHECK: vmaxss %xmm6, %xmm4, %xmm8 {%k4}
+// CHECK: encoding: [0x62,0x71,0x5e,0x0c,0x5f,0xc6]
+ vmaxss %xmm6, %xmm4, %xmm8 {%k4}
+
+// CHECK: vmaxss %xmm6, %xmm4, %xmm8 {%k4} {z}
+// CHECK: encoding: [0x62,0x71,0x5e,0x8c,0x5f,0xc6]
+ vmaxss %xmm6, %xmm4, %xmm8 {%k4} {z}
+
+// CHECK: vmaxss {sae}, %xmm6, %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x18,0x5f,0xc6]
+ vmaxss {sae}, %xmm6, %xmm4, %xmm8
+
+// CHECK: vmaxss (%rcx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x01]
+ vmaxss (%rcx), %xmm4, %xmm8
+
+// CHECK: vmaxss 291(%rax,%r14,8), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x31,0x5e,0x08,0x5f,0x84,0xf0,0x23,0x01,0x00,0x00]
+ vmaxss 291(%rax,%r14,8), %xmm4, %xmm8
+
+// CHECK: vmaxss 508(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x42,0x7f]
+ vmaxss 508(%rdx), %xmm4, %xmm8
+
+// CHECK: vmaxss 512(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x82,0x00,0x02,0x00,0x00]
+ vmaxss 512(%rdx), %xmm4, %xmm8
+
+// CHECK: vmaxss -512(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x42,0x80]
+ vmaxss -512(%rdx), %xmm4, %xmm8
+
+// CHECK: vmaxss -516(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x82,0xfc,0xfd,0xff,0xff]
+ vmaxss -516(%rdx), %xmm4, %xmm8
+
+// CHECK: vminpd {sae}, %zmm22, %zmm6, %zmm6
+// CHECK: encoding: [0x62,0xb1,0xcd,0x18,0x5d,0xf6]
+ vminpd {sae}, %zmm22, %zmm6, %zmm6
+
+// CHECK: vminps {sae}, %zmm7, %zmm3, %zmm3
+// CHECK: encoding: [0x62,0xf1,0x64,0x18,0x5d,0xdf]
+ vminps {sae}, %zmm7, %zmm3, %zmm3
+
+// CHECK: vminsd %xmm26, %xmm25, %xmm5
+// CHECK: encoding: [0x62,0x91,0xb7,0x00,0x5d,0xea]
+ vminsd %xmm26, %xmm25, %xmm5
+
+// CHECK: vminsd %xmm26, %xmm25, %xmm5 {%k3}
+// CHECK: encoding: [0x62,0x91,0xb7,0x03,0x5d,0xea]
+ vminsd %xmm26, %xmm25, %xmm5 {%k3}
+
+// CHECK: vminsd %xmm26, %xmm25, %xmm5 {%k3} {z}
+// CHECK: encoding: [0x62,0x91,0xb7,0x83,0x5d,0xea]
+ vminsd %xmm26, %xmm25, %xmm5 {%k3} {z}
+
+// CHECK: vminsd {sae}, %xmm26, %xmm25, %xmm5
+// CHECK: encoding: [0x62,0x91,0xb7,0x10,0x5d,0xea]
+ vminsd {sae}, %xmm26, %xmm25, %xmm5
+
+// CHECK: vminsd (%rcx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x29]
+ vminsd (%rcx), %xmm25, %xmm5
+
+// CHECK: vminsd 291(%rax,%r14,8), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xb1,0xb7,0x00,0x5d,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vminsd 291(%rax,%r14,8), %xmm25, %xmm5
+
+// CHECK: vminsd 1016(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x6a,0x7f]
+ vminsd 1016(%rdx), %xmm25, %xmm5
+
+// CHECK: vminsd 1024(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0xaa,0x00,0x04,0x00,0x00]
+ vminsd 1024(%rdx), %xmm25, %xmm5
+
+// CHECK: vminsd -1024(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x6a,0x80]
+ vminsd -1024(%rdx), %xmm25, %xmm5
+
+// CHECK: vminsd -1032(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0xaa,0xf8,0xfb,0xff,0xff]
+ vminsd -1032(%rdx), %xmm25, %xmm5
+
+// CHECK: vminss %xmm19, %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x31,0x76,0x00,0x5d,0xd3]
+ vminss %xmm19, %xmm17, %xmm10
+
+// CHECK: vminss %xmm19, %xmm17, %xmm10 {%k5}
+// CHECK: encoding: [0x62,0x31,0x76,0x05,0x5d,0xd3]
+ vminss %xmm19, %xmm17, %xmm10 {%k5}
+
+// CHECK: vminss %xmm19, %xmm17, %xmm10 {%k5} {z}
+// CHECK: encoding: [0x62,0x31,0x76,0x85,0x5d,0xd3]
+ vminss %xmm19, %xmm17, %xmm10 {%k5} {z}
+
+// CHECK: vminss {sae}, %xmm19, %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x31,0x76,0x10,0x5d,0xd3]
+ vminss {sae}, %xmm19, %xmm17, %xmm10
+
+// CHECK: vminss (%rcx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x11]
+ vminss (%rcx), %xmm17, %xmm10
+
+// CHECK: vminss 291(%rax,%r14,8), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x31,0x76,0x00,0x5d,0x94,0xf0,0x23,0x01,0x00,0x00]
+ vminss 291(%rax,%r14,8), %xmm17, %xmm10
+
+// CHECK: vminss 508(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x52,0x7f]
+ vminss 508(%rdx), %xmm17, %xmm10
+
+// CHECK: vminss 512(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x92,0x00,0x02,0x00,0x00]
+ vminss 512(%rdx), %xmm17, %xmm10
+
+// CHECK: vminss -512(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x52,0x80]
+ vminss -512(%rdx), %xmm17, %xmm10
+
+// CHECK: vminss -516(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x92,0xfc,0xfd,0xff,0xff]
+ vminss -516(%rdx), %xmm17, %xmm10
+
+// vmulpd (packed double, zmm) with embedded rounding control.
+// EVEX byte3 encodes b=1 plus RC in L'L: rn=0x18, rd=0x38, ru=0x58, rz=0x78.
+// CHECK: vmulpd {rn-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x18,0x59,0xc7]
+          vmulpd {rn-sae}, %zmm23, %zmm4, %zmm24
+
+// CHECK: vmulpd {ru-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x58,0x59,0xc7]
+          vmulpd {ru-sae}, %zmm23, %zmm4, %zmm24
+
+// CHECK: vmulpd {rd-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x38,0x59,0xc7]
+          vmulpd {rd-sae}, %zmm23, %zmm4, %zmm24
+
+// CHECK: vmulpd {rz-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x78,0x59,0xc7]
+          vmulpd {rz-sae}, %zmm23, %zmm4, %zmm24
+
+// vmulps (packed single, zmm) with embedded rounding control — same
+// byte3 RC values as above, W=0/pp=none selects the PS form.
+// CHECK: vmulps {rn-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x18,0x59,0xd8]
+          vmulps {rn-sae}, %zmm24, %zmm6, %zmm3
+
+// CHECK: vmulps {ru-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x58,0x59,0xd8]
+          vmulps {ru-sae}, %zmm24, %zmm6, %zmm3
+
+// CHECK: vmulps {rd-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x38,0x59,0xd8]
+          vmulps {rd-sae}, %zmm24, %zmm6, %zmm3
+
+// CHECK: vmulps {rz-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x78,0x59,0xd8]
+          vmulps {rz-sae}, %zmm24, %zmm6, %zmm3
+
+// vmulsd register forms: plain, merge-masked {%k2}, and zero-masked
+// {%k2} {z}. Mask register goes in EVEX byte3 aaa bits; {z} sets bit 7.
+// CHECK: vmulsd %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x08,0x59,0xea]
+          vmulsd %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd %xmm18, %xmm4, %xmm13 {%k2}
+// CHECK: encoding: [0x62,0x31,0xdf,0x0a,0x59,0xea]
+          vmulsd %xmm18, %xmm4, %xmm13 {%k2}
+
+// CHECK: vmulsd %xmm18, %xmm4, %xmm13 {%k2} {z}
+// CHECK: encoding: [0x62,0x31,0xdf,0x8a,0x59,0xea]
+          vmulsd %xmm18, %xmm4, %xmm13 {%k2} {z}
+
+// vmulsd with embedded rounding (scalar RC uses the same byte3 values).
+// CHECK: vmulsd {rn-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x18,0x59,0xea]
+          vmulsd {rn-sae}, %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd {ru-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x58,0x59,0xea]
+          vmulsd {ru-sae}, %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd {rd-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x38,0x59,0xea]
+          vmulsd {rd-sae}, %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd {rz-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x78,0x59,0xea]
+          vmulsd {rz-sae}, %xmm18, %xmm4, %xmm13
+
+// vmulsd memory forms. Offsets are chosen around the EVEX disp8*N
+// compression boundary for an 8-byte element: +/-1016 = 127*8 still
+// fits a compressed disp8 (0x7f / 0x80), while +/-1024 forces disp32.
+// CHECK: vmulsd (%rcx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x29]
+          vmulsd (%rcx), %xmm4, %xmm13
+
+// CHECK: vmulsd 291(%rax,%r14,8), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x08,0x59,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmulsd 291(%rax,%r14,8), %xmm4, %xmm13
+
+// CHECK: vmulsd 1016(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x6a,0x7f]
+          vmulsd 1016(%rdx), %xmm4, %xmm13
+
+// CHECK: vmulsd 1024(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0xaa,0x00,0x04,0x00,0x00]
+          vmulsd 1024(%rdx), %xmm4, %xmm13
+
+// CHECK: vmulsd -1024(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x6a,0x80]
+          vmulsd -1024(%rdx), %xmm4, %xmm13
+
+// CHECK: vmulsd -1032(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0xaa,0xf8,0xfb,0xff,0xff]
+          vmulsd -1032(%rdx), %xmm4, %xmm13
+
+// vmulss register forms: plain, merge-masked {%k4}, and zero-masked.
+// CHECK: vmulss %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x08,0x59,0xf6]
+          vmulss %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss %xmm14, %xmm10, %xmm22 {%k4}
+// CHECK: encoding: [0x62,0xc1,0x2e,0x0c,0x59,0xf6]
+          vmulss %xmm14, %xmm10, %xmm22 {%k4}
+
+// CHECK: vmulss %xmm14, %xmm10, %xmm22 {%k4} {z}
+// CHECK: encoding: [0x62,0xc1,0x2e,0x8c,0x59,0xf6]
+          vmulss %xmm14, %xmm10, %xmm22 {%k4} {z}
+
+// vmulss with embedded rounding.
+// CHECK: vmulss {rn-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x18,0x59,0xf6]
+          vmulss {rn-sae}, %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss {ru-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x58,0x59,0xf6]
+          vmulss {ru-sae}, %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss {rd-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x38,0x59,0xf6]
+          vmulss {rd-sae}, %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss {rz-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x78,0x59,0xf6]
+          vmulss {rz-sae}, %xmm14, %xmm10, %xmm22
+
+// vmulss memory forms. disp8*N boundary for a 4-byte element:
+// +/-508 = 127*4 compresses to disp8 (0x7f / 0x80); +/-512 needs disp32.
+// CHECK: vmulss (%rcx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x31]
+          vmulss (%rcx), %xmm10, %xmm22
+
+// CHECK: vmulss 291(%rax,%r14,8), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xa1,0x2e,0x08,0x59,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmulss 291(%rax,%r14,8), %xmm10, %xmm22
+
+// CHECK: vmulss 508(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x72,0x7f]
+          vmulss 508(%rdx), %xmm10, %xmm22
+
+// CHECK: vmulss 512(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0xb2,0x00,0x02,0x00,0x00]
+          vmulss 512(%rdx), %xmm10, %xmm22
+
+// CHECK: vmulss -512(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x72,0x80]
+          vmulss -512(%rdx), %xmm10, %xmm22
+
+// CHECK: vmulss -516(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0xb2,0xfc,0xfd,0xff,0xff]
+          vmulss -516(%rdx), %xmm10, %xmm22