summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/llvm/IR/IntrinsicsX86.td72
-rw-r--r--lib/Target/X86/X86ISelLowering.h2
-rw-r--r--lib/Target/X86/X86InstrAVX512.td132
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td4
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h33
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll377
-rw-r--r--test/CodeGen/X86/avx512er-intrinsics.ll24
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll264
-rw-r--r--test/MC/X86/avx512-encodings.s583
9 files changed, 1391 insertions, 100 deletions
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 43aa89871e8..998fa7a6ce3 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3205,39 +3205,111 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Arithmetic ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_avx512_mask_add_ps_128 : GCCBuiltin<"__builtin_ia32_addps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_add_ps_256 : GCCBuiltin<"__builtin_ia32_addps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_add_pd_128 : GCCBuiltin<"__builtin_ia32_addpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_add_pd_256 : GCCBuiltin<"__builtin_ia32_addpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_ps_128 : GCCBuiltin<"__builtin_ia32_subps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_ps_256 : GCCBuiltin<"__builtin_ia32_subps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_pd_128 : GCCBuiltin<"__builtin_ia32_subpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sub_pd_256 : GCCBuiltin<"__builtin_ia32_subpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_ps_128 : GCCBuiltin<"__builtin_ia32_mulps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_ps_256 : GCCBuiltin<"__builtin_ia32_mulps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_pd_128 : GCCBuiltin<"__builtin_ia32_mulpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_mul_pd_256 : GCCBuiltin<"__builtin_ia32_mulpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_ps_128 : GCCBuiltin<"__builtin_ia32_divps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_ps_256 : GCCBuiltin<"__builtin_ia32_divps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_pd_128 : GCCBuiltin<"__builtin_ia32_divpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_div_pd_256 : GCCBuiltin<"__builtin_ia32_divpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_ps_128 : GCCBuiltin<"__builtin_ia32_maxps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_pd_128 : GCCBuiltin<"__builtin_ia32_maxpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_ps_128 : GCCBuiltin<"__builtin_ia32_minps128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_pd_128 : GCCBuiltin<"__builtin_ia32_minpd128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index bc6138c67ec..90b49328f22 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -205,6 +205,8 @@ namespace llvm {
FSUB_RND,
FMUL_RND,
FDIV_RND,
+ FMAX_RND,
+ FMIN_RND,
// Integer add/sub with unsigned saturation.
ADDUS,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index bc8ab83fe31..4f9b467d3b2 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -180,21 +180,20 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
list<dag> Pattern,
list<dag> MaskingPattern,
list<dag> ZeroMaskingPattern,
- string Round = "",
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
- OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"#
- "$dst "#Round#", "#IntelSrcAsm#"}",
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+ "$dst , "#IntelSrcAsm#"}",
Pattern, itin>;
// Prefer over VMOV*rrk Pat<>
let AddedComplexity = 20 in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
- OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"#
- "$dst {${mask}}"#Round#", "#IntelSrcAsm#"}",
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#"}",
MaskingPattern, itin>,
EVEX_K {
// In case of the 3src subclass this is overridden with a let.
@@ -202,8 +201,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
}
let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
- OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"#
- "$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}",
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
ZeroMaskingPattern,
itin>,
EVEX_KZ;
@@ -217,7 +216,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- SDNode Select = vselect, string Round = "",
+ SDNode Select = vselect,
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :
@@ -227,7 +226,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
- Round, MaskingConstraint, NoItinerary, IsCommutable>;
+ MaskingConstraint, NoItinerary, IsCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -235,7 +234,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, string Round = "",
+ dag RHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs, Ins,
@@ -243,14 +242,14 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect,
- Round, "$src0 = $dst", itin, IsCommutable>;
+ "$src0 = $dst", itin, IsCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, string Round = "",
+ dag RHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs, Ins,
@@ -258,7 +257,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select,
- Round, "$src0 = $dst", itin, IsCommutable>;
+ "$src0 = $dst", itin, IsCommutable>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -284,7 +283,7 @@ multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
- OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "",
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
"$src0 = $dst">;
@@ -2963,7 +2962,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- "", itins.rr, IsCommutable>,
+ itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
let mayLoad = 1 in
@@ -2972,7 +2971,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V;
}
@@ -2988,7 +2987,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src1,
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
@@ -3090,7 +3089,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- "",itins.rr, IsCommutable>,
+ itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
let mayLoad = 1 in {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
@@ -3098,7 +3097,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(bitconvert (_Src.LdFrag addr:$src2)))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
@@ -3109,7 +3108,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Dst.VT (X86VBroadcast
(_Dst.ScalarLdFrag addr:$src2)))))),
- "", itins.rm>,
+ itins.rm>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
}
@@ -3165,8 +3164,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2)))))),
- "">,
+ (_Src.ScalarLdFrag addr:$src2))))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
}
}
@@ -3179,15 +3177,15 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
"$src2, $src1","$src1, $src2",
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
- (_Src.VT _Src.RC:$src2))),
- "">, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
let mayLoad = 1 in {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2)))),
- "">, EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
}
}
@@ -3390,7 +3388,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
"$src2, $src1", "$src1, $src2",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_CURRENT)),
- "", itins.rr, IsCommutable>;
+ itins.rr, IsCommutable>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
@@ -3398,7 +3396,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(VecNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT)),
- "", itins.rm, IsCommutable>;
+ itins.rm, IsCommutable>;
let isCodeGenOnly = 1, isCommutable = IsCommutable,
Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -3421,7 +3419,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$rc)), "", itins.rr, IsCommutable>,
+ (i32 imm:$rc)), itins.rr, IsCommutable>,
EVEX_B, EVEX_RC;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -3429,9 +3427,9 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
+ (i32 FROUND_NO_EXC))>, EVEX_B;
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3500,6 +3498,16 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRn
EVEX_4V, EVEX_B, EVEX_RC;
}
+
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86VectorVTInfo _, bit IsCommutable> {
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, EVEX_B;
+}
+
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable = 0> {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
@@ -3533,6 +3541,13 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
+multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>,
@@ -3541,33 +3556,17 @@ defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
-defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>;
-defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>;
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>,
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>,
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
let Predicates = [HasDQI] in {
defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>;
defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>;
defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>;
defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>;
}
-def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1),
- (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
- (i16 -1), FROUND_CURRENT)),
- (VMAXPSZrr VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_max_pd_512 (v8f64 VR512:$src1),
- (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
- (i8 -1), FROUND_CURRENT)),
- (VMAXPDZrr VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v16f32 (int_x86_avx512_mask_min_ps_512 (v16f32 VR512:$src1),
- (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
- (i16 -1), FROUND_CURRENT)),
- (VMINPSZrr VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1),
- (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
- (i8 -1), FROUND_CURRENT)),
- (VMINPDZrr VR512:$src1, VR512:$src2)>;
+
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
@@ -3667,14 +3666,14 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V;
let mayLoad = 1 in
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i8 imm:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V;
}
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
@@ -3684,7 +3683,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B;
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B;
}
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3694,12 +3693,12 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
EVEX_4V;
}
@@ -3798,13 +3797,13 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+ SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
let mayLoad = 1 in
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
EVEX_CD8<_.EltSize, CD8VF>;
}
@@ -3817,7 +3816,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))))),
- " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4775,9 +4774,9 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
+ (i32 FROUND_NO_EXC))>, EVEX_B;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
@@ -4809,9 +4808,8 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
- "$src", "$src",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)),
- "{sae}">, EVEX_B;
+ "{sae}, $src", "$src, {sae}",
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
@@ -5051,9 +5049,9 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
- "$src3, $src2, $src1", "$src1, $src2, $src3",
+ "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}",
(_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_NO_EXC))), "{sae}">, EVEX_B;
+ (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
let mayLoad = 1 in
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 497bdf65315..d9eebc5ddfd 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -293,8 +293,8 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN", SDTFPBinOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 648769e7069..e4d82335ece 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -243,8 +243,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
@@ -322,8 +326,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
@@ -350,9 +358,28 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
-
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
@@ -512,8 +539,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::RNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::RNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index d4129e14c95..d99764e4007 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -434,15 +434,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
; fp min - max
-define <16 x float> @test_vmaxps(<16 x float> %a0, <16 x float> %a1) {
- ; CHECK: vmaxps
- %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float>zeroinitializer, i16 -1, i32 4)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>,
- <16 x float>, i16, i32)
-
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK: vmaxpd
%res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -452,15 +443,6 @@ define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
<8 x double>, i8, i32)
-define <16 x float> @test_vminps(<16 x float> %a0, <16 x float> %a1) {
- ; CHECK: vminps
- %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float>zeroinitializer, i16 -1, i32 4)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>,
- <16 x float>, i16, i32)
-
define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK: vminpd
%res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2240,3 +2222,362 @@ define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i
}
declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae
+ ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae
+ ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae
+ ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae
+ ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_round_ps_current
+ ;CHECK: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae
+ ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae
+ ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae
+ ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae
+ ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_round_ps_current
+ ;CHECK: vaddps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_rn_sae
+ ;CHECK: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_rd_sae
+ ;CHECK: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_ru_sae
+ ;CHECK: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_rz_sae
+ ;CHECK: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_add_round_ps_current
+ ;CHECK: vaddps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae
+ ;CHECK: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae
+ ;CHECK: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae
+ ;CHECK: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae
+ ;CHECK: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_round_ps_current
+ ;CHECK: vsubps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_rn_sae
+ ;CHECK: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_rd_sae
+ ;CHECK: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_ru_sae
+ ;CHECK: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_rz_sae
+ ;CHECK: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_round_ps_current
+ ;CHECK: vsubps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae
+ ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae
+ ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae
+ ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae
+ ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_round_ps_current
+ ;CHECK: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae
+ ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae
+ ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae
+ ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae
+ ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_round_ps_current
+ ;CHECK: vdivps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_rn_sae
+ ;CHECK: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_rd_sae
+ ;CHECK: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_ru_sae
+ ;CHECK: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_rz_sae
+ ;CHECK: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_div_round_ps_current
+ ;CHECK: vdivps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_round_ps_sae
+ ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_round_ps_current
+ ;CHECK: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_round_ps_sae
+ ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_round_ps_current
+ ;CHECK: vminps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_min_round_ps_sae
+ ;CHECK: vminps {sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_min_round_ps_current
+ ;CHECK: vminps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_round_ps_sae
+ ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_round_ps_current
+ ;CHECK: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_round_ps_sae
+ ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_round_ps_current
+ ;CHECK: vmaxps %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_max_round_ps_sae
+ ;CHECK: vmaxps {sae}, %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+ ;CHECK-LABEL: test_mm512_max_round_ps_current
+ ;CHECK: vmaxps %zmm1, %zmm0, %zmm0
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index dcde9c4153e..827a56d76ae 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -1,14 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s
define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
- ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+ ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
}
define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK: kmovw
- ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+ ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
ret <16 x float> %res
}
@@ -27,7 +27,7 @@ define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
}
define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
- ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
+ ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
ret <16 x float> %res
}
@@ -36,61 +36,61 @@ define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
- ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+ ; CHECK: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
- ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+ ; CHECK: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
%res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
- ; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+ ; CHECK: vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
%res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
- ; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+ ; CHECK: vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
%res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
- ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+ ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
- ; CHECK: vrcp28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+ ; CHECK: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
- ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
+ ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
ret <4 x float> %res
}
define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
- ; CHECK: vrsqrt28ss %xmm1, %xmm0, %xmm2 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
+ ; CHECK: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
ret <4 x float> %res
}
define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
- ; CHECK: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
+ ; CHECK: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
%res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
ret <2 x double> %res
}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 64ec8166d50..4a99a9699e3 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2289,3 +2289,267 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8)
+
+define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_ps_256
+ ;CHECK: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_ps_256
+ ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_add_ps_256
+ ;CHECK: vaddps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_add_ps_128
+ ;CHECK: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_add_ps_128
+ ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_add_ps_128
+ ;CHECK: vaddps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_sub_ps_256
+ ;CHECK: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_ps_256
+ ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_ps_256
+ ;CHECK: vsubps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_sub_ps_128
+ ;CHECK: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_sub_ps_128
+ ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_sub_ps_128
+ ;CHECK: vsubps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_mul_ps_256
+ ;CHECK: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_mul_ps_256
+ ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mul_ps_256
+ ;CHECK: vmulps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_mul_ps_128
+ ;CHECK: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_mul_ps_128
+ ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mul_ps_128
+ ;CHECK: vmulps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_ps_256
+ ;CHECK: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_ps_256
+ ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_div_ps_256
+ ;CHECK: vdivps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_div_ps_128
+ ;CHECK: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_div_ps_128
+ ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_div_ps_128
+ ;CHECK: vdivps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_ps_256
+ ;CHECK: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_ps_256
+ ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_max_ps_256
+ ;CHECK: vmaxps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_max_ps_128
+ ;CHECK: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_max_ps_128
+ ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_max_ps_128
+ ;CHECK: vmaxps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_ps_256
+ ;CHECK: vminps %ymm1, %ymm0, %ymm0 {%k1} {z}
+ %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_ps_256
+ ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1}
+ %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_min_ps_256
+ ;CHECK: vminps %ymm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_maskz_min_ps_128
+ ;CHECK: vminps %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_mask_min_ps_128
+ ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+ ;CHECK-LABEL: test_mm512_min_ps_128
+ ;CHECK: vminps %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) \ No newline at end of file
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 91107cc6fee..04b736348dd 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -6112,3 +6112,586 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2
// CHECK: encoding: [0x62,0xf1,0x74,0x50,0xc2,0x92,0xfc,0xfd,0xff,0xff,0x7b]
vcmpps $0x7b, -516(%rdx){1to16}, %zmm17, %k2
+// CHECK: vaddpd {rn-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x10,0x58,0xc6]
+ vaddpd {rn-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddpd {ru-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x50,0x58,0xc6]
+ vaddpd {ru-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddpd {rd-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x30,0x58,0xc6]
+ vaddpd {rd-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddpd {rz-sae}, %zmm6, %zmm27, %zmm8
+// CHECK: encoding: [0x62,0x71,0xa5,0x70,0x58,0xc6]
+ vaddpd {rz-sae}, %zmm6, %zmm27, %zmm8
+
+// CHECK: vaddps {rn-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x18,0x58,0xd2]
+ vaddps {rn-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddps {ru-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x58,0x58,0xd2]
+ vaddps {ru-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddps {rd-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x38,0x58,0xd2]
+ vaddps {rd-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddps {rz-sae}, %zmm2, %zmm13, %zmm18
+// CHECK: encoding: [0x62,0xe1,0x14,0x78,0x58,0xd2]
+ vaddps {rz-sae}, %zmm2, %zmm13, %zmm18
+
+// CHECK: vaddsd %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x00,0x58,0xd8]
+ vaddsd %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd %xmm8, %xmm17, %xmm3 {%k3}
+// CHECK: encoding: [0x62,0xd1,0xf7,0x03,0x58,0xd8]
+ vaddsd %xmm8, %xmm17, %xmm3 {%k3}
+
+// CHECK: vaddsd %xmm8, %xmm17, %xmm3 {%k3} {z}
+// CHECK: encoding: [0x62,0xd1,0xf7,0x83,0x58,0xd8]
+ vaddsd %xmm8, %xmm17, %xmm3 {%k3} {z}
+
+// CHECK: vaddsd {rn-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x10,0x58,0xd8]
+ vaddsd {rn-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd {ru-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x50,0x58,0xd8]
+ vaddsd {ru-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd {rd-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x30,0x58,0xd8]
+ vaddsd {rd-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd {rz-sae}, %xmm8, %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xd1,0xf7,0x70,0x58,0xd8]
+ vaddsd {rz-sae}, %xmm8, %xmm17, %xmm3
+
+// CHECK: vaddsd (%rcx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x19]
+ vaddsd (%rcx), %xmm17, %xmm3
+
+// CHECK: vaddsd 291(%rax,%r14,8), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xb1,0xf7,0x00,0x58,0x9c,0xf0,0x23,0x01,0x00,0x00]
+ vaddsd 291(%rax,%r14,8), %xmm17, %xmm3
+
+// CHECK: vaddsd 1016(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x5a,0x7f]
+ vaddsd 1016(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddsd 1024(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x9a,0x00,0x04,0x00,0x00]
+ vaddsd 1024(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddsd -1024(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x5a,0x80]
+ vaddsd -1024(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddsd -1032(%rdx), %xmm17, %xmm3
+// CHECK: encoding: [0x62,0xf1,0xf7,0x00,0x58,0x9a,0xf8,0xfb,0xff,0xff]
+ vaddsd -1032(%rdx), %xmm17, %xmm3
+
+// CHECK: vaddss %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x08,0x58,0xfb]
+ vaddss %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss %xmm19, %xmm5, %xmm7 {%k2}
+// CHECK: encoding: [0x62,0xb1,0x56,0x0a,0x58,0xfb]
+ vaddss %xmm19, %xmm5, %xmm7 {%k2}
+
+// CHECK: vaddss %xmm19, %xmm5, %xmm7 {%k2} {z}
+// CHECK: encoding: [0x62,0xb1,0x56,0x8a,0x58,0xfb]
+ vaddss %xmm19, %xmm5, %xmm7 {%k2} {z}
+
+// CHECK: vaddss {rn-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x18,0x58,0xfb]
+ vaddss {rn-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss {ru-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x58,0x58,0xfb]
+ vaddss {ru-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss {rd-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x38,0x58,0xfb]
+ vaddss {rd-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss {rz-sae}, %xmm19, %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x78,0x58,0xfb]
+ vaddss {rz-sae}, %xmm19, %xmm5, %xmm7
+
+// CHECK: vaddss (%rcx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x39]
+ vaddss (%rcx), %xmm5, %xmm7
+
+// CHECK: vaddss 291(%rax,%r14,8), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xb1,0x56,0x08,0x58,0xbc,0xf0,0x23,0x01,0x00,0x00]
+ vaddss 291(%rax,%r14,8), %xmm5, %xmm7
+
+// CHECK: vaddss 508(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x7a,0x7f]
+ vaddss 508(%rdx), %xmm5, %xmm7
+
+// CHECK: vaddss 512(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0xba,0x00,0x02,0x00,0x00]
+ vaddss 512(%rdx), %xmm5, %xmm7
+
+// CHECK: vaddss -512(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0x7a,0x80]
+ vaddss -512(%rdx), %xmm5, %xmm7
+
+// CHECK: vaddss -516(%rdx), %xmm5, %xmm7
+// CHECK: encoding: [0x62,0xf1,0x56,0x08,0x58,0xba,0xfc,0xfd,0xff,0xff]
+ vaddss -516(%rdx), %xmm5, %xmm7
+
+// CHECK: vdivpd {rn-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x18,0x5e,0xd3]
+ vdivpd {rn-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivpd {ru-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x58,0x5e,0xd3]
+ vdivpd {ru-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivpd {rd-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x38,0x5e,0xd3]
+ vdivpd {rd-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivpd {rz-sae}, %zmm11, %zmm6, %zmm18
+// CHECK: encoding: [0x62,0xc1,0xcd,0x78,0x5e,0xd3]
+ vdivpd {rz-sae}, %zmm11, %zmm6, %zmm18
+
+// CHECK: vdivps {rn-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x10,0x5e,0xfc]
+ vdivps {rn-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivps {ru-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x50,0x5e,0xfc]
+ vdivps {ru-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivps {rd-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x30,0x5e,0xfc]
+ vdivps {rd-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivps {rz-sae}, %zmm28, %zmm23, %zmm23
+// CHECK: encoding: [0x62,0x81,0x44,0x70,0x5e,0xfc]
+ vdivps {rz-sae}, %zmm28, %zmm23, %zmm23
+
+// CHECK: vdivsd %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x08,0x5e,0xee]
+ vdivsd %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd %xmm22, %xmm13, %xmm29 {%k3}
+// CHECK: encoding: [0x62,0x21,0x97,0x0b,0x5e,0xee]
+ vdivsd %xmm22, %xmm13, %xmm29 {%k3}
+
+// CHECK: vdivsd %xmm22, %xmm13, %xmm29 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0x97,0x8b,0x5e,0xee]
+ vdivsd %xmm22, %xmm13, %xmm29 {%k3} {z}
+
+// CHECK: vdivsd {rn-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x18,0x5e,0xee]
+ vdivsd {rn-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd {ru-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x58,0x5e,0xee]
+ vdivsd {ru-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd {rd-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x38,0x5e,0xee]
+ vdivsd {rd-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd {rz-sae}, %xmm22, %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x78,0x5e,0xee]
+ vdivsd {rz-sae}, %xmm22, %xmm13, %xmm29
+
+// CHECK: vdivsd (%rcx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x29]
+ vdivsd (%rcx), %xmm13, %xmm29
+
+// CHECK: vdivsd 291(%rax,%r14,8), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x21,0x97,0x08,0x5e,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vdivsd 291(%rax,%r14,8), %xmm13, %xmm29
+
+// CHECK: vdivsd 1016(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x6a,0x7f]
+ vdivsd 1016(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivsd 1024(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0xaa,0x00,0x04,0x00,0x00]
+ vdivsd 1024(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivsd -1024(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0x6a,0x80]
+ vdivsd -1024(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivsd -1032(%rdx), %xmm13, %xmm29
+// CHECK: encoding: [0x62,0x61,0x97,0x08,0x5e,0xaa,0xf8,0xfb,0xff,0xff]
+ vdivsd -1032(%rdx), %xmm13, %xmm29
+
+// CHECK: vdivss %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x08,0x5e,0xe9]
+ vdivss %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss %xmm17, %xmm6, %xmm21 {%k5}
+// CHECK: encoding: [0x62,0xa1,0x4e,0x0d,0x5e,0xe9]
+ vdivss %xmm17, %xmm6, %xmm21 {%k5}
+
+// CHECK: vdivss %xmm17, %xmm6, %xmm21 {%k5} {z}
+// CHECK: encoding: [0x62,0xa1,0x4e,0x8d,0x5e,0xe9]
+ vdivss %xmm17, %xmm6, %xmm21 {%k5} {z}
+
+// CHECK: vdivss {rn-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x18,0x5e,0xe9]
+ vdivss {rn-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss {ru-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x58,0x5e,0xe9]
+ vdivss {ru-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss {rd-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x38,0x5e,0xe9]
+ vdivss {rd-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss {rz-sae}, %xmm17, %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x78,0x5e,0xe9]
+ vdivss {rz-sae}, %xmm17, %xmm6, %xmm21
+
+// CHECK: vdivss (%rcx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x29]
+ vdivss (%rcx), %xmm6, %xmm21
+
+// CHECK: vdivss 291(%rax,%r14,8), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xa1,0x4e,0x08,0x5e,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vdivss 291(%rax,%r14,8), %xmm6, %xmm21
+
+// CHECK: vdivss 508(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x6a,0x7f]
+ vdivss 508(%rdx), %xmm6, %xmm21
+
+// CHECK: vdivss 512(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0xaa,0x00,0x02,0x00,0x00]
+ vdivss 512(%rdx), %xmm6, %xmm21
+
+// CHECK: vdivss -512(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0x6a,0x80]
+ vdivss -512(%rdx), %xmm6, %xmm21
+
+// CHECK: vdivss -516(%rdx), %xmm6, %xmm21
+// CHECK: encoding: [0x62,0xe1,0x4e,0x08,0x5e,0xaa,0xfc,0xfd,0xff,0xff]
+ vdivss -516(%rdx), %xmm6, %xmm21
+
+// CHECK: vmaxpd {sae}, %zmm20, %zmm28, %zmm30
+// CHECK: encoding: [0x62,0x21,0x9d,0x10,0x5f,0xf4]
+ vmaxpd {sae}, %zmm20, %zmm28, %zmm30
+
+// CHECK: vmaxps {sae}, %zmm20, %zmm6, %zmm25
+// CHECK: encoding: [0x62,0x21,0x4c,0x18,0x5f,0xcc]
+ vmaxps {sae}, %zmm20, %zmm6, %zmm25
+
+// CHECK: vmaxsd %xmm25, %xmm19, %xmm20
+// CHECK: encoding: [0x62,0x81,0xe7,0x00,0x5f,0xe1]
+ vmaxsd %xmm25, %xmm19, %xmm20
+
+// CHECK: vmaxsd %xmm25, %xmm19, %xmm20 {%k3}
+// CHECK: encoding: [0x62,0x81,0xe7,0x03,0x5f,0xe1]
+ vmaxsd %xmm25, %xmm19, %xmm20 {%k3}
+
+// CHECK: vmaxsd %xmm25, %xmm19, %xmm20 {%k3} {z}
+// CHECK: encoding: [0x62,0x81,0xe7,0x83,0x5f,0xe1]
+ vmaxsd %xmm25, %xmm19, %xmm20 {%k3} {z}
+
+// CHECK: vmaxsd {sae}, %xmm25, %xmm19, %xmm20
+// CHECK: encoding: [0x62,0x81,0xe7,0x10,0x5f,0xe1]
+ vmaxsd {sae}, %xmm25, %xmm19, %xmm20
+
+// CHECK: vmaxsd (%rcx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x21]
+ vmaxsd (%rcx), %xmm19, %xmm20
+
+// CHECK: vmaxsd 291(%rax,%r14,8), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xa1,0xe7,0x00,0x5f,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vmaxsd 291(%rax,%r14,8), %xmm19, %xmm20
+
+// CHECK: vmaxsd 1016(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x62,0x7f]
+ vmaxsd 1016(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxsd 1024(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0xa2,0x00,0x04,0x00,0x00]
+ vmaxsd 1024(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxsd -1024(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0x62,0x80]
+ vmaxsd -1024(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxsd -1032(%rdx), %xmm19, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xe7,0x00,0x5f,0xa2,0xf8,0xfb,0xff,0xff]
+ vmaxsd -1032(%rdx), %xmm19, %xmm20
+
+// CHECK: vmaxss %xmm6, %xmm4, %xmm8
+// CHECK: encoding: [0xc5,0x5a,0x5f,0xc6]
+ vmaxss %xmm6, %xmm4, %xmm8
+
+// CHECK: vmaxss %xmm6, %xmm4, %xmm8 {%k4}
+// CHECK: encoding: [0x62,0x71,0x5e,0x0c,0x5f,0xc6]
+ vmaxss %xmm6, %xmm4, %xmm8 {%k4}
+
+// CHECK: vmaxss %xmm6, %xmm4, %xmm8 {%k4} {z}
+// CHECK: encoding: [0x62,0x71,0x5e,0x8c,0x5f,0xc6]
+ vmaxss %xmm6, %xmm4, %xmm8 {%k4} {z}
+
+// CHECK: vmaxss {sae}, %xmm6, %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x18,0x5f,0xc6]
+ vmaxss {sae}, %xmm6, %xmm4, %xmm8
+
+// CHECK: vmaxss (%rcx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x01]
+ vmaxss (%rcx), %xmm4, %xmm8
+
+// CHECK: vmaxss 291(%rax,%r14,8), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x31,0x5e,0x08,0x5f,0x84,0xf0,0x23,0x01,0x00,0x00]
+ vmaxss 291(%rax,%r14,8), %xmm4, %xmm8
+
+// CHECK: vmaxss 508(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x42,0x7f]
+ vmaxss 508(%rdx), %xmm4, %xmm8
+
+// CHECK: vmaxss 512(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x82,0x00,0x02,0x00,0x00]
+ vmaxss 512(%rdx), %xmm4, %xmm8
+
+// CHECK: vmaxss -512(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x42,0x80]
+ vmaxss -512(%rdx), %xmm4, %xmm8
+
+// CHECK: vmaxss -516(%rdx), %xmm4, %xmm8
+// CHECK: encoding: [0x62,0x71,0x5e,0x08,0x5f,0x82,0xfc,0xfd,0xff,0xff]
+ vmaxss -516(%rdx), %xmm4, %xmm8
+
+// CHECK: vminpd {sae}, %zmm22, %zmm6, %zmm6
+// CHECK: encoding: [0x62,0xb1,0xcd,0x18,0x5d,0xf6]
+ vminpd {sae}, %zmm22, %zmm6, %zmm6
+
+// CHECK: vminps {sae}, %zmm7, %zmm3, %zmm3
+// CHECK: encoding: [0x62,0xf1,0x64,0x18,0x5d,0xdf]
+ vminps {sae}, %zmm7, %zmm3, %zmm3
+
+// CHECK: vminsd %xmm26, %xmm25, %xmm5
+// CHECK: encoding: [0x62,0x91,0xb7,0x00,0x5d,0xea]
+ vminsd %xmm26, %xmm25, %xmm5
+
+// CHECK: vminsd %xmm26, %xmm25, %xmm5 {%k3}
+// CHECK: encoding: [0x62,0x91,0xb7,0x03,0x5d,0xea]
+ vminsd %xmm26, %xmm25, %xmm5 {%k3}
+
+// CHECK: vminsd %xmm26, %xmm25, %xmm5 {%k3} {z}
+// CHECK: encoding: [0x62,0x91,0xb7,0x83,0x5d,0xea]
+ vminsd %xmm26, %xmm25, %xmm5 {%k3} {z}
+
+// CHECK: vminsd {sae}, %xmm26, %xmm25, %xmm5
+// CHECK: encoding: [0x62,0x91,0xb7,0x10,0x5d,0xea]
+ vminsd {sae}, %xmm26, %xmm25, %xmm5
+
+// CHECK: vminsd (%rcx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x29]
+ vminsd (%rcx), %xmm25, %xmm5
+
+// CHECK: vminsd 291(%rax,%r14,8), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xb1,0xb7,0x00,0x5d,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vminsd 291(%rax,%r14,8), %xmm25, %xmm5
+
+// CHECK: vminsd 1016(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x6a,0x7f]
+ vminsd 1016(%rdx), %xmm25, %xmm5
+
+// CHECK: vminsd 1024(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0xaa,0x00,0x04,0x00,0x00]
+ vminsd 1024(%rdx), %xmm25, %xmm5
+
+// CHECK: vminsd -1024(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0x6a,0x80]
+ vminsd -1024(%rdx), %xmm25, %xmm5
+
+// CHECK: vminsd -1032(%rdx), %xmm25, %xmm5
+// CHECK: encoding: [0x62,0xf1,0xb7,0x00,0x5d,0xaa,0xf8,0xfb,0xff,0xff]
+ vminsd -1032(%rdx), %xmm25, %xmm5
+
+// CHECK: vminss %xmm19, %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x31,0x76,0x00,0x5d,0xd3]
+ vminss %xmm19, %xmm17, %xmm10
+
+// CHECK: vminss %xmm19, %xmm17, %xmm10 {%k5}
+// CHECK: encoding: [0x62,0x31,0x76,0x05,0x5d,0xd3]
+ vminss %xmm19, %xmm17, %xmm10 {%k5}
+
+// CHECK: vminss %xmm19, %xmm17, %xmm10 {%k5} {z}
+// CHECK: encoding: [0x62,0x31,0x76,0x85,0x5d,0xd3]
+ vminss %xmm19, %xmm17, %xmm10 {%k5} {z}
+
+// CHECK: vminss {sae}, %xmm19, %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x31,0x76,0x10,0x5d,0xd3]
+ vminss {sae}, %xmm19, %xmm17, %xmm10
+
+// CHECK: vminss (%rcx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x11]
+ vminss (%rcx), %xmm17, %xmm10
+
+// CHECK: vminss 291(%rax,%r14,8), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x31,0x76,0x00,0x5d,0x94,0xf0,0x23,0x01,0x00,0x00]
+ vminss 291(%rax,%r14,8), %xmm17, %xmm10
+
+// CHECK: vminss 508(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x52,0x7f]
+ vminss 508(%rdx), %xmm17, %xmm10
+
+// CHECK: vminss 512(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x92,0x00,0x02,0x00,0x00]
+ vminss 512(%rdx), %xmm17, %xmm10
+
+// CHECK: vminss -512(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x52,0x80]
+ vminss -512(%rdx), %xmm17, %xmm10
+
+// CHECK: vminss -516(%rdx), %xmm17, %xmm10
+// CHECK: encoding: [0x62,0x71,0x76,0x00,0x5d,0x92,0xfc,0xfd,0xff,0xff]
+ vminss -516(%rdx), %xmm17, %xmm10
+
+// vmulpd (packed double, zmm) with embedded rounding control.
+// EVEX byte3 encodes b=1 plus RC in L'L: rn=0x18, rd=0x38, ru=0x58, rz=0x78.
+// CHECK: vmulpd {rn-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x18,0x59,0xc7]
+          vmulpd {rn-sae}, %zmm23, %zmm4, %zmm24
+
+// CHECK: vmulpd {ru-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x58,0x59,0xc7]
+          vmulpd {ru-sae}, %zmm23, %zmm4, %zmm24
+
+// CHECK: vmulpd {rd-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x38,0x59,0xc7]
+          vmulpd {rd-sae}, %zmm23, %zmm4, %zmm24
+
+// CHECK: vmulpd {rz-sae}, %zmm23, %zmm4, %zmm24
+// CHECK: encoding: [0x62,0x21,0xdd,0x78,0x59,0xc7]
+          vmulpd {rz-sae}, %zmm23, %zmm4, %zmm24
+
+// vmulps (packed single, zmm) with embedded rounding control — same
+// byte3 RC values as above, W=0/pp=none selects the PS form.
+// CHECK: vmulps {rn-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x18,0x59,0xd8]
+          vmulps {rn-sae}, %zmm24, %zmm6, %zmm3
+
+// CHECK: vmulps {ru-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x58,0x59,0xd8]
+          vmulps {ru-sae}, %zmm24, %zmm6, %zmm3
+
+// CHECK: vmulps {rd-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x38,0x59,0xd8]
+          vmulps {rd-sae}, %zmm24, %zmm6, %zmm3
+
+// CHECK: vmulps {rz-sae}, %zmm24, %zmm6, %zmm3
+// CHECK: encoding: [0x62,0x91,0x4c,0x78,0x59,0xd8]
+          vmulps {rz-sae}, %zmm24, %zmm6, %zmm3
+
+// vmulsd register forms: plain, merge-masked {%k2}, and zero-masked
+// {%k2} {z}. Mask register goes in EVEX byte3 aaa bits; {z} sets bit 7.
+// CHECK: vmulsd %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x08,0x59,0xea]
+          vmulsd %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd %xmm18, %xmm4, %xmm13 {%k2}
+// CHECK: encoding: [0x62,0x31,0xdf,0x0a,0x59,0xea]
+          vmulsd %xmm18, %xmm4, %xmm13 {%k2}
+
+// CHECK: vmulsd %xmm18, %xmm4, %xmm13 {%k2} {z}
+// CHECK: encoding: [0x62,0x31,0xdf,0x8a,0x59,0xea]
+          vmulsd %xmm18, %xmm4, %xmm13 {%k2} {z}
+
+// vmulsd with embedded rounding (scalar RC uses the same byte3 values).
+// CHECK: vmulsd {rn-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x18,0x59,0xea]
+          vmulsd {rn-sae}, %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd {ru-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x58,0x59,0xea]
+          vmulsd {ru-sae}, %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd {rd-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x38,0x59,0xea]
+          vmulsd {rd-sae}, %xmm18, %xmm4, %xmm13
+
+// CHECK: vmulsd {rz-sae}, %xmm18, %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x78,0x59,0xea]
+          vmulsd {rz-sae}, %xmm18, %xmm4, %xmm13
+
+// vmulsd memory forms. Offsets are chosen around the EVEX disp8*N
+// compression boundary for an 8-byte element: +/-1016 = 127*8 still
+// fits a compressed disp8 (0x7f / 0x80), while +/-1024 forces disp32.
+// CHECK: vmulsd (%rcx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x29]
+          vmulsd (%rcx), %xmm4, %xmm13
+
+// CHECK: vmulsd 291(%rax,%r14,8), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x31,0xdf,0x08,0x59,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmulsd 291(%rax,%r14,8), %xmm4, %xmm13
+
+// CHECK: vmulsd 1016(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x6a,0x7f]
+          vmulsd 1016(%rdx), %xmm4, %xmm13
+
+// CHECK: vmulsd 1024(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0xaa,0x00,0x04,0x00,0x00]
+          vmulsd 1024(%rdx), %xmm4, %xmm13
+
+// CHECK: vmulsd -1024(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0x6a,0x80]
+          vmulsd -1024(%rdx), %xmm4, %xmm13
+
+// CHECK: vmulsd -1032(%rdx), %xmm4, %xmm13
+// CHECK: encoding: [0x62,0x71,0xdf,0x08,0x59,0xaa,0xf8,0xfb,0xff,0xff]
+          vmulsd -1032(%rdx), %xmm4, %xmm13
+
+// vmulss register forms: plain, merge-masked {%k4}, and zero-masked.
+// CHECK: vmulss %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x08,0x59,0xf6]
+          vmulss %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss %xmm14, %xmm10, %xmm22 {%k4}
+// CHECK: encoding: [0x62,0xc1,0x2e,0x0c,0x59,0xf6]
+          vmulss %xmm14, %xmm10, %xmm22 {%k4}
+
+// CHECK: vmulss %xmm14, %xmm10, %xmm22 {%k4} {z}
+// CHECK: encoding: [0x62,0xc1,0x2e,0x8c,0x59,0xf6]
+          vmulss %xmm14, %xmm10, %xmm22 {%k4} {z}
+
+// vmulss with embedded rounding.
+// CHECK: vmulss {rn-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x18,0x59,0xf6]
+          vmulss {rn-sae}, %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss {ru-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x58,0x59,0xf6]
+          vmulss {ru-sae}, %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss {rd-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x38,0x59,0xf6]
+          vmulss {rd-sae}, %xmm14, %xmm10, %xmm22
+
+// CHECK: vmulss {rz-sae}, %xmm14, %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x2e,0x78,0x59,0xf6]
+          vmulss {rz-sae}, %xmm14, %xmm10, %xmm22
+
+// vmulss memory forms. disp8*N boundary for a 4-byte element:
+// +/-508 = 127*4 compresses to disp8 (0x7f / 0x80); +/-512 needs disp32.
+// CHECK: vmulss (%rcx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x31]
+          vmulss (%rcx), %xmm10, %xmm22
+
+// CHECK: vmulss 291(%rax,%r14,8), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xa1,0x2e,0x08,0x59,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmulss 291(%rax,%r14,8), %xmm10, %xmm22
+
+// CHECK: vmulss 508(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x72,0x7f]
+          vmulss 508(%rdx), %xmm10, %xmm22
+
+// CHECK: vmulss 512(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0xb2,0x00,0x02,0x00,0x00]
+          vmulss 512(%rdx), %xmm10, %xmm22
+
+// CHECK: vmulss -512(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0x72,0x80]
+          vmulss -512(%rdx), %xmm10, %xmm22
+
+// CHECK: vmulss -516(%rdx), %xmm10, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x2e,0x08,0x59,0xb2,0xfc,0xfd,0xff,0xff]
+          vmulss -516(%rdx), %xmm10, %xmm22