author     Søren Sandmann Pedersen <ssp@redhat.com>   2013-02-20 17:33:03 -0500
committer  Søren Sandmann Pedersen <ssp@redhat.com>   2013-02-20 17:33:03 -0500
commit     655a156a424da6ae409cc4311ac72057d44fb87b
tree       737b12480e762fc37474985715eab66a1ad282ec
parent     64535c7dabb4cd3dac639c659feec49cd4548152
Organize instruction table by SSE level
-rw-r--r--  simplex86.c  335
1 file changed, 173 insertions(+), 162 deletions(-)
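A note on the VEX(L,prefix,map,W) notation used throughout the diff below: the four fields name the vector length (128/256, or LIG when length is ignored), the implied SIMD prefix byte (0x00, 0x66, 0xf3, 0xf2), the opcode map (0x0f, 0x0f38, 0x0f3a), and the W bit (W0, W1, or WIG when ignored). As a minimal sketch of what those columns mean, here is how the fields pack into a three-byte VEX prefix. This is standard x86 encoding; emit_vex3 is a hypothetical helper written for illustration, not a function from simplex86.c:

/* Hypothetical sketch: pack the VEX(L, prefix, map, W) fields from the
 * table below into a three-byte VEX prefix (0xc4 ...). */
#include <stdint.h>
#include <stdio.h>

static void
emit_vex3 (uint8_t out[3],
           int l256,          /* 1 for VEX(256,...), 0 for VEX(128,...) */
           uint8_t prefix,    /* 0x00, 0x66, 0xf3 or 0xf2 */
           unsigned map,      /* 0x0f, 0x0f38 or 0x0f3a */
           int w,             /* the W bit; WIG means either value works */
           int vvvv)          /* second source register number */
{
    uint8_t mmmmm = (map == 0x0f) ? 1 : (map == 0x0f38) ? 2 : 3;
    uint8_t pp = (prefix == 0x66) ? 1 : (prefix == 0xf3) ? 2 :
                 (prefix == 0xf2) ? 3 : 0;

    out[0] = 0xc4;
    /* R, X and B are inverted extension bits; 0xe0 leaves them unused */
    out[1] = 0xe0 | mmmmm;
    /* vvvv is stored inverted in bits 6:3 */
    out[2] = (w << 7) | ((~vvvv & 0xf) << 3) | (l256 << 2) | pp;
}

int
main (void)
{
    uint8_t v[3];
    /* VEX(128,0x66,0x0f38,W0) with vvvv = xmm0, as in the vmaskmovps rows */
    emit_vex3 (v, 0, 0x66, 0x0f38, 0, 0);
    printf ("%02x %02x %02x\n", v[0], v[1], v[2]); /* prints: c4 e2 79 */
    return 0;
}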
diff --git a/simplex86.c b/simplex86.c
index 8035b0a..512fba6 100644
--- a/simplex86.c
+++ b/simplex86.c
@@ -231,34 +231,6 @@ static const variant_t variants[] =
      * different name, and movabs is what GNU as uses. */
     { I_movabs, { A_R64, A_I32, A_I32 }, F_386, E_O, 0xb8 },
-    { I_movq, { A_MMX, A_MMXM }, F_MMX, E_RM, 0x0f6f, NO_REX_W },
-    { I_movq, { A_MMXM, A_MMX }, F_MMX, E_MR, 0x0f7f, NO_REX_W },
-    { I_movq, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f7e, PRE_F3 | NO_REX_W },
-    { I_movq, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0fd6, PRE_66 | NO_REX_W },
-    { I_movq2dq, { A_SSE, A_MMX }, F_SSE2, E_RM, 0x0fd6, PRE_F3 | NO_REX_W },
-    { I_movdqa, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f6f, PRE_66 },
-    { I_movdqa, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0f7f, PRE_66 },
-    { I_vmovdqa, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x6f, VEX(128,0x66,0x0f,WIG) },
-    { I_vmovdqa, { A_SSEM, A_SSE }, F_AVX, E_MR, 0x7f, VEX(128,0x66,0x0f,WIG) },
-    { I_vmovdqa, { A_AVX, A_AVXM }, F_AVX, E_RM, 0x6f, VEX(256,0x66,0x0f,WIG) },
-    { I_vmovdqa, { A_AVXM, A_AVX }, F_AVX, E_MR, 0x7f, VEX(256,0x66,0x0f,WIG) },
-    { I_movdqu, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f6f, PRE_F3 },
-    { I_movdqu, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0f7f, PRE_F3 },
-    { I_vmovdqu, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x6f, VEX(128,0xf3,0x0f,WIG) },
-    { I_vmovdqu, { A_SSEM, A_SSE }, F_AVX, E_MR, 0x7f, VEX(128,0xf3,0x0f,WIG) },
-    { I_vmovdqu, { A_AVX, A_AVXM }, F_AVX, E_RM, 0x6f, VEX(256,0xf3,0x0f,WIG) },
-    { I_vmovdqu, { A_AVXM, A_AVX }, F_AVX, E_MR, 0x7f, VEX(256,0xf3,0x0f,WIG) },
-    { I_lddqu, { A_SSE, A_MEM }, F_SSE3, E_RM, 0x0ff0, PRE_F2 },
-    { I_vlddqu, { A_SSE, A_MEM }, F_AVX, E_RM, 0xf0, VEX(128,0xf2,0x0f,WIG) },
-    { I_vlddqu, { A_AVX, A_MEM }, F_AVX, E_RM, 0xf0, VEX(256,0xf2,0x0f,WIG) },
-    { I_vmaskmovps, { A_SSE, A_SSE, A_MEM }, F_AVX, E_RVM, 0x2c, VEX(128,0x66,0x0f38,W0) },
-    { I_vmaskmovps, { A_AVX, A_AVX, A_MEM }, F_AVX, E_RVM, 0x2c, VEX(256,0x66,0x0f38,W0) },
-    { I_vmaskmovps, { A_MEM, A_SSE, A_SSE }, F_AVX, E_MVR, 0x2e, VEX(128,0x66,0x0f38,W0) },
-    { I_vmaskmovps, { A_MEM, A_AVX, A_AVX }, F_AVX, E_MVR, 0x2e, VEX(256,0x66,0x0f38,W0) },
-    { I_vmaskmovpd, { A_SSE, A_SSE, A_MEM }, F_AVX, E_RVM, 0x2d, VEX(128,0x66,0x0f38,W0) },
-    { I_vmaskmovpd, { A_AVX, A_AVX, A_MEM }, F_AVX, E_RVM, 0x2d, VEX(256,0x66,0x0f38,W0) },
-    { I_vmaskmovpd, { A_MEM, A_SSE, A_SSE }, F_AVX, E_MVR, 0x2f, VEX(128,0x66,0x0f38,W0) },
-    { I_vmaskmovpd, { A_MEM, A_AVX, A_AVX }, F_AVX, E_MVR, 0x2f, VEX(256,0x66,0x0f38,W0) },
 
 #define ALU_OPS(name, opc) \
     { name, { A_RM8, A_R8 }, F_386, E_MR, (opc << 3) + 0 }, \
@@ -394,91 +366,11 @@ static const variant_t variants[] =
     { I_imul3, { A_R32, A_RM32, A_I32 }, F_386, E_RM, 0x69 },
     { I_imul3, { A_R64, A_RM64, A_I32 }, F_386, E_RM, 0x69 },
 
-    /* SIMD instructions */
-    { I_palignr, { A_MMX, A_MMXM, A_I8 }, F_SSSE3, E_RM, 0x0f3a0f },
-    { I_palignr, { A_SSE, A_SSEM, A_I8 }, F_SSSE3, E_RM, 0x0f3a0f, PRE_66 },
-    { I_vpalignr, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0f, VEX(128,0x66,0x0f3a,WIG) },
-
-#define SIMD_FLOAT_OPS(name, opc) \
-    { I_##name##ps, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc) }, \
-    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x00,0x0f,WIG) }, \
-    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, opc, VEX(256,0x00,0x0f,WIG) }, \
-    { I_##name##pd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
-    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
-    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }, \
-    { I_##name##ss, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 }, \
-    { I_v##name##ss, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(LIG,0xf3,0x0f,WIG) }, \
-    { I_##name##sd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 }, \
-    { I_v##name##sd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(LIG,0xf2,0x0f,WIG) }
-
-    SIMD_FLOAT_OPS (add, 0x58),
-    SIMD_FLOAT_OPS (and, 0x54),
-    SIMD_FLOAT_OPS (andn, 0x55),
-    SIMD_FLOAT_OPS (div, 0x5e),
-    SIMD_FLOAT_OPS (max, 0x5f),
-    SIMD_FLOAT_OPS (min, 0x5d),
-    SIMD_FLOAT_OPS (mul, 0x59),
-    SIMD_FLOAT_OPS (or, 0x56),
-    SIMD_FLOAT_OPS (sub, 0x5c),
-    SIMD_FLOAT_OPS (xor, 0x57),
-
-    { I_hsubps, { A_SSE, A_SSEM }, F_SSE3, E_RM, 0x0f7d, PRE_F2 },
-    { I_vhsubps, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x7d, VEX(128,0xf2,0x0f,WIG) },
-    { I_vhsubps, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, 0x7d, VEX(256,0Xf2,0x0f,WIG) },
-    { I_hsubpd, { A_SSE, A_SSEM }, F_SSE3, E_RM, 0x0f7d, PRE_66 },
-    { I_vhsubpd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x7d, VEX(128,0x66,0x0f,WIG) },
-    { I_vhsubpd, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, 0x7d, VEX(256,0x66,0x0f,WIG) },
-
-#define SIMD_FLOAT_OPS_IMM(name, opc) \
-    { I_##name##ps, { A_SSE, A_SSEM, A_I8 }, F_SSE, E_RM, (0x0f00 + opc) }, \
-    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x00,0x0f,WIG) }, \
-    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x00,0x0f,WIG) }, \
-    { I_##name##pd, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
-    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
-    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }, \
-    { I_##name##ss, { A_SSE, A_SSEM, A_I8 }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 }, \
-    { I_v##name##ss, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(LIG,0xf3,0x0f,WIG) }, \
-    { I_##name##sd, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 }, \
-    { I_v##name##sd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(LIG,0xf2,0x0f,WIG) }
-
-    SIMD_FLOAT_OPS_IMM (cmp, 0xc2),
-    SIMD_FLOAT_OPS_IMM (shuf, 0xc6),
-
-#define SIMD_FLOAT_OPS_UNARY_SINGLE(name, opc) \
-    { I_##name##ps, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc) }, \
-    { I_v##name##ps, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x00,0x0f,WIG) }, \
-    { I_v##name##ps, { A_AVX, A_AVXM }, F_AVX, E_RM, opc, VEX(256,0x00,0x0f,WIG) }, \
-    { I_##name##ss, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 }, \
-    { I_v##name##ss, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(LIG,0xf3,0x0f,WIG) }
-
-#define SIMD_FLOAT_OPS_UNARY_DOUBLE(name, opc) \
-    { I_##name##pd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
-    { I_v##name##pd, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x66,0x0f,WIG) }, \
-    { I_v##name##pd, { A_AVX, A_AVXM }, F_AVX, E_RM, opc, VEX(256,0x66,0x0f,WIG) }, \
-    { I_##name##sd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 }, \
-    { I_v##name##sd, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(LIG,0xf2,0x0f,WIG) }
-
-    SIMD_FLOAT_OPS_UNARY_SINGLE (sqrt, 0x51),
-    SIMD_FLOAT_OPS_UNARY_DOUBLE (sqrt, 0x51),
-    SIMD_FLOAT_OPS_UNARY_SINGLE (rsqrt, 0x52),
-    SIMD_FLOAT_OPS_UNARY_SINGLE (rcp, 0x53),
-
-#define SIMD_FLOAT_OPSi(name, opc) \
-    { I_##name##ps, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, (0x0f3a00 + opc), PRE_66 }, \
-    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f3a,WIG) }, \
-    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f3a,WIG) }, \
-    \
-    { I_##name##pd, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, (0x0f3a00 + opc + 1), PRE_66 }, \
-    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc + 1, VEX(128,0x66,0x0f3a,WIG) }, \
-    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc + 1, VEX(256,0x66,0x0f3a,WIG) }
-
-    SIMD_FLOAT_OPSi (blend, 0x0c),
-
-    SIMD_FLOAT_OPSi (round, 0x08),
-    { I_roundss, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, 0x0f3a0a, PRE_66 },
-    { I_vroundss, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0a, VEX(LIG,0x66,0x0f3a,WIG) },
-    { I_roundsd, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, 0x0f3a0b, PRE_66 },
-    { I_vroundsd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0b, VEX(LIG,0x66,0x0f3a,WIG) },
+    /* MMX */
+    { I_movq, { A_MMX, A_MMXM }, F_MMX, E_RM, 0x0f6f, NO_REX_W },
+    { I_movq, { A_MMXM, A_MMX }, F_MMX, E_MR, 0x0f7f, NO_REX_W },
+    { I_movq, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f7e, PRE_F3 | NO_REX_W },
+    { I_movq, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0fd6, PRE_66 | NO_REX_W },
 
 #define MMX_TRINARY(name, opc) \
     { I_##name, { A_MMX, A_MMXM }, F_MMX, E_RM, (0x0f00 + opc) }, \
@@ -542,42 +434,147 @@ static const variant_t variants[] =
     MMX_SHIFT (psraw, 0xe1, 4),
     MMX_SHIFT (psrad, 0xe2, 4),
 
-#define SIMD_0F_OPS(name, opc, mmx_flag) \
-    { I_##name, { A_MMX, A_MMXM }, mmx_flag, E_RM, (0x0f00 + opc) }, \
+    /* pmaddubsw is special because it wasn't extended to SSE registers until SSSE3 */
+    { I_pmaddubsw, { A_MMX, A_MMXM }, F_MMX, E_RM, 0x0f3804 },
+    { I_pmaddubsw, { A_SSE, A_SSEM }, F_SSSE3, E_RM, 0x0f3804, PRE_66 },
+    { I_vpmaddubsw, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x04, VEX(128,0x66,0x0f38,WIG) },
+    { I_vpmaddubsw, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, 0x04, VEX(256,0x66,0x0f38,WIG) },
+
+    /* AMD's MMX extensions */
+#define MMX_EX_TRINARY(name, opc) \
+    { I_##name, { A_MMX, A_MMXM }, F_MMX_EX, E_RM, (0x0f00 + opc) }, \
     { I_##name, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
     { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
     { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }
-
-    /* AMD's MMX extensions */
-    SIMD_0F_OPS (pavgb, 0xe0, F_MMX_EX),
-    SIMD_0F_OPS (pavgw, 0xe3, F_MMX_EX),
-    SIMD_0F_OPS (pmaxsw, 0xee, F_MMX_EX),
-    SIMD_0F_OPS (pmaxub, 0xde, F_MMX_EX),
-    SIMD_0F_OPS (pminsw, 0xea, F_MMX_EX),
-    SIMD_0F_OPS (pminub, 0xda, F_MMX_EX),
-    SIMD_0F_OPS (pmulhuw, 0xe4, F_MMX_EX),
-    SIMD_0F_OPS (psadbw, 0xf6, F_MMX_EX),
-
+
+    MMX_EX_TRINARY (pavgb, 0xe0),
+    MMX_EX_TRINARY (pavgw, 0xe3),
+    MMX_EX_TRINARY (pmaxsw, 0xee),
+    MMX_EX_TRINARY (pmaxub, 0xde),
+    MMX_EX_TRINARY (pminsw, 0xea),
+    MMX_EX_TRINARY (pminub, 0xda),
+    MMX_EX_TRINARY (pmulhuw, 0xe4),
+    MMX_EX_TRINARY (psadbw, 0xf6),
+
+    /* pinsrw, pextrw are special because they don't have AVX2 variants and
+     * because SSE4.1 added new variants of pextrw. pmovmskb also doesn't
+     * have an AVX2 version, and it is binary.
+     */
     { I_pinsrw, { A_MMX, A_RM32, A_I8 }, F_MMX_EX, E_RM, 0x0fc4 },
     { I_pinsrw, { A_SSE, A_RM32, A_I8 }, F_SSE2, E_RM, 0x0fc4, PRE_66 },
     { I_vpinsrw, { A_SSE, A_SSE, A_RM32, A_I8 }, F_AVX, E_RVM, 0xc4, VEX(128,0x66,0x0f,W0) },
-    /* No AVX2 version for pinsrw */
 
     { I_pextrw, { A_R, A_MMX, A_I8 }, F_MMX_EX, E_RM, 0x0fc5 },
     { I_pextrw, { A_R, A_SSE, A_I8 }, F_SSE2, E_RM, 0x0fc5, PRE_66 },
     { I_pextrw, { A_RM, A_SSE, A_I8 }, F_SSE41, E_MR, 0x0f3a15, PRE_66 },
     { I_vpextrw, { A_R, A_AVX, A_I8 }, F_AVX, E_RM, 0xc5, VEX(128,0x66,0x0f,W0) },
     { I_vpextrw, { A_RM, A_AVX, A_I8 }, F_AVX, E_MR, 0x15, VEX(128,0x66,0x0f3a,W0) },
-    /* No AVX2 version for pextrw */
-
     { I_pmovmskb, { A_R, A_MMX }, F_MMX_EX, E_RM, 0x0fd7 },
     { I_pmovmskb, { A_R, A_SSE }, F_SSE2, E_RM, 0x0fd7, PRE_66 },
     { I_vpmovmskb, { A_R, A_SSE }, F_AVX, E_RM, 0xd7, VEX(128,0x66,0x0f,WIG) },
 
-    /* MMX version appeared with SSE2 */
-    SIMD_0F_OPS (paddq, 0xd4, F_SSE2),
-    SIMD_0F_OPS (psubq, 0xfb, F_SSE2),
-    SIMD_0F_OPS (pmuludq, 0xf4, F_SSE2),
+    /* SSE */
+#define SSE_TRINARY(name, opc) \
+    { I_##name##ps, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc) }, \
+    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x00,0x0f,WIG) }, \
+    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, opc, VEX(256,0x00,0x0f,WIG) }, \
+    { I_##name##pd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
+    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
+    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }, \
+    { I_##name##ss, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 }, \
+    { I_v##name##ss, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(LIG,0xf3,0x0f,WIG) }, \
+    { I_##name##sd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 }, \
+    { I_v##name##sd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(LIG,0xf2,0x0f,WIG) }
+
+    SSE_TRINARY (add, 0x58),
+    SSE_TRINARY (and, 0x54),
+    SSE_TRINARY (andn, 0x55),
+    SSE_TRINARY (div, 0x5e),
+    SSE_TRINARY (max, 0x5f),
+    SSE_TRINARY (min, 0x5d),
+    SSE_TRINARY (mul, 0x59),
+    SSE_TRINARY (or, 0x56),
+    SSE_TRINARY (sub, 0x5c),
+    SSE_TRINARY (xor, 0x57),
+
+#define SSE_TRINARY_IMM(name, opc) \
+    { I_##name##ps, { A_SSE, A_SSEM, A_I8 }, F_SSE, E_RM, (0x0f00 + opc) }, \
+    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x00,0x0f,WIG) }, \
+    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x00,0x0f,WIG) }, \
+    { I_##name##pd, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
+    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
+    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }, \
+    { I_##name##ss, { A_SSE, A_SSEM, A_I8 }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 }, \
+    { I_v##name##ss, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(LIG,0xf3,0x0f,WIG) }, \
+    { I_##name##sd, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 }, \
+    { I_v##name##sd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(LIG,0xf2,0x0f,WIG) }
+
+    SSE_TRINARY_IMM (cmp, 0xc2),
+    SSE_TRINARY_IMM (shuf, 0xc6),
+
+#define SSE_UNARY_SINGLE(name, opc) \
+    { I_##name##ps, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc) }, \
+    { I_v##name##ps, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x00,0x0f,WIG) }, \
+    { I_v##name##ps, { A_AVX, A_AVXM }, F_AVX, E_RM, opc, VEX(256,0x00,0x0f,WIG) }, \
+    { I_##name##ss, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 }, \
+    { I_v##name##ss, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(LIG,0xf3,0x0f,WIG) }
+
+#define SSE_UNARY_DOUBLE(name, opc) \
+    { I_##name##pd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
+    { I_v##name##pd, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x66,0x0f,WIG) }, \
+    { I_v##name##pd, { A_AVX, A_AVXM }, F_AVX, E_RM, opc, VEX(256,0x66,0x0f,WIG) }, \
+    { I_##name##sd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 }, \
+    { I_v##name##sd, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(LIG,0xf2,0x0f,WIG) }
+
+    SSE_UNARY_SINGLE (sqrt, 0x51),
+    SSE_UNARY_DOUBLE (sqrt, 0x51),
+    SSE_UNARY_SINGLE (rsqrt, 0x52),
+    SSE_UNARY_SINGLE (rcp, 0x53),
+
+    /* SSE 2 */
+    { I_movq2dq, { A_SSE, A_MMX }, F_SSE2, E_RM, 0x0fd6, PRE_F3 | NO_REX_W },
+    { I_movdqa, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f6f, PRE_66 },
+    { I_movdqa, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0f7f, PRE_66 },
+    { I_vmovdqa, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x6f, VEX(128,0x66,0x0f,WIG) },
+    { I_vmovdqa, { A_SSEM, A_SSE }, F_AVX, E_MR, 0x7f, VEX(128,0x66,0x0f,WIG) },
+    { I_vmovdqa, { A_AVX, A_AVXM }, F_AVX, E_RM, 0x6f, VEX(256,0x66,0x0f,WIG) },
+    { I_vmovdqa, { A_AVXM, A_AVX }, F_AVX, E_MR, 0x7f, VEX(256,0x66,0x0f,WIG) },
+    { I_movdqu, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f6f, PRE_F3 },
+    { I_movdqu, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0f7f, PRE_F3 },
+    { I_vmovdqu, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x6f, VEX(128,0xf3,0x0f,WIG) },
+    { I_vmovdqu, { A_SSEM, A_SSE }, F_AVX, E_MR, 0x7f, VEX(128,0xf3,0x0f,WIG) },
+    { I_vmovdqu, { A_AVX, A_AVXM }, F_AVX, E_RM, 0x6f, VEX(256,0xf3,0x0f,WIG) },
+    { I_vmovdqu, { A_AVXM, A_AVX }, F_AVX, E_MR, 0x7f, VEX(256,0xf3,0x0f,WIG) },
+
+#define SSE2_TRINARY(name, opc) \
+    { I_##name, { A_MMX, A_MMXM }, F_SSE2, E_RM, (0x0f00 + opc) }, \
+    { I_##name, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 }, \
+    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
+    { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }
+
+    SSE2_TRINARY (paddq, 0xd4),
+    SSE2_TRINARY (psubq, 0xfb),
+    SSE2_TRINARY (pmuludq, 0xf4),
+
+    /* These don't have MMX variants */
+    { I_pshufhw, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, 0x0f70, PRE_F3 },
+    { I_vpshufhw, { A_SSE, A_SSEM, A_I8 }, F_AVX, E_RM, 0x70, VEX(128,0xf3,0x0f,WIG) },
+    { I_vpshufhw, { A_AVX, A_AVXM, A_I8 }, F_AVX2, E_RM, 0x70, VEX(256,0xf3,0x0f,WIG) },
+    { I_pshuflw, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, 0x0f70, PRE_F2 },
+    { I_vpshuflw, { A_SSE, A_SSEM, A_I8 }, F_AVX, E_RM, 0x70, VEX(128,0xf2,0x0f,WIG) },
+    { I_vpshuflw, { A_AVX, A_AVXM, A_I8 }, F_AVX2, E_RM, 0x70, VEX(256,0xf2,0x0f,WIG) },
+
+    /* SSE3 */
+    { I_lddqu, { A_SSE, A_MEM }, F_SSE3, E_RM, 0x0ff0, PRE_F2 },
+    { I_vlddqu, { A_SSE, A_MEM }, F_AVX, E_RM, 0xf0, VEX(128,0xf2,0x0f,WIG) },
+    { I_vlddqu, { A_AVX, A_MEM }, F_AVX, E_RM, 0xf0, VEX(256,0xf2,0x0f,WIG) },
+    { I_hsubps, { A_SSE, A_SSEM }, F_SSE3, E_RM, 0x0f7d, PRE_F2 },
+    { I_vhsubps, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x7d, VEX(128,0xf2,0x0f,WIG) },
+    { I_vhsubps, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, 0x7d, VEX(256,0Xf2,0x0f,WIG) },
+    { I_hsubpd, { A_SSE, A_SSEM }, F_SSE3, E_RM, 0x0f7d, PRE_66 },
+    { I_vhsubpd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x7d, VEX(128,0x66,0x0f,WIG) },
+    { I_vhsubpd, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, 0x7d, VEX(256,0x66,0x0f,WIG) },
+
+    /* SSSE 3 */
 #define SSSE3_BINARY(name, opc) \
     { I_##name, { A_MMX, A_MMXM }, F_SSSE3, E_RM, (0x0f3800 + opc) }, \
     { I_##name, { A_SSE, A_SSEM }, F_SSSE3, E_RM, (0x0f3800 + opc), PRE_66 }, \
@@ -591,8 +588,8 @@ static const variant_t variants[] =
 #define SSSE3_TRINARY(name, opc) \
     { I_##name, { A_MMX, A_MMXM }, F_SSSE3, E_RM, (0x0f3800 + opc) }, \
     { I_##name, { A_SSE, A_SSEM }, F_SSSE3, E_RM, (0x0f3800 + opc), PRE_66 }, \
-    { I_v##name, { A_SSE, A_SSEM, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f38,WIG) }, \
-    { I_v##name, { A_AVX, A_SSEM, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f38,WIG) }
+    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f38,WIG) }, \
+    { I_v##name, { A_AVX, A_SSE, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f38,WIG) }
 
     SSSE3_TRINARY (psignb, 0x08),
     SSSE3_TRINARY (psignw, 0x09),
@@ -606,17 +603,21 @@ static const variant_t variants[] =
     SSSE3_TRINARY (pmulhrsw, 0x0b),
     SSSE3_TRINARY (pshufb, 0x00),
 
+    { I_palignr, { A_MMX, A_MMXM, A_I8 }, F_SSSE3, E_RM, 0x0f3a0f },
+    { I_palignr, { A_SSE, A_SSEM, A_I8 }, F_SSSE3, E_RM, 0x0f3a0f, PRE_66 },
+    { I_vpalignr, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0f, VEX(128,0x66,0x0f3a,WIG) },
+
+    /* SSE 4.1 */
 #define SSE41_BINARY(name, opc) \
     { I_##name, { A_SSE, A_SSEM }, F_SSE41, E_RM, (0x0f3800 + opc), PRE_66 }, \
     { I_v##name, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x66,0x0f38,WIG) }, \
     { I_v##name, { A_AVX, A_AVXM }, F_AVX2, E_RM, opc, VEX(256,0x66,0x0f38,WIG) }
 
-    SSE41_BINARY (ptest, 0x17),
-
 #define SSE41_PMOVX(type, opc) \
-    SSE41_BINARY (pmovsx##type, opc), \
+    SSE41_BINARY (pmovsx##type, opc), \
     SSE41_BINARY (pmovzx##type, opc + 0x10)
 
+    SSE41_BINARY (ptest, 0x17),
     SSE41_PMOVX (bw, 0x20),
     SSE41_PMOVX (bd, 0x21),
     SSE41_PMOVX (bq, 0x22),
@@ -624,10 +625,14 @@ static const variant_t variants[] =
     SSE41_PMOVX (wq, 0x24),
     SSE41_PMOVX (dq, 0x25),
 
+    /* No AVX2 variant for phminposuw */
+    { I_phminposuw, { A_SSE, A_SSEM }, F_SSE41, E_RM, 0x0f3841, PRE_66 },
+    { I_vphminposuw, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x41, VEX(128,0x66,0x0f38,WIG) },
+
 #define SSE41_TRINARY(name, opc) \
     { I_##name, { A_SSE, A_SSEM }, F_SSE41, E_RM, (0x0f3800 + opc), PRE_66 }, \
-    { I_v##name, { A_SSE, A_SSEM, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f38,WIG) }, \
-    { I_v##name, { A_AVX, A_SSEM, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f38,WIG) }
+    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f38,WIG) }, \
+    { I_v##name, { A_AVX, A_SSE, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f38,WIG) }
 
     SSE41_TRINARY (packusdw, 0x2b),
     SSE41_TRINARY (pcmpeqq, 0x29),
@@ -642,31 +647,37 @@ static const variant_t variants[] =
     SSE41_TRINARY (pmuldq, 0x28),
     SSE41_TRINARY (pmulld, 0x40),
 
-    { I_phminposuw, { A_SSE, A_SSEM }, F_SSE41, E_RM, 0x0f3841, PRE_66 },
-    { I_vphminposuw, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x41, VEX(128,0x66,0x0f38,WIG) },
-    /* No AVX2 variant for phminposuw */
-
+#define SSE41_TRINARY_IMM(name, opc) \
+    { I_##name##ps, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, (0x0f3a00 + opc), PRE_66 }, \
+    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f3a,WIG) }, \
+    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f3a,WIG) }, \
+    { I_##name##pd, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, (0x0f3a00 + opc + 1), PRE_66 }, \
+    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc + 1, VEX(128,0x66,0x0f3a,WIG) }, \
+    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc + 1, VEX(256,0x66,0x0f3a,WIG) }
-
-    /* SSE 42 */
-#define SSE42_TRINARY(name, opc) \
-    { I_##name, { A_SSE, A_SSEM }, F_SSE42, E_RM, (0x0f3800 + opc), PRE_66 }, \
-    { I_v##name, { A_SSE, A_SSEM, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f38,WIG) }, \
-    { I_v##name, { A_AVX, A_SSEM, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f38,WIG) }
+
+    SSE41_TRINARY_IMM (blend, 0x0c),
+    SSE41_TRINARY_IMM (round, 0x08),
+    { I_roundss, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, 0x0f3a0a, PRE_66 },
+    { I_vroundss, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0a, VEX(LIG,0x66,0x0f3a,WIG) },
+    { I_roundsd, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, 0x0f3a0b, PRE_66 },
+    { I_vroundsd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0b, VEX(LIG,0x66,0x0f3a,WIG) },
 
-    SSE42_TRINARY (pcmpgtq, 0x37),
+    /* SSE 4.2 */
+    { I_pcmpgtq, { A_SSE, A_SSEM }, F_SSE42, E_RM, 0x0f3837, PRE_66 },
+    { I_vpcmpgtq, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x37, VEX(128,0x66,0x0f38,WIG) },
+    { I_vpcmpgtq, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, 0x37, VEX(256,0x66,0x0f38,WIG) },
 
-    { I_pmaddubsw, { A_MMX, A_MMXM }, F_MMX, E_RM, 0x0f3804 },
-    { I_pmaddubsw, { A_SSE, A_SSEM }, F_SSSE3, E_RM, 0x0f3804, PRE_66 },
-    { I_vpmaddubsw, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x04, VEX(128,0x66,0x0f38,WIG) },
-    { I_vpmaddubsw, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, 0x04, VEX(256,0x66,0x0f38,WIG) },
+    /* AVX */
+    { I_vmaskmovps, { A_SSE, A_SSE, A_MEM }, F_AVX, E_RVM, 0x2c, VEX(128,0x66,0x0f38,W0) },
+    { I_vmaskmovps, { A_AVX, A_AVX, A_MEM }, F_AVX, E_RVM, 0x2c, VEX(256,0x66,0x0f38,W0) },
+    { I_vmaskmovps, { A_MEM, A_SSE, A_SSE }, F_AVX, E_MVR, 0x2e, VEX(128,0x66,0x0f38,W0) },
+    { I_vmaskmovps, { A_MEM, A_AVX, A_AVX }, F_AVX, E_MVR, 0x2e, VEX(256,0x66,0x0f38,W0) },
+    { I_vmaskmovpd, { A_SSE, A_SSE, A_MEM }, F_AVX, E_RVM, 0x2d, VEX(128,0x66,0x0f38,W0) },
+    { I_vmaskmovpd, { A_AVX, A_AVX, A_MEM }, F_AVX, E_RVM, 0x2d, VEX(256,0x66,0x0f38,W0) },
+    { I_vmaskmovpd, { A_MEM, A_SSE, A_SSE }, F_AVX, E_MVR, 0x2f, VEX(128,0x66,0x0f38,W0) },
+    { I_vmaskmovpd, { A_MEM, A_AVX, A_AVX }, F_AVX, E_MVR, 0x2f, VEX(256,0x66,0x0f38,W0) },
 
-    { I_pshufhw, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, 0x0f70, PRE_F3 },
-    { I_vpshufhw, { A_SSE, A_SSEM, A_I8 }, F_AVX, E_RM, 0x70, VEX(128,0xf3,0x0f,WIG) },
-    { I_vpshufhw, { A_AVX, A_AVXM, A_I8 }, F_AVX2, E_RM, 0x70, VEX(256,0xf3,0x0f,WIG) },
-    { I_pshuflw, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, 0x0f70, PRE_F2 },
-    { I_vpshuflw, { A_SSE, A_SSEM, A_I8 }, F_AVX, E_RM, 0x70, VEX(128,0xf2,0x0f,WIG) },
-    { I_vpshuflw, { A_AVX, A_AVXM, A_I8 }, F_AVX2, E_RM, 0x70, VEX(256,0xf2,0x0f,WIG) },
-
+    /* CVT16 */
     { I_vcvtph2ps, { A_SSE, A_SSEM }, F_CVT16, E_RM, 0x13, VEX(128,0x66,0x0f38,W0) },
     { I_vcvtph2ps, { A_AVX, A_SSEM }, F_CVT16, E_RM, 0x13, VEX(256,0x66,0x0f38,W0) },
     { I_vcvtps2ph, { A_SSEM, A_AVX, A_I8 }, F_CVT16, E_MR, 0x1d, VEX(256,0x66,0x0f3a,W0) },
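As context for why this reordering is safe: a table like the one above is typically scanned front to back, and the first row whose mnemonic and operand pattern match wins, so grouping rows by SSE level changes nothing as long as each mnemonic's operand patterns stay distinct. A minimal sketch of such a first-match lookup, with hypothetical types standing in for simplex86.c's real variant_t, operand kinds and feature flags:

/* Hypothetical sketch of a first-match lookup over a variant table shaped
 * like the one in this diff. Names mirror the diff, but the types and logic
 * here are illustrative, not simplex86.c's own code. */
#include <stdio.h>
#include <string.h>

typedef enum { A_NONE, A_MMX, A_MMXM, A_SSE, A_SSEM } arg_t;

typedef struct {
    const char *name;      /* mnemonic, e.g. "movq" */
    arg_t args[2];         /* operand pattern */
    unsigned opcode;       /* opcode bytes, e.g. 0x0f6f */
} variant_t;

/* Rows grouped by SSE level, as the commit arranges them; within one
 * mnemonic, distinct operand patterns keep the lookup unambiguous. */
static const variant_t variants[] = {
    /* MMX */
    { "movq", { A_MMX, A_MMXM }, 0x0f6f },
    { "movq", { A_MMXM, A_MMX }, 0x0f7f },
    /* SSE 2 */
    { "movq", { A_SSE, A_SSEM }, 0x0f7e },
    { "movq", { A_SSEM, A_SSE }, 0x0fd6 },
};

static const variant_t *
lookup (const char *name, arg_t a0, arg_t a1)
{
    for (size_t i = 0; i < sizeof (variants) / sizeof (variants[0]); ++i)
    {
        const variant_t *v = &variants[i];

        if (strcmp (v->name, name) == 0 && v->args[0] == a0 && v->args[1] == a1)
            return v;
    }
    return NULL;
}

int
main (void)
{
    const variant_t *v = lookup ("movq", A_SSE, A_SSEM);

    if (v)
        printf ("movq xmm, xmm/m64 -> opcode 0x%x\n", v->opcode); /* 0xf7e */
    return 0;
}

Because no two rows for the same mnemonic share an operand pattern (MMX movq vs. SSE movq above), the lookup returns the same row however the groups are ordered, which is what makes this commit a pure reorganization.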