#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stddef.h>
#include <assert.h>
#include <sys/types.h>

#include "simplex86.h"
#include "code-manager.h"

typedef struct variant_t variant_t;

#ifndef FALSE
#define FALSE 0
#endif

#ifndef TRUE
#define TRUE 1
#endif

/* There are up to 32 different operand types, and
 * arg_type_t is an enum of values whose bits
 * indicate which of the operand types are accepted
 * for this kind of argument.
 */
typedef enum
{
    A_CL     = (1 << OP_CL),
    A_AL     = (1 << OP_AL),
    A_AX     = (1 << OP_AX),
    A_EAX    = (1 << OP_EAX),
    A_RAX    = (1 << OP_RAX),
    A_MEM8   = (1 << OP_MEM8)  | (1 << OP_RIP_REL8),
    A_MEM16  = (1 << OP_MEM16) | (1 << OP_RIP_REL16),
    A_MEM32  = (1 << OP_MEM32) | (1 << OP_RIP_REL32),
    A_MEM64  = (1 << OP_MEM64) | (1 << OP_RIP_REL64),
    A_MSIZE  = A_MEM8 | A_MEM16 | A_MEM32 | A_MEM64,
    A_MEM    = (1 << OP_MEM) | (1 << OP_RIP_REL) | A_MSIZE,
    A_R8     = A_CL | A_AL | (1 << OP_GP8),
    A_R16    = A_AX | (1 << OP_GP16),
    A_R32    = A_EAX | (1 << OP_GP32),
    A_R64    = A_RAX | (1 << OP_GP64),
    A_RM8    = A_R8 | A_MEM8,
    A_RM16   = A_R16 | A_MEM16,
    A_RM32   = A_R32 | A_MEM32,
    A_RM64   = A_R64 | A_MEM64,
    A_R      = A_R16 | A_R32 | A_R64,
    A_RM     = A_R | A_MEM,
    A_MMX    = (1 << OP_MM),
    A_MMXM   = A_MMX | A_MEM,
    A_SSE    = (1 << OP_XMM),
    A_SSEM   = A_SSE | A_MEM,
    A_AVX    = (1 << OP_YMM),
    A_AVXM   = A_AVX | A_MEM,
    A_1      = (1 << OP_1),
    A_I8     = A_1 | (1 << OP_I8),
    A_I16    = A_I8 | (1 << OP_I16),
    A_I32    = A_I16 | (1 << OP_I32),
    A_LABEL  = (1 << OP_LABEL_REF)
} arg_type_t;
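/* For example, an argument declared A_RM32 matches the EAX register
 * (OP_EAX), any 32 bit general-purpose register (OP_GP32), or any
 * 32 bit memory operand, because A_RM32 has the bits for all of those
 * operand types set. Matching an actual operand is then a single
 * bit test:
 *
 *     (A_RM32 & (1 << GET_TYPE (op))) != 0
 *
 * which is exactly what is_type() further down does.
 */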
typedef enum
{
    E_RM,        /* dest in reg, source in regm */
    E_MR,        /* dest in regm, source in reg */
    E_RVM,       /* dest in reg, src1 in VEX, src2 in regm */
    E_MVR,       /* dest in regm, src1 in VEX, src2 in reg */
    E_RMV,       /* dest in reg, src1 in regm, src2 in VEX */
    E_RVMI,      /* RVM, with additional register operand in imm8[7:4] */
    E_RVIM,      /* op1:reg, op2:vex, op3:imm8[7:4], op4:regm */
    E_RVMX,      /* RVM, with opcode extension encoded as immediate */
    E_M,         /* dest in regm, opcode extension in reg */
    E_VM,        /* dest in VEX, src1 in regm, opcode extension in reg */
    E_O,         /* register op added to opcode */
    E_NP,        /* opcode only */
    E_D,         /* immediate data, no opcode */
    E_ANNOTATE,  /* generate an annotation */

    E_N_ENCODINGS
} encoding_type_t;

typedef enum
{
    NONE,
    LABEL,
    JCC,
    JUMP,
    CALL,
    RIP_REF,
    ALIGN,
    FRAGMENT
} annotation_type_t;

typedef enum
{
    F_386       = (1 << 0),
    F_MMX       = (1 << 1),
    F_MMX_EX    = (1 << 2),
    F_SSE       = (1 << 3),
    F_SSE2      = (1 << 4),
    F_SSE3      = (1 << 5),
    F_SSSE3     = (1 << 6),
    F_SSE41     = (1 << 7),
    F_SSE42     = (1 << 8),
    F_AVX       = (1 << 9),
    F_AVX2      = (1 << 10),
    F_OSXSAVE   = (1 << 11),
    F_F16C      = (1 << 12),
    F_MULTINOP  = (1 << 13),
    F_RDTSC     = (1 << 14),
    F_RDTSCP    = (1 << 15),
    F_CLFLUSH   = (1 << 16),
    F_CMOV      = (1 << 17),
    F_CPUID     = (1 << 18),
    F_MOVBE     = (1 << 19),
    F_BMI1      = (1 << 20),
    F_BMI2      = (1 << 21),
    F_LZCNT     = (1 << 22),
    F_POPCNT    = (1 << 23),
    F_TBM       = (1 << 24),
    F_XOP       = (1 << 25),
    F_FMA       = (1 << 26),
    F_FMA4      = (1 << 27),
    F_FXSR      = (1 << 28),
    F_DAZ       = (1 << 29),

    F_LAST,
    F_MASK      = (F_LAST - 1) | (F_LAST - 2)
} feature_t;

struct variant_t
{
    instruction_t    inst;
    arg_type_t       ops[4];
    feature_t        feature;
    encoding_type_t  encoding;
    uint32_t         opcode;

    /* Layout of the info field from lsb to msb:
     *
     *  0: modrm_op     : 4 - fixed register op (/n in the Intel manuals),
     *                        or opcode ext for RVMX
     *  4: mandatory_66 : 1 - if there is a mandatory 0x66 prefix
     *  5: mandatory_f3 : 1 - if there is a mandatory 0xf3 prefix
     *  6: mandatory_f2 : 1 - if there is a mandatory 0xf2 prefix
     *  7: disallow_w   : 1 - if REX.W is disallowed
     *  8: vex_len      : 1 - 0 for 128 bit instruction, 1 for 256
     *  9: vex_prefix   : 2 - 1 for 0x66, 2 for 0xf3, 3 for 0xf2
     * 11: unused       : 2
     * 13: vex_w        : 1 - The W bit for vex prefixes
     * 14: need_vex     : 1 - Whether VEX should be used
     * 15: ann.type     : 3 - The type of annotation generated
     * 18: need_xop     : 1 - Whether XOP should be used
     * 19: vex opcode   : 5 - The m-mmmm bits in VEX
     */
    uint32_t         info;
};

#define PRE_66     (1 << 4)
#define PRE_F3     (1 << 5)
#define PRE_F2     (1 << 6)
#define NO_REX_W   (1 << 7)
#define NEED_VEX   (1 << 14)
#define ANN_CALL   (CALL << 15)
#define ANN_JCC    (JCC << 15)
#define ANN_ALIGN  (ALIGN << 15)
#define ANN_JUMP   (JUMP << 15)
#define ANN_LABEL  (LABEL << 15)
#define NEED_XOP   (1 << 18)

#define GET_OP_EXTENSION(info)                                  \
    ((info) & 0xf)
#define GET_ANNOTATION_TYPE(info)                               \
    (((info) >> 15) & 0x7)

#define XOPVEX(len, prefix, opcodes, w)                         \
    ((((len == 128)? 0 : 1) << 8)               |               \
     (((prefix == 0x66)? 0x01 :                                 \
       (prefix == 0xf3)? 0x02 :                                 \
       (prefix == 0xf2)? 0x03 : 0) << 9)        |               \
     (((opcodes == 0x0f)? 0x01 :                                \
       (opcodes == 0x0f38)? 0x02 :                              \
       (opcodes == 0x0f3a)? 0x03 :                              \
       (opcodes == 0x08)? 0x08 :                                \
       (opcodes == 0x09)? 0x09 :                                \
       (opcodes == 0x0a)? 0x0a : 0) << 19)      |               \
     ((w) << 13))

#define VEX(len, prefix, opcodes, w)                            \
    (XOPVEX (len, prefix, opcodes, w) | NEED_VEX)
#define XOP(len, prefix, opcodes, w)                            \
    (XOPVEX (len, prefix, opcodes, w) | NEED_XOP)

#define GET_VEX_LEN(info)     (((info) >> 8) & 1)
#define GET_VEX_PREFIX(info)  (((info) >> 9) & 3)
#define GET_VEX_OPCODE(info)  (((info) >> 19) & 0x1f)
#define GET_VEX_W(info)       (((info) >> 13) & 1)

#define WIG 0x00
#define W0  0x00
#define W1  0x01

/* For VEX prefixes where L is ignored */
#define LIG 128

/* For VEX prefixes where L must be zero */
#define LZ  128
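/* As an example of the info packing, VEX(128,0x66,0x0f,W0) sets
 * vex_len = 0 (bit 8), vex_prefix = 1 (bits 9-10) for the implied
 * 0x66 prefix, the m-mmmm field to 0x01 (bits 19-23) for the 0x0f
 * opcode map, vex_w = 0 (bit 13), and the NEED_VEX flag (bit 14).
 */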
static const variant_t variants[] =
{
    /* Pseudo-instructions */
    { I_label, { A_LABEL }, F_386, E_ANNOTATE, 0, ANN_LABEL },
    { I_align, { A_I8 }, F_386, E_ANNOTATE, 0, ANN_ALIGN },
    { I_dq, { A_I32, A_I32 }, F_386, E_D },
    { I_dd, { A_I32 }, F_386, E_D },
    { I_dw, { A_I16 }, F_386, E_D },
    { I_db, { A_I8 }, F_386, E_D },

    /* i386 instructions */
    { I_lock, { 0 }, F_386, E_NP, 0xf0 },
    { I_pushf, { 0 }, F_386, E_NP, 0x9c },
    { I_popf, { 0 }, F_386, E_NP, 0x9d },
    { I_nop1, { A_RM16 }, F_MULTINOP, E_M, 0x0f1f },
    { I_nop1, { A_RM32 }, F_MULTINOP, E_M, 0x0f1f },
    { I_ret, { 0 }, F_386, E_NP, 0xc3 },
    { I_nop, { 0 }, F_386, E_NP, 0x90 },
    { I_clc, { 0 }, F_386, E_NP, 0xf8 },
    { I_cld, { 0 }, F_386, E_NP, 0xfc },
    { I_cbw, { 0 }, F_386, E_NP, 0x6698 },
    { I_cwde, { 0 }, F_386, E_NP, 0x98 },
    { I_cdqe, { 0 }, F_386, E_NP, 0x4898 },
    { I_bsf, { A_R, A_RM }, F_386, E_RM, 0x0fbc },
    { I_bsr, { A_R, A_RM }, F_386, E_RM, 0x0fbd },
    { I_push, { A_R }, F_386, E_O, 0x50, NO_REX_W },
    { I_push, { A_RM }, F_386, E_M, 0xff, 6 | NO_REX_W },
    { I_pop, { A_R }, F_386, E_O, 0x58, NO_REX_W },
    { I_xchg, { A_R, A_RM }, F_386, E_RM, 0x87 },
    { I_xchg, { A_RM, A_R }, F_386, E_MR, 0x87 },
    { I_lea, { A_R, A_MEM }, F_386, E_RM, 0x8d },
    { I_mov, { A_R8, A_I8 }, F_386, E_O, 0xb0 },
    { I_mov, { A_R16, A_I16 }, F_386, E_O, 0xb8 },
    { I_mov, { A_R32, A_I32 }, F_386, E_O, 0xb8 },
    { I_mov, { A_RM16, A_I16 }, F_386, E_M, 0xc7, 0 },
    { I_mov, { A_RM32 | A_RM64, A_I32 }, F_386, E_M, 0xc7, 0 },
    { I_mov, { A_RM, A_R }, F_386, E_MR, 0x89 },
    { I_mov, { A_R, A_RM }, F_386, E_RM, 0x8b },

    /* movabs is simply mov in the Intel manuals, but we need
     * three operands to represent a 64 bit immediate, so it has
     * to have a different name, and movabs is what GNU as uses.
     */
    { I_movabs, { A_R64, A_I32, A_I32 }, F_386, E_O, 0xb8 },

#define ALU_OPS(name, opc)                                              \
    { name, { A_AL, A_I8 }, F_386, E_NP, (opc << 3) + 4 },              \
    { name, { A_RM8, A_I8 }, F_386, E_M, 0x80, opc },                   \
    { name, { A_RM16, A_I8 }, F_386, E_M, 0x83, opc },                  \
    { name, { A_RM32, A_I8 }, F_386, E_M, 0x83, opc },                  \
    { name, { A_RM64, A_I8 }, F_386, E_M, 0x83, opc },                  \
    { name, { A_AX, A_I16 }, F_386, E_NP, (opc << 3) + 5 },             \
    { name, { A_RM16, A_I16 }, F_386, E_M, 0x81, opc },                 \
    { name, { A_EAX, A_I32 }, F_386, E_NP, (opc << 3) + 5 },            \
    { name, { A_RAX, A_I32 }, F_386, E_NP, (opc << 3) + 5 },            \
    { name, { A_RM32, A_I32 }, F_386, E_M, 0x81, opc },                 \
    { name, { A_RM64, A_I32 }, F_386, E_M, 0x81, opc },                 \
    { name, { (A_R8 | A_MEM), A_R8 }, F_386, E_MR, (opc << 3) + 0 },    \
    { name, { A_RM, A_R }, F_386, E_MR, (opc << 3) + 1 },               \
    { name, { A_R8, (A_R8 | A_MEM) }, F_386, E_RM, (opc << 3) + 2 },    \
    { name, { A_R, A_RM }, F_386, E_RM, (opc << 3) + 3 }

    ALU_OPS (I_add, 0),
    ALU_OPS (I_or, 1),
    ALU_OPS (I_adc, 2),
    ALU_OPS (I_sbb, 3),
    ALU_OPS (I_and, 4),
    ALU_OPS (I_sub, 5),
    ALU_OPS (I_xor, 6),
    ALU_OPS (I_cmp, 7),

    { I_test, { A_AL, A_I8 }, F_386, E_NP, 0xa8 },
    { I_test, { A_AX, A_I16 }, F_386, E_NP, 0xa9 },
    { I_test, { A_EAX, A_I32 }, F_386, E_NP, 0xa9 },
    { I_test, { A_RAX, A_I32 }, F_386, E_NP, 0xa9 },
    { I_test, { A_RM8, A_I8 }, F_386, E_M, 0xf6, 0 },
    { I_test, { A_RM16, A_I16 }, F_386, E_M, 0xf7, 0 },
    { I_test, { A_RM32, A_I32 }, F_386, E_M, 0xf7, 0 },
    { I_test, { A_RM64, A_I32 }, F_386, E_M, 0xf7, 0 },
    { I_test, { (A_R8 | A_MEM), A_R8 }, F_386, E_MR, 0x84 },
    { I_test, { A_RM, A_R }, F_386, E_MR, 0x85 },
    { I_test, { A_R, A_RM }, F_386, E_RM, 0x85 },

#define SHIFT_OPS(name, opc)                            \
    { name, { A_RM, A_1 }, F_386, E_M, 0xd1, opc },     \
    { name, { A_RM, A_I8 }, F_386, E_M, 0xc1, opc },    \
    { name, { A_RM, A_CL }, F_386, E_M, 0xd3, opc }

    SHIFT_OPS (I_rol, 0),
    SHIFT_OPS (I_ror, 1),
    SHIFT_OPS (I_rcl, 2),
    SHIFT_OPS (I_rcr, 3),
    SHIFT_OPS (I_shl, 4),
    SHIFT_OPS (I_shr, 5),
    SHIFT_OPS (I_sal, 4),
    SHIFT_OPS (I_sar, 7),

#define DBL_SHIFT(name, opc)                                    \
    { name, { A_RM, A_R16, A_I8 }, F_386, E_MR, opc },          \
    { name, { A_RM, A_R16, A_CL }, F_386, E_MR, opc + 1 },      \
    { name, { A_RM, A_R32, A_I8 }, F_386, E_MR, opc },          \
    { name, { A_RM, A_R32, A_CL }, F_386, E_MR, opc + 1 },      \
    { name, { A_RM, A_R64, A_I8 }, F_386, E_MR, opc },          \
    { name, { A_RM, A_R64, A_CL }, F_386, E_MR, opc + 1 }

    DBL_SHIFT (I_shrd, 0x0fac),
    DBL_SHIFT (I_shld, 0x0fa4),

    { I_movzx, { A_R, A_RM8 }, F_386, E_RM, 0x0fb6 },
    { I_movzx, { A_R32 | A_R64, A_RM16 }, F_386, E_RM, 0x0fb7 },
    { I_movsx, { A_R, A_RM8 }, F_386, E_RM, 0x0fbe },
    { I_movsx, { A_R32 | A_R64, A_RM16 }, F_386, E_RM, 0x0fbf },
    { I_movsx, { A_R64, A_RM32 }, F_386, E_RM, 0x63 },

    /* Unconditional jumps */
    { I_jmp, { A_LABEL }, F_386, E_ANNOTATE, 0x0, ANN_JUMP },
    { I_jmp, { A_RM }, F_386, E_M, 0xff, 4 | NO_REX_W },
    { I_call, { A_LABEL }, F_386, E_ANNOTATE, 0x0, ANN_CALL },
    { I_call, { A_RM }, F_386, E_M, 0xff, 2 | NO_REX_W },
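    /* Conditional jumps are emitted as annotations and only get their
     * final encoding in copy_code(): the short form (opcode, rel8 -
     * e.g. 0x74 for je) when the displacement fits in a signed byte,
     * or the near form (0x0f, opcode + 0x10, rel32) otherwise.
     */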
    /* Conditional jumps */
    { I_ja, { A_LABEL }, F_386, E_ANNOTATE, 0x77, ANN_JCC },
    { I_jae, { A_LABEL }, F_386, E_ANNOTATE, 0x73, ANN_JCC },
    { I_jb, { A_LABEL }, F_386, E_ANNOTATE, 0x72, ANN_JCC },
    { I_jbe, { A_LABEL }, F_386, E_ANNOTATE, 0x76, ANN_JCC },
    { I_jc, { A_LABEL }, F_386, E_ANNOTATE, 0x72, ANN_JCC },
    { I_je, { A_LABEL }, F_386, E_ANNOTATE, 0x74, ANN_JCC },
    { I_jg, { A_LABEL }, F_386, E_ANNOTATE, 0x7f, ANN_JCC },
    { I_jge, { A_LABEL }, F_386, E_ANNOTATE, 0x7d, ANN_JCC },
    { I_jl, { A_LABEL }, F_386, E_ANNOTATE, 0x7c, ANN_JCC },
    { I_jle, { A_LABEL }, F_386, E_ANNOTATE, 0x7e, ANN_JCC },
    { I_jna, { A_LABEL }, F_386, E_ANNOTATE, 0x76, ANN_JCC },
    { I_jnae, { A_LABEL }, F_386, E_ANNOTATE, 0x72, ANN_JCC },
    { I_jnb, { A_LABEL }, F_386, E_ANNOTATE, 0x73, ANN_JCC },
    { I_jnbe, { A_LABEL }, F_386, E_ANNOTATE, 0x77, ANN_JCC },
    { I_jnc, { A_LABEL }, F_386, E_ANNOTATE, 0x73, ANN_JCC },
    { I_jne, { A_LABEL }, F_386, E_ANNOTATE, 0x75, ANN_JCC },
    { I_jng, { A_LABEL }, F_386, E_ANNOTATE, 0x7e, ANN_JCC },
    { I_jnge, { A_LABEL }, F_386, E_ANNOTATE, 0x7c, ANN_JCC },
    { I_jnl, { A_LABEL }, F_386, E_ANNOTATE, 0x7d, ANN_JCC },
    { I_jnle, { A_LABEL }, F_386, E_ANNOTATE, 0x7f, ANN_JCC },
    { I_jno, { A_LABEL }, F_386, E_ANNOTATE, 0x71, ANN_JCC },
    { I_jnp, { A_LABEL }, F_386, E_ANNOTATE, 0x7b, ANN_JCC },
    { I_jns, { A_LABEL }, F_386, E_ANNOTATE, 0x79, ANN_JCC },
    { I_jnz, { A_LABEL }, F_386, E_ANNOTATE, 0x75, ANN_JCC },
    { I_jo, { A_LABEL }, F_386, E_ANNOTATE, 0x70, ANN_JCC },
    { I_jp, { A_LABEL }, F_386, E_ANNOTATE, 0x7a, ANN_JCC },
    { I_jpe, { A_LABEL }, F_386, E_ANNOTATE, 0x7a, ANN_JCC },
    { I_jpo, { A_LABEL }, F_386, E_ANNOTATE, 0x7b, ANN_JCC },
    { I_js, { A_LABEL }, F_386, E_ANNOTATE, 0x78, ANN_JCC },
    { I_jz, { A_LABEL }, F_386, E_ANNOTATE, 0x74, ANN_JCC },

    /* Unary ops (neg, not, mul, div) */
#define UNARY(name, op)                                 \
    { I_##name, { A_RM8 }, F_386, E_M, 0xf6, op },      \
    { I_##name, { A_RM16 }, F_386, E_M, 0xf7, op },     \
    { I_##name, { A_RM32 }, F_386, E_M, 0xf7, op },     \
    { I_##name, { A_RM64 }, F_386, E_M, 0xf7, op }

    UNARY (not, 2),
    UNARY (neg, 3),
    UNARY (mul, 4),
    UNARY (imul1, 5),
    UNARY (div, 6),
    UNARY (idiv, 7),

    /* Other imul variants */
    { I_imul2, { A_R, A_RM }, F_386, E_RM, 0x0faf },
    { I_imul3, { A_R, A_RM, A_I8 }, F_386, E_RM, 0x6b },
    { I_imul3, { A_R16, A_RM, A_I16 }, F_386, E_RM, 0x69 },
    { I_imul3, { A_R32 | A_R64, A_RM, A_I32 }, F_386, E_RM, 0x69 },

    /* Misc. extensions. We use F_CPUID for 486 */
    { I_cpuid, { 0 }, F_CPUID, E_NP, 0x0fa2 },
    { I_bswap, { A_R32 | A_R64 }, F_CPUID, E_O, 0x0fc8 },
    { I_clflush, { A_MEM }, F_CLFLUSH, E_M, 0x0fae, 7 },
    { I_rdtsc, { 0 }, F_RDTSC, E_NP, 0x0f31 },
    { I_rdtscp, { 0 }, F_RDTSCP, E_NP, 0x0f01f9 },
    { I_xgetbv, { 0 }, F_OSXSAVE, E_NP, 0x0f01d0 },
    { I_movbe, { A_R, A_MEM }, F_MOVBE, E_RM, 0x0f38f0 },
    { I_movbe, { A_MEM, A_R }, F_MOVBE, E_MR, 0x0f38f1 },
    { I_lzcnt, { A_R, A_RM }, F_LZCNT, E_RM, 0x0fbd, PRE_F3 },
    { I_popcnt, { A_R, A_RM }, F_POPCNT, E_RM, 0x0fb8, PRE_F3 },

    /* MMX */

    /* FIXME: Consider relaxing the RM32/RM64 to just RM and instead
     * add "mandatory REX.W" to I_movq
     */
    { I_movd, { A_MMX, A_RM32 }, F_MMX, E_RM, 0x0f6e },
    { I_movd, { A_RM32, A_MMX }, F_MMX, E_MR, 0x0f7e },
    { I_movd, { A_SSE, A_RM32 }, F_SSE2, E_RM, 0x0f6e, PRE_66 },
    { I_movd, { A_RM32, A_SSE }, F_SSE2, E_MR, 0x0f7e, PRE_66 },
    { I_vmovd, { A_SSE, A_RM32 }, F_AVX, E_RM, 0x6e, VEX(128,0x66,0x0f,W0) },
    { I_vmovd, { A_RM32, A_SSE }, F_AVX, E_MR, 0x7e, VEX(128,0x66,0x0f,W0) },
    { I_movq, { A_MMX, A_RM64 }, F_MMX, E_RM, 0x0f6e },
    { I_movq, { A_RM64, A_MMX }, F_MMX, E_MR, 0x0f7e },
    { I_movq, { A_SSE, A_RM64 }, F_SSE2, E_RM, 0x0f6e, PRE_66 },
    { I_movq, { A_RM64, A_SSE }, F_SSE2, E_MR, 0x0f7e, PRE_66 },
    { I_movq, { A_MMX, A_MMXM }, F_MMX, E_RM, 0x0f6f, NO_REX_W },
    { I_movq, { A_MMXM, A_MMX }, F_MMX, E_MR, 0x0f7f, NO_REX_W },
    { I_movq, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f7e, PRE_F3 | NO_REX_W },
    { I_movq, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0fd6, PRE_66 | NO_REX_W },
    { I_vmovq, { A_SSE, A_RM64 }, F_AVX, E_RM, 0x6e, VEX(128,0x66,0x0f,W1) },
    { I_vmovq, { A_RM64, A_SSE }, F_AVX, E_MR, 0x7e, VEX(128,0x66,0x0f,W1) },
    { I_vmovq, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x7e, VEX(128,0xf3,0x0f,WIG) },
    { I_vmovq, { A_SSEM, A_SSE }, F_AVX, E_MR, 0xd6, VEX(128,0x66,0x0f,WIG) },
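    /* Each *_TRINARY macro below stamps out all the variants of one
     * packed instruction. For example, MMX_TRINARY (paddb, 0xfc)
     * expands to the MMX form (opcode 0x0ffc), the SSE2 form with the
     * mandatory 0x66 prefix, and the 128 and 256 bit VEX-encoded
     * vpaddb forms.
     */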
#define MMX_TRINARY(name, opc)                                                          \
    { I_##name, { A_MMX, A_MMXM }, F_MMX, E_RM, (0x0f00 + opc) },                       \
    { I_##name, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 },              \
    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
    { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }

    MMX_TRINARY (packsswb, 0x63),
    MMX_TRINARY (packssdw, 0x6b),
    MMX_TRINARY (packuswb, 0x67),
    MMX_TRINARY (pand, 0xdb),
    MMX_TRINARY (pandn, 0xdf),
    MMX_TRINARY (paddb, 0xfc),
    MMX_TRINARY (paddw, 0xfd),
    MMX_TRINARY (paddd, 0xfe),
    MMX_TRINARY (por, 0xeb),
    MMX_TRINARY (pxor, 0xef),
    MMX_TRINARY (paddsb, 0xec),
    MMX_TRINARY (paddsw, 0xed),
    MMX_TRINARY (paddusb, 0xdc),
    MMX_TRINARY (paddusw, 0xdd),
    MMX_TRINARY (pcmpeqb, 0x74),
    MMX_TRINARY (pcmpeqw, 0x75),
    MMX_TRINARY (pcmpeqd, 0x76),
    MMX_TRINARY (pcmpgtb, 0x64),
    MMX_TRINARY (pcmpgtw, 0x65),
    MMX_TRINARY (pcmpgtd, 0x66),
    MMX_TRINARY (pmaddwd, 0xf5),
    MMX_TRINARY (pmulhw, 0xe5),
    MMX_TRINARY (pmullw, 0xd5),
    MMX_TRINARY (psubb, 0xf8),
    MMX_TRINARY (psubw, 0xf9),
    MMX_TRINARY (psubd, 0xfa),
    MMX_TRINARY (psubsb, 0xe8),
    MMX_TRINARY (psubsw, 0xe9),
    MMX_TRINARY (psubusb, 0xd8),
    MMX_TRINARY (psubusw, 0xd9),
    MMX_TRINARY (punpckhbw, 0x68),
    MMX_TRINARY (punpckhwd, 0x69),
    MMX_TRINARY (punpckhdq, 0x6a),
    MMX_TRINARY (punpcklbw, 0x60),
    MMX_TRINARY (punpcklwd, 0x61),
    MMX_TRINARY (punpckldq, 0x62),

#define MMX_SHIFT(name, opc, reg)                                                               \
    { I_##name, { A_MMX, A_MMXM }, F_MMX, E_RM, (0x0f00 + opc) },                               \
    { I_##name, { A_MMX, A_I8 }, F_MMX, E_M, (0x0f00 + ((opc & 0xf) | 0x70)), reg },            \
    { I_##name, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 },                      \
    { I_##name, { A_SSE, A_I8 }, F_SSE2, E_M, (0x0f00 + ((opc & 0xf) | 0x70)), PRE_66 | reg },  \
    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) },         \
    { I_v##name, { A_SSE, A_SSE, A_I8 }, F_AVX, E_VM, (opc & 0xf) | 0x70,                       \
      VEX(128,0x66,0x0f,WIG) | reg },                                                           \
    { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f,WIG) },        \
    { I_v##name, { A_AVX, A_AVX, A_I8 }, F_AVX2, E_VM, (opc & 0xf) | 0x70,                      \
      VEX(256,0x66,0x0f,WIG) | reg }

    MMX_SHIFT (psllw, 0xf1, 6),
    MMX_SHIFT (pslld, 0xf2, 6),
    MMX_SHIFT (psllq, 0xf3, 6),
    MMX_SHIFT (psrlw, 0xd1, 2),
    MMX_SHIFT (psrld, 0xd2, 2),
    MMX_SHIFT (psrlq, 0xd3, 2),
    MMX_SHIFT (psraw, 0xe1, 4),
    MMX_SHIFT (psrad, 0xe2, 4),

    /* pmaddubsw is special because it wasn't extended to SSE registers
     * until SSSE3
     */
    { I_pmaddubsw, { A_MMX, A_MMXM }, F_MMX, E_RM, 0x0f3804 },
    { I_pmaddubsw, { A_SSE, A_SSEM }, F_SSSE3, E_RM, 0x0f3804, PRE_66 },
    { I_vpmaddubsw, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x04, VEX(128,0x66,0x0f38,WIG) },
    { I_vpmaddubsw, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, 0x04, VEX(256,0x66,0x0f38,WIG) },

    /* AMD's MMX extensions */
#define MMX_EX_TRINARY(name, opc)                                                       \
    { I_##name, { A_MMX, A_MMXM }, F_MMX_EX, E_RM, (0x0f00 + opc) },                    \
    { I_##name, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 },              \
    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
    { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }

    MMX_EX_TRINARY (pavgb, 0xe0),
    MMX_EX_TRINARY (pavgw, 0xe3),
    MMX_EX_TRINARY (pmaxsw, 0xee),
    MMX_EX_TRINARY (pmaxub, 0xde),
    MMX_EX_TRINARY (pminsw, 0xea),
    MMX_EX_TRINARY (pminub, 0xda),
    MMX_EX_TRINARY (pmulhuw, 0xe4),
    MMX_EX_TRINARY (psadbw, 0xf6),

    /* pinsrw, pextrw are special because they don't have AVX2 variants and
     * because SSE4.1 added new variants of pextrw. pmovmskb also doesn't
     * have an AVX2 version, and it is binary.
     */
    { I_pinsrw, { A_MMX, A_RM, A_I8 }, F_MMX_EX, E_RM, 0x0fc4 },
    { I_pinsrw, { A_SSE, A_RM, A_I8 }, F_SSE2, E_RM, 0x0fc4, PRE_66 },
    { I_vpinsrw, { A_SSE, A_SSE, A_RM, A_I8 }, F_AVX, E_RVM, 0xc4, VEX(128,0x66,0x0f,W0) },
    { I_pextrw, { A_R, A_MMX, A_I8 }, F_MMX_EX, E_RM, 0x0fc5 },
    { I_pextrw, { A_R, A_SSE, A_I8 }, F_SSE2, E_RM, 0x0fc5, PRE_66 },
    { I_pextrw, { A_RM, A_SSE, A_I8 }, F_SSE41, E_MR, 0x0f3a15, PRE_66 },
    { I_vpextrw, { A_R, A_SSE, A_I8 }, F_AVX, E_RM, 0xc5, VEX(128,0x66,0x0f,W0) },
    { I_vpextrw, { A_RM, A_SSE, A_I8 }, F_AVX, E_MR, 0x15, VEX(128,0x66,0x0f3a,W0) },
    { I_pmovmskb, { A_R, A_MMX }, F_MMX_EX, E_RM, 0x0fd7 },
    { I_pmovmskb, { A_R, A_SSE }, F_SSE2, E_RM, 0x0fd7, PRE_66 },
    { I_vpmovmskb, { A_R, A_SSE }, F_AVX, E_RM, 0xd7, VEX(128,0x66,0x0f,WIG) },

    /* SSE */
    { I_fxsave, { A_MEM }, F_FXSR, E_M, 0x0fae, 0 },
    { I_fxrstor, { A_MEM }, F_FXSR, E_M, 0x0fae, 1 },
    { I_ldmxcsr, { A_MEM }, F_SSE, E_M, 0x0fae, 2 },
    { I_vldmxcsr, { A_MEM }, F_AVX, E_M, 0xae, 2 | VEX(LZ,0x00,0x0f,WIG) },
    { I_stmxcsr, { A_MEM }, F_SSE, E_M, 0x0fae, 3 },
    { I_vstmxcsr, { A_MEM }, F_AVX, E_M, 0xae, 3 | VEX(LZ,0x00,0x0f,WIG) },

#define SSE_TRINARY(name, opc)                                                                  \
    { I_##name##ps, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc) },                           \
    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x00,0x0f,WIG) },     \
    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, opc, VEX(256,0x00,0x0f,WIG) },     \
    { I_##name##pd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 },                  \
    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) },     \
    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f,WIG) },     \
    { I_##name##ss, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 },                   \
    { I_v##name##ss, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(LIG,0xf3,0x0f,WIG) },     \
    { I_##name##sd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 },                  \
    { I_v##name##sd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(LIG,0xf2,0x0f,WIG) }

    SSE_TRINARY (add, 0x58),
    SSE_TRINARY (and, 0x54),
    SSE_TRINARY (andn, 0x55),
    SSE_TRINARY (div, 0x5e),
    SSE_TRINARY (max, 0x5f),
    SSE_TRINARY (min, 0x5d),
    SSE_TRINARY (mul, 0x59),
    SSE_TRINARY (or, 0x56),
    SSE_TRINARY (sub, 0x5c),
    SSE_TRINARY (xor, 0x57),

#define SSE_TRINARY_IMM(name, opc)                                                                      \
    { I_##name##ps, { A_SSE, A_SSEM, A_I8 }, F_SSE, E_RM, (0x0f00 + opc) },                             \
    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x00,0x0f,WIG) },       \
    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x00,0x0f,WIG) },       \
    { I_##name##pd, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 },                    \
    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) },       \
    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f,WIG) },       \
    { I_##name##ss, { A_SSE, A_SSEM, A_I8 }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 },                     \
    { I_v##name##ss, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(LIG,0xf3,0x0f,WIG) },       \
    { I_##name##sd, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 },                    \
    { I_v##name##sd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(LIG,0xf2,0x0f,WIG) }

    SSE_TRINARY_IMM (cmp, 0xc2),
    SSE_TRINARY_IMM (shuf, 0xc6),
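    /* SSE_TRINARY (add, 0x58) above expands into the whole addps /
     * addpd / addss / addsd family plus their VEX-encoded v-prefixed
     * forms, with the packed-double and scalar forms selecting the
     * mandatory 0x66, 0xf3 and 0xf2 prefixes respectively.
     */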
#define SSE_UNARY_SINGLE(name, opc)                                                     \
    { I_##name##ps, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc) },                   \
    { I_v##name##ps, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x00,0x0f,WIG) },     \
    { I_v##name##ps, { A_AVX, A_AVXM }, F_AVX, E_RM, opc, VEX(256,0x00,0x0f,WIG) },     \
    { I_##name##ss, { A_SSE, A_SSEM }, F_SSE, E_RM, (0x0f00 + opc), PRE_F3 },           \
    { I_v##name##ss, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(LIG,0xf3,0x0f,WIG) }

#define SSE_UNARY_DOUBLE(name, opc)                                                     \
    { I_##name##pd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 },          \
    { I_v##name##pd, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x66,0x0f,WIG) },     \
    { I_v##name##pd, { A_AVX, A_AVXM }, F_AVX, E_RM, opc, VEX(256,0x66,0x0f,WIG) },     \
    { I_##name##sd, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_F2 },          \
    { I_v##name##sd, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(LIG,0xf2,0x0f,WIG) }

    SSE_UNARY_SINGLE (sqrt, 0x51),
    SSE_UNARY_DOUBLE (sqrt, 0x51),
    SSE_UNARY_SINGLE (rsqrt, 0x52),
    SSE_UNARY_SINGLE (rcp, 0x53),

    /* SSE2 */
    { I_lfence, { 0 }, F_SSE2, E_NP, 0x0faee8 },
    { I_mfence, { 0 }, F_SSE2, E_NP, 0x0faef0 },
    { I_movq2dq, { A_SSE, A_MMX }, F_SSE2, E_RM, 0x0fd6, PRE_F3 | NO_REX_W },
    { I_movdqa, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f6f, PRE_66 },
    { I_movdqa, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0f7f, PRE_66 },
    { I_vmovdqa, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x6f, VEX(128,0x66,0x0f,WIG) },
    { I_vmovdqa, { A_SSEM, A_SSE }, F_AVX, E_MR, 0x7f, VEX(128,0x66,0x0f,WIG) },
    { I_vmovdqa, { A_AVX, A_AVXM }, F_AVX, E_RM, 0x6f, VEX(256,0x66,0x0f,WIG) },
    { I_vmovdqa, { A_AVXM, A_AVX }, F_AVX, E_MR, 0x7f, VEX(256,0x66,0x0f,WIG) },
    { I_movdqu, { A_SSE, A_SSEM }, F_SSE2, E_RM, 0x0f6f, PRE_F3 },
    { I_movdqu, { A_SSEM, A_SSE }, F_SSE2, E_MR, 0x0f7f, PRE_F3 },
    { I_vmovdqu, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x6f, VEX(128,0xf3,0x0f,WIG) },
    { I_vmovdqu, { A_SSEM, A_SSE }, F_AVX, E_MR, 0x7f, VEX(128,0xf3,0x0f,WIG) },
    { I_vmovdqu, { A_AVX, A_AVXM }, F_AVX, E_RM, 0x6f, VEX(256,0xf3,0x0f,WIG) },
    { I_vmovdqu, { A_AVXM, A_AVX }, F_AVX, E_MR, 0x7f, VEX(256,0xf3,0x0f,WIG) },

#define SSE2_TRINARY(name, opc)                                                         \
    { I_##name, { A_MMX, A_MMXM }, F_SSE2, E_RM, (0x0f00 + opc) },                      \
    { I_##name, { A_SSE, A_SSEM }, F_SSE2, E_RM, (0x0f00 + opc), PRE_66 },              \
    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f,WIG) }, \
    { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f,WIG) }

    SSE2_TRINARY (paddq, 0xd4),
    SSE2_TRINARY (psubq, 0xfb),
    SSE2_TRINARY (pmuludq, 0xf4),

    /* These don't have MMX variants */
    { I_pshufhw, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, 0x0f70, PRE_F3 },
    { I_vpshufhw, { A_SSE, A_SSEM, A_I8 }, F_AVX, E_RM, 0x70, VEX(128,0xf3,0x0f,WIG) },
    { I_vpshufhw, { A_AVX, A_AVXM, A_I8 }, F_AVX2, E_RM, 0x70, VEX(256,0xf3,0x0f,WIG) },
    { I_pshuflw, { A_SSE, A_SSEM, A_I8 }, F_SSE2, E_RM, 0x0f70, PRE_F2 },
    { I_vpshuflw, { A_SSE, A_SSEM, A_I8 }, F_AVX, E_RM, 0x70, VEX(128,0xf2,0x0f,WIG) },
    { I_vpshuflw, { A_AVX, A_AVXM, A_I8 }, F_AVX2, E_RM, 0x70, VEX(256,0xf2,0x0f,WIG) },

    /* SSE3 */
    { I_lddqu, { A_SSE, A_MEM }, F_SSE3, E_RM, 0x0ff0, PRE_F2 },
    { I_vlddqu, { A_SSE, A_MEM }, F_AVX, E_RM, 0xf0, VEX(128,0xf2,0x0f,WIG) },
    { I_vlddqu, { A_AVX, A_MEM }, F_AVX, E_RM, 0xf0, VEX(256,0xf2,0x0f,WIG) },
    { I_hsubps, { A_SSE, A_SSEM }, F_SSE3, E_RM, 0x0f7d, PRE_F2 },
    { I_vhsubps, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x7d, VEX(128,0xf2,0x0f,WIG) },
    { I_vhsubps, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, 0x7d, VEX(256,0xf2,0x0f,WIG) },
    { I_hsubpd, { A_SSE, A_SSEM }, F_SSE3, E_RM, 0x0f7d, PRE_66 },
    { I_vhsubpd, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x7d, VEX(128,0x66,0x0f,WIG) },
    { I_vhsubpd, { A_AVX, A_AVX, A_AVXM }, F_AVX, E_RVM, 0x7d, VEX(256,0x66,0x0f,WIG) },

    /* SSSE3 */
#define SSSE3_BINARY(name, opc)                                                         \
    { I_##name, { A_MMX, A_MMXM }, F_SSSE3, E_RM, (0x0f3800 + opc) },                   \
    { I_##name, { A_SSE, A_SSEM }, F_SSSE3, E_RM, (0x0f3800 + opc), PRE_66 },           \
    { I_v##name, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x66,0x0f38,WIG) },       \
    { I_v##name, { A_AVX, A_AVXM }, F_AVX2, E_RM, opc, VEX(256,0x66,0x0f38,WIG) }

    SSSE3_BINARY (pabsb, 0x1c),
    SSSE3_BINARY (pabsw, 0x1d),
    SSSE3_BINARY (pabsd, 0x1e),

#define SSSE3_TRINARY(name, opc)                                                                \
    { I_##name, { A_MMX, A_MMXM }, F_SSSE3, E_RM, (0x0f3800 + opc) },                           \
    { I_##name, { A_SSE, A_SSEM }, F_SSSE3, E_RM, (0x0f3800 + opc), PRE_66 },                   \
    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f38,WIG) },       \
    { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f38,WIG) }

    SSSE3_TRINARY (psignb, 0x08),
    SSSE3_TRINARY (psignw, 0x09),
    SSSE3_TRINARY (psignd, 0x0a),
    SSSE3_TRINARY (phaddw, 0x01),
    SSSE3_TRINARY (phaddd, 0x02),
    SSSE3_TRINARY (phaddsw, 0x03),
    SSSE3_TRINARY (phsubw, 0x05),
    SSSE3_TRINARY (phsubd, 0x06),
    SSSE3_TRINARY (phsubsw, 0x07),
    SSSE3_TRINARY (pmulhrsw, 0x0b),
    SSSE3_TRINARY (pshufb, 0x00),

    { I_palignr, { A_MMX, A_MMXM, A_I8 }, F_SSSE3, E_RM, 0x0f3a0f },
    { I_palignr, { A_SSE, A_SSEM, A_I8 }, F_SSSE3, E_RM, 0x0f3a0f, PRE_66 },
    { I_vpalignr, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0f, VEX(128,0x66,0x0f3a,WIG) },

    /* SSE 4.1 */
#define SSE41_BINARY(name, opc)                                                         \
    { I_##name, { A_SSE, A_SSEM }, F_SSE41, E_RM, (0x0f3800 + opc), PRE_66 },           \
    { I_v##name, { A_SSE, A_SSEM }, F_AVX, E_RM, opc, VEX(128,0x66,0x0f38,WIG) },       \
    { I_v##name, { A_AVX, A_AVXM }, F_AVX2, E_RM, opc, VEX(256,0x66,0x0f38,WIG) }

#define SSE41_PMOVX(type, opc)                  \
    SSE41_BINARY (pmovsx##type, opc),           \
    SSE41_BINARY (pmovzx##type, opc + 0x10)

    SSE41_PMOVX (bw, 0x20),
    SSE41_PMOVX (bd, 0x21),
    SSE41_PMOVX (bq, 0x22),
    SSE41_PMOVX (wd, 0x23),
    SSE41_PMOVX (wq, 0x24),
    SSE41_PMOVX (dq, 0x25),

    SSE41_BINARY (ptest, 0x17),

    /* No AVX2 variant for phminposuw */
    { I_phminposuw, { A_SSE, A_SSEM }, F_SSE41, E_RM, 0x0f3841, PRE_66 },
    { I_vphminposuw, { A_SSE, A_SSEM }, F_AVX, E_RM, 0x41, VEX(128,0x66,0x0f38,WIG) },

    /* See under MMX for the -w versions */
    { I_pextrb, { A_RM, A_SSE, A_I8 }, F_SSE41, E_MR, 0x0f3a14, PRE_66 },
    { I_pextrd, { A_RM, A_SSE, A_I8 }, F_SSE41, E_MR, 0x0f3a16, PRE_66 },
    { I_pextrq, { A_RM, A_SSE, A_I8 }, F_SSE41, E_MR, 0x0f3a16, PRE_66 },
    { I_vpextrb, { A_RM, A_SSE, A_I8 }, F_AVX, E_MR, 0x14, VEX(128,0x66,0x0f3a,W0) },
    { I_vpextrd, { A_RM, A_SSE, A_I8 }, F_AVX, E_MR, 0x16, VEX(128,0x66,0x0f3a,W0) },
    { I_vpextrq, { A_RM, A_SSE, A_I8 }, F_AVX, E_MR, 0x16, VEX(128,0x66,0x0f3a,W1) },
    { I_pinsrb, { A_SSE, A_RM, A_I8 }, F_SSE41, E_RM, 0x0f3a20, PRE_66 },
    { I_pinsrd, { A_SSE, A_RM, A_I8 }, F_SSE41, E_RM, 0x0f3a22, PRE_66 },
    { I_pinsrq, { A_SSE, A_RM, A_I8 }, F_SSE41, E_RM, 0x0f3a22, PRE_66 },
    { I_vpinsrb, { A_SSE, A_SSE, A_RM, A_I8 }, F_AVX, E_RVM, 0x20, VEX(128,0x66,0x0f3a,W0) },
    { I_vpinsrd, { A_SSE, A_SSE, A_RM, A_I8 }, F_AVX, E_RVM, 0x22, VEX(128,0x66,0x0f3a,W0) },
    { I_vpinsrq, { A_SSE, A_SSE, A_RM, A_I8 }, F_AVX, E_RVM, 0x22, VEX(128,0x66,0x0f3a,W1) },

#define SSE41_TRINARY(name, opc)                                                                \
    { I_##name, { A_SSE, A_SSEM }, F_SSE41, E_RM, (0x0f3800 + opc), PRE_66 },                   \
    { I_v##name, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f38,WIG) },       \
    { I_v##name, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, opc, VEX(256,0x66,0x0f38,WIG) }

    SSE41_TRINARY (packusdw, 0x2b),
    SSE41_TRINARY (pcmpeqq, 0x29),
    SSE41_TRINARY (pmaxsb, 0x3c),
    SSE41_TRINARY (pmaxsd, 0x3d),
    SSE41_TRINARY (pmaxud, 0x3f),
    SSE41_TRINARY (pmaxuw, 0x3e),
    SSE41_TRINARY (pminsb, 0x38),
    SSE41_TRINARY (pminsd, 0x39),
    SSE41_TRINARY (pminud, 0x3b),
    SSE41_TRINARY (pminuw, 0x3a),
    SSE41_TRINARY (pmuldq, 0x28),
    SSE41_TRINARY (pmulld, 0x40),

#define SSE41_TRINARY_IMM(name, opc)                                                                            \
    { I_##name##ps, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, (0x0f3a00 + opc), PRE_66 },                         \
    { I_v##name##ps, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc, VEX(128,0x66,0x0f3a,WIG) },             \
    { I_v##name##ps, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc, VEX(256,0x66,0x0f3a,WIG) },             \
    { I_##name##pd, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, (0x0f3a00 + opc + 1), PRE_66 },                     \
    { I_v##name##pd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, opc + 1, VEX(128,0x66,0x0f3a,WIG) },         \
    { I_v##name##pd, { A_AVX, A_AVX, A_AVXM, A_I8 }, F_AVX, E_RVM, opc + 1, VEX(256,0x66,0x0f3a,WIG) }

    SSE41_TRINARY_IMM (blend, 0x0c),
    SSE41_TRINARY_IMM (round, 0x08),

    { I_roundss, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, 0x0f3a0a, PRE_66 },
    { I_vroundss, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0a, VEX(LIG,0x66,0x0f3a,WIG) },
    { I_roundsd, { A_SSE, A_SSEM, A_I8 }, F_SSE41, E_RM, 0x0f3a0b, PRE_66 },
    { I_vroundsd, { A_SSE, A_SSE, A_SSEM, A_I8 }, F_AVX, E_RVM, 0x0b, VEX(LIG,0x66,0x0f3a,WIG) },

    /* SSE 4.2 */
    { I_pcmpgtq, { A_SSE, A_SSEM }, F_SSE42, E_RM, 0x0f3837, PRE_66 },
    { I_vpcmpgtq, { A_SSE, A_SSE, A_SSEM }, F_AVX, E_RVM, 0x37, VEX(128,0x66,0x0f38,WIG) },
    { I_vpcmpgtq, { A_AVX, A_AVX, A_AVXM }, F_AVX2, E_RVM, 0x37, VEX(256,0x66,0x0f38,WIG) },
    { I_crc32, { A_R32, A_RM8 }, F_SSE42, E_RM, 0x0f38f0, PRE_F2 },
    { I_crc32, { A_R64, A_RM8 }, F_SSE42, E_RM, 0x0f38f0, PRE_F2 },
    { I_crc32, { A_R32, A_RM16 }, F_SSE42, E_RM, 0x0f38f1, PRE_F2 | PRE_66 },
    { I_crc32, { A_R32, A_RM32 }, F_SSE42, E_RM, 0x0f38f1, PRE_F2 },
    { I_crc32, { A_R64, A_RM64 }, F_SSE42, E_RM, 0x0f38f1, PRE_F2 },

    /* AVX */
    { I_vmaskmovps, { A_SSE, A_SSE, A_MEM }, F_AVX, E_RVM, 0x2c, VEX(128,0x66,0x0f38,W0) },
    { I_vmaskmovps, { A_AVX, A_AVX, A_MEM }, F_AVX, E_RVM, 0x2c, VEX(256,0x66,0x0f38,W0) },
    { I_vmaskmovps, { A_MEM, A_SSE, A_SSE }, F_AVX, E_MVR, 0x2e, VEX(128,0x66,0x0f38,W0) },
    { I_vmaskmovps, { A_MEM, A_AVX, A_AVX }, F_AVX, E_MVR, 0x2e, VEX(256,0x66,0x0f38,W0) },
    { I_vmaskmovpd, { A_SSE, A_SSE, A_MEM }, F_AVX, E_RVM, 0x2d, VEX(128,0x66,0x0f38,W0) },
    { I_vmaskmovpd, { A_AVX, A_AVX, A_MEM }, F_AVX, E_RVM, 0x2d, VEX(256,0x66,0x0f38,W0) },
    { I_vmaskmovpd, { A_MEM, A_SSE, A_SSE }, F_AVX, E_MVR, 0x2f, VEX(128,0x66,0x0f38,W0) },
    { I_vmaskmovpd, { A_MEM, A_AVX, A_AVX }, F_AVX, E_MVR, 0x2f, VEX(256,0x66,0x0f38,W0) },
    { I_vzeroupper, { 0 }, F_AVX, E_NP, 0x77, VEX(128,0x00,0x0f,WIG) },
    { I_vzeroall, { 0 }, F_AVX, E_NP, 0x77, VEX(256,0x00,0x0f,WIG) },

    /* AVX2 */
    { I_vgatherdd, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x90, VEX(128,0x66,0x0f38,W0) },
    { I_vgatherdd, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x90, VEX(256,0x66,0x0f38,W0) },
    { I_vgatherqd, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x91, VEX(128,0x66,0x0f38,W0) },
    { I_vgatherqd, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x91, VEX(256,0x66,0x0f38,W0) },
    { I_vgatherdq, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x90, VEX(128,0x66,0x0f38,W1) },
    { I_vgatherdq, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x90, VEX(256,0x66,0x0f38,W1) },
    { I_vgatherqq, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x91, VEX(128,0x66,0x0f38,W1) },
    { I_vgatherqq, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x91, VEX(256,0x66,0x0f38,W1) },
    { I_vgatherdps, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x92, VEX(128,0x66,0x0f38,W0) },
    { I_vgatherdps, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x92, VEX(256,0x66,0x0f38,W0) },
    { I_vgatherqps, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x93, VEX(128,0x66,0x0f38,W0) },
    { I_vgatherqps, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x93, VEX(256,0x66,0x0f38,W0) },
    { I_vgatherdpd, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x92, VEX(128,0x66,0x0f38,W1) },
    { I_vgatherdpd, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x92, VEX(256,0x66,0x0f38,W1) },
    { I_vgatherqpd, { A_SSE, A_MEM, A_SSE }, F_AVX2, E_RMV, 0x93, VEX(128,0x66,0x0f38,W1) },
    { I_vgatherqpd, { A_AVX, A_MEM, A_AVX }, F_AVX2, E_RMV, 0x93, VEX(256,0x66,0x0f38,W1) },

    /* BMI1 */
    { I_andn, { A_R32, A_R32, A_RM }, F_BMI1, E_RVM, 0xf2, VEX(LZ,0x00,0x0f38,W0) },
    { I_andn, { A_R64, A_R64, A_RM }, F_BMI1, E_RVM, 0xf2, VEX(LZ,0x00,0x0f38,W1) },
    { I_bextr, { A_R32, A_R32, A_RM }, F_BMI1, E_RVM, 0xf7, VEX(LZ,0x00,0x0f38,W0) },
    { I_bextr, { A_R64, A_R64, A_RM }, F_BMI1, E_RVM, 0xf7, VEX(LZ,0x00,0x0f38,W1) },

    /* TBM variant of bextr */
    { I_bextr, { A_R32, A_R32, A_I32 }, F_TBM, E_RM, 0x10, XOP(LZ,0x00,0x0a,W0) },
    { I_bextr, { A_R64, A_R64, A_I32 }, F_TBM, E_RM, 0x10, XOP(LZ,0x00,0x0a,W1) },

    { I_blsr, { A_R32, A_RM }, F_BMI1, E_VM, 0xf3, VEX(LZ,0x00,0x0f38,W0) | 1 },
    { I_blsr, { A_R64, A_RM }, F_BMI1, E_VM, 0xf3, VEX(LZ,0x00,0x0f38,W1) | 1 },
    { I_blsmsk, { A_R32, A_RM }, F_BMI1, E_VM, 0xf3, VEX(LZ,0x00,0x0f38,W0) | 2 },
    { I_blsmsk, { A_R64, A_RM }, F_BMI1, E_VM, 0xf3, VEX(LZ,0x00,0x0f38,W1) | 2 },
    { I_blsi, { A_R32, A_RM }, F_BMI1, E_VM, 0xf3, VEX(LZ,0x00,0x0f38,W0) | 3 },
    { I_blsi, { A_R64, A_RM }, F_BMI1, E_VM, 0xf3, VEX(LZ,0x00,0x0f38,W1) | 3 },
    { I_tzcnt, { A_R, A_RM }, F_BMI1, E_RM, 0xf30fbc },

    /* BMI2 */
#define BMI2_TRINARY(name, opc, prefix)                                                 \
    { I_##name, { A_R32, A_RM, A_R32 }, F_BMI2, E_RMV, opc, VEX(LZ,prefix,0x0f38,W0) }, \
    { I_##name, { A_R64, A_RM, A_R64 }, F_BMI2, E_RMV, opc, VEX(LZ,prefix,0x0f38,W1) }

    BMI2_TRINARY (bzhi, 0xf5, 0x00),
    BMI2_TRINARY (pdep, 0xf5, 0xf2),
    BMI2_TRINARY (pext, 0xf5, 0xf3),
    BMI2_TRINARY (shlx, 0xf7, 0x66),
    BMI2_TRINARY (shrx, 0xf7, 0xf2),
    BMI2_TRINARY (sarx, 0xf7, 0xf3),

    { I_mulx, { A_R32, A_R32, A_RM }, F_BMI2, E_RVM, 0xf6, VEX(LZ,0xf2,0x0f38,W0) },
    { I_mulx, { A_R64, A_R64, A_RM }, F_BMI2, E_RVM, 0xf6, VEX(LZ,0xf2,0x0f38,W1) },
    { I_rorx, { A_R32, A_RM, A_I8 }, F_BMI2, E_RM, 0xf0, VEX(LZ,0xf2,0x0f3a,W0) },
    { I_rorx, { A_R64, A_RM, A_I8 }, F_BMI2, E_RM, 0xf0, VEX(LZ,0xf2,0x0f3a,W1) },
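    /* Note the different operand routing in the BMI2 entries: shlx,
     * shrx and sarx are E_RMV, so the shift count (the third operand)
     * travels in VEX.vvvv while the value to shift is the modrm
     * operand, whereas mulx is E_RVM with the second source in modrm.
     */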
    /* F16C */
    { I_vcvtph2ps, { A_SSE, A_SSEM }, F_F16C, E_RM, 0x13, VEX(128,0x66,0x0f38,W0) },
    { I_vcvtph2ps, { A_AVX, A_SSEM }, F_F16C, E_RM, 0x13, VEX(256,0x66,0x0f38,W0) },
    { I_vcvtps2ph, { A_SSEM, A_AVX, A_I8 }, F_F16C, E_MR, 0x1d, VEX(256,0x66,0x0f3a,W0) },
    { I_vcvtps2ph, { A_SSEM, A_SSE, A_I8 }, F_F16C, E_MR, 0x1d, VEX(128,0x66,0x0f3a,W0) },

    /* XOP */
    { I_vpcmov, { A_SSE, A_SSE, A_SSEM, A_SSE }, F_XOP, E_RVMI, 0xa2, XOP(128,0x00,0x8,W0) },
    { I_vpcmov, { A_AVX, A_AVX, A_AVXM, A_AVX }, F_XOP, E_RVMI, 0xa2, XOP(256,0x00,0x8,W0) },
    { I_vpcmov, { A_SSE, A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVIM, 0xa2, XOP(128,0x00,0x8,W1) },
    { I_vpcmov, { A_AVX, A_AVX, A_AVX, A_AVXM }, F_XOP, E_RVIM, 0xa2, XOP(256,0x00,0x8,W1) },

#define XOP_COMPARE(type, opc)                                                                                  \
    { I_vpcomlt##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 0 },               \
    { I_vpcomle##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 1 },               \
    { I_vpcomgt##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 2 },               \
    { I_vpcomge##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 3 },               \
    { I_vpcome##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 4 },                \
    { I_vpcomne##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 5 },               \
    { I_vpcomfalse##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 6 },            \
    { I_vpcomtrue##type, { A_SSE, A_SSE, A_SSEM }, F_XOP, E_RVMX, opc, XOP(128,0x00,0x08,W0) | 7 }

    XOP_COMPARE (b, 0xcc),
    XOP_COMPARE (w, 0xcd),
    XOP_COMPARE (d, 0xce),
    XOP_COMPARE (q, 0xcf),
    XOP_COMPARE (ub, 0xec),
    XOP_COMPARE (uw, 0xed),
    XOP_COMPARE (ud, 0xee),
    XOP_COMPARE (uq, 0xef),

    { I_none },
};

#define N_VARIANTS (sizeof (variants) / sizeof (variants[0]))

static const char inames[][16] =
{
    "invalid",
#define PROCESS_INSTRUCTION(name, rwinfo, n_ops)        \
    #name,
    ALL_INSTRUCTIONS
};

#define GET_NAME(inst)                          \
    inames[GET_SERIAL(inst)]

static void
sanity_check (void)
{
    int i, j;
    uint32_t a_mask;

    /* Verify that every instruction has at least one variant */
    for (i = 0; i < I_n_instructions; ++i)
    {
        for (j = 0; j < N_VARIANTS; ++j)
        {
            if (GET_SERIAL (variants[j].inst) == i)
                goto next_instruction;
        }

        printf ("No variants found for %s (serial %d)\n", inames[i], i);
        abort ();

    next_instruction:
        ;
    }

    a_mask = 0;
    for (i = 0; i < OP_N_OP_TYPES; ++i)
        a_mask |= (1 << i);

    for (i = 0; i < N_VARIANTS; ++i)
    {
        const variant_t *variant = &(variants[i]);
        int n_ops = GET_N_OPS (variant->inst);
        enum { BEFORE, DURING, AFTER } state;
        int j, n_mem_ops;

        printf ("\r%s ", GET_NAME (variant->inst));

        n_mem_ops = 0;
        for (j = 0; j < 4; ++j)
        {
            if (j < n_ops)
                assert (variant->ops[j] != 0);
            else
                assert (variant->ops[j] == 0);

            assert ((variant->ops[j] & ~a_mask) == 0);

            if ((variant->ops[j] & A_MEM))
                n_mem_ops++;
        }

        assert (n_mem_ops <= 1);

        assert ((variant->feature & ~F_MASK) == 0);
        assert (variant->encoding < E_N_ENCODINGS);

        if (variant->encoding != E_M &&
            variant->encoding != E_VM &&
            variant->encoding != E_RVMX)
        {
            assert (GET_OP_EXTENSION (variant->info) == 0);
        }

        /* Verify that all variants of the same instruction
         * are grouped together
         */
        state = BEFORE;
        for (j = 0; j < N_VARIANTS; ++j)
        {
            const variant_t *other = &variants[j];

            switch (state)
            {
            case BEFORE:
                if (other->inst == variant->inst)
                    state = DURING;
                break;

            case DURING:
                if (other->inst != variant->inst)
                    state = AFTER;
                break;

            case AFTER:
                assert (other->inst != variant->inst);
                break;
            }
        }
    }

    printf ("\rSanity check passed\n");
}

typedef struct annotation_t annotation_t;
struct annotation_t
{
    annotation_type_t  type;
    ssize_t            offset;
    size_t             size;
    char               name[32];
    annotation_t *     label;
    const uint8_t *    fragment;
    uint32_t           opcode;
    uint32_t           align_mask;
    uint32_t           align_bits;
};

struct fragment_t
{
    assembler_t *      as;
    bool_t             error;
    array_t *          annotations;
    array_t *          code;
};

struct assembler_t
{
    code_manager_t *   code_manager;
    feature_t          features;
    bool_t             verbose;

    /* Size of the last piece of code that was
     * linked. Only used for test purposes.
     */
    size_t             last_size;

    /* Table that maps the serial number of an instruction
     * to its first variant
     */
    const variant_t *  first_variant[I_n_instructions];
};
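/* detect_features() below also serves as a worked example of the
 * instruction-stream format that fragment_assemble() consumes: a
 * zero-terminated array of uint64_t words in which each I_xxx word
 * is followed by its operand words, and operands such as IMM() and
 * LABEL() pack their payload above the 6 bit type field (see
 * GET_TYPE / GET_IMM / GET_LABEL further down).
 */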
static feature_t
detect_features (assembler_t *as)
{
    fragment_t *fragment = fragment_new (as);
    typedef uint32_t (* func_t) (void);
    uint8_t fxsave_area[512 + 16] = { 0 };
    feature_t features;
    uint8_t *c;

#define FXSAVE_AREA ((((uintptr_t)&fxsave_area) + 15) & ~15)

    /* The code below will run on both 32 and 64 bit because
     * there is no use of 64 bit specific features, and the
     * machine code for things like
     *
     *     I_push ebx
     *
     * will be interpreted as I_push rbx in 64 bit mode.
     */
    BEGIN_ASM (fragment)
        I_push, ebx,

        /* We will accumulate features in esi, so initialize
         * it to F_386, which is always present.
         */
        I_mov, esi, IMM (F_386),

        /* Check that cpuid is supported */
        I_pushf,
        I_pop, eax,
        I_mov, ecx, eax,
        I_xor, eax, IMM (1 << 21),
        I_push, eax,
        I_popf,
        I_pushf,
        I_pop, eax,
        I_xor, eax, ecx,
        I_jz, LABEL ("done"),           /* No cpuid */

        I_or, esi, IMM (F_CPUID),

        /* Check cpuid.(eax=0x01) */
        I_mov, eax, IMM (1),
        I_cpuid,

        /* Multi-byte nop is supported whenever the family
         * is 0b0110 or 0b1111. This is true for both AMD
         * and Intel.
         */
        I_or, esi, IMM (F_MULTINOP),
        I_and, eax, IMM (0xf << 8),
        I_cmp, eax, IMM (0xf << 8),
        I_je, LABEL ("done_multinop"),
        I_cmp, eax, IMM (0x6 << 8),
        I_je, LABEL ("done_multinop"),
        I_and, esi, IMM (~F_MULTINOP),
    DEFINE_LABEL ("done_multinop"),

#define TEST_FEATURE(reg, bit, name)            \
        I_test, reg, IMM (1 << (bit)),          \
        I_jz, LABEL ("no_" # name),             \
        I_or, esi, IMM (name),                  \
        DEFINE_LABEL ("no_" # name)

        TEST_FEATURE (edx, 4, F_RDTSC),
        TEST_FEATURE (edx, 15, F_CMOV),
        TEST_FEATURE (edx, 19, F_CLFLUSH),
        TEST_FEATURE (edx, 23, F_MMX),
        TEST_FEATURE (edx, 24, F_FXSR),
        TEST_FEATURE (edx, 25, F_SSE | F_MMX_EX),       /* SSE implies MMX_EX */
        TEST_FEATURE (edx, 26, F_SSE2),
        TEST_FEATURE (ecx, 0, F_SSE3),
        TEST_FEATURE (ecx, 9, F_SSSE3),
        TEST_FEATURE (ecx, 12, F_FMA),
        TEST_FEATURE (ecx, 19, F_SSE41),
        TEST_FEATURE (ecx, 20, F_SSE42),
        TEST_FEATURE (ecx, 22, F_MOVBE),
        TEST_FEATURE (ecx, 23, F_POPCNT),
        TEST_FEATURE (ecx, 27, F_OSXSAVE),
        TEST_FEATURE (ecx, 28, F_AVX),
        TEST_FEATURE (ecx, 29, F_F16C),

        /* Check CPUID.(eax=0x80000001) */
        I_mov, eax, IMM (0x80000000),
        I_cpuid,
        I_cmp, eax, IMM (0x80000001),
        I_jb, LABEL ("no_extended_info"),

        I_mov, eax, IMM (0x80000001),
        I_cpuid,

        TEST_FEATURE (edx, 22, F_MMX_EX),
        TEST_FEATURE (edx, 27, F_RDTSCP),
        TEST_FEATURE (ecx, 5, F_LZCNT),
        TEST_FEATURE (ecx, 11, F_XOP),
        TEST_FEATURE (ecx, 16, F_FMA4),
        TEST_FEATURE (ecx, 21, F_TBM),

    DEFINE_LABEL ("no_extended_info"),

        /* Check CPUID.(eax=0x07, ecx=0x00) */
        I_mov, eax, IMM (7),
        I_xor, ecx, ecx,
        I_cpuid,

        TEST_FEATURE (ebx, 5, F_AVX2),
        TEST_FEATURE (ebx, 3, F_BMI1),
        TEST_FEATURE (ebx, 8, F_BMI2),

        /* Check whether DAZ is supported */
        I_test, esi, IMM (F_FXSR),
        I_jz, LABEL ("no_DAZ"),

#if defined(__amd64__) || defined(__x86_64__) || defined(_M_AMD64)
        I_movabs, rax, IMM64 (FXSAVE_AREA),
        I_fxsave, PTR (rax),
        I_test, BYTE_PTR + BASE(rax, 28), IMM(1 << 6),
#else
        I_mov, eax, IMM (FXSAVE_AREA),
        I_fxsave, PTR (eax),
        I_test, BYTE_PTR + BASE(eax, 28), IMM(1 << 6),
#endif
        I_jz, LABEL ("no_DAZ"),
        I_or, esi, IMM (F_DAZ),
    DEFINE_LABEL ("no_DAZ"),

        /* AVX is required for AVX2 and F16C */
        I_test, esi, IMM (F_AVX),
        I_jz, LABEL ("no_ymm"),

        /* Check OS support for saving YMM and XMM state */
        I_test, esi, IMM (F_OSXSAVE),
        I_jz, LABEL ("no_ymm"),

        I_xor, ecx, ecx,
        I_xgetbv,
        I_and, eax, IMM (0x06),
        I_cmp, eax, IMM (0x06),
        I_je, LABEL ("done"),

    DEFINE_LABEL ("no_ymm"),
        I_and, esi, IMM (~(F_AVX | F_F16C | F_AVX2)),

        /* Return */
    DEFINE_LABEL ("done"),
        I_mov, eax, esi,
        I_pop, ebx,
        I_ret,

        DEFINE_VALUE64 ("printf", (uintptr_t)printf),
    END_ASM ();

    c = assembler_link (as, fragment, NULL);

    features = ((func_t)c)();

    assembler_free (as, c);

    return features;
}
assembler_t *
assembler_new (const char *prefix)
{
    assembler_t *assembler;
    int i;

    if (getenv ("PIXMAN_JIT_SANITY_CHECK"))
        sanity_check ();

    if (!(assembler = malloc (sizeof *assembler)))
        goto out_assembler;

    if (!(assembler->code_manager = code_manager_new (prefix)))
        goto out_code_manager;

    memset (assembler->first_variant, 0, sizeof assembler->first_variant);
    for (i = 0; i < N_VARIANTS; ++i)
    {
        const variant_t *variant = &(variants[i]);
        int serial = GET_SERIAL (variant->inst);

        if (!assembler->first_variant[serial])
            assembler->first_variant[serial] = variant;
    }

    assembler->verbose = FALSE;

    /* Make sure that detect_features can generate all instructions,
     * even those that are not supported with the current CPU.
     */
    assembler->features = F_MASK;
    assembler->features = detect_features (assembler);

    return assembler;

out_code_manager:
    free (assembler);
out_assembler:
    return NULL;
}

void
assembler_set_verbose (assembler_t *as, bool_t verbose)
{
    as->verbose = verbose;
}

static uint8_t *
emit_imm32 (uint8_t *code, int32_t imm)
{
    *code++ = (imm & 0x000000ff) >> 0;
    *code++ = (imm & 0x0000ff00) >> 8;
    *code++ = (imm & 0x00ff0000) >> 16;
    *code++ = (imm & 0xff000000) >> 24;

    return code;
}

static uint8_t *
emit_imm16 (uint8_t *code, int32_t imm)
{
    *code++ = (imm & 0x00ff) >> 0;
    *code++ = (imm & 0xff00) >> 8;

    return code;
}

static uint8_t *
emit_imm8 (uint8_t *code, int32_t imm)
{
    *code++ = imm & 0xff;

    return code;
}

static uint32_t
hash (const char *str)
{
    const signed char *p = (const signed char *)str;
    uint32_t h = 5381;
    signed char c;

    while ((c = *p++))
        h = h * 33 + c;

    return h;
}

static bool_t
gather_labels (array_t *a)
{
    size_t n_annotations;
    annotation_t *annotations;
    annotation_t **hash_table;
    uint32_t mask;
    int i;

    annotations = array_get_data (&a, &n_annotations);

    mask = 2 * n_annotations;
    mask |= mask >> 1;
    mask |= mask >> 2;
    mask |= mask >> 4;
    mask |= mask >> 8;
    mask |= mask >> 16;

    if (!(hash_table = calloc (mask + 1, sizeof (annotation_t *))))
        return FALSE;

    for (i = 0; i < n_annotations; ++i)
    {
        annotation_t *label = &(annotations[i]);

        if (label->type == LABEL)
        {
            uint32_t idx = hash (label->name);
            annotation_t *existing;

            while ((existing = hash_table[idx & mask]))
            {
                if (strcmp (existing->name, label->name) == 0)
                {
                    printf ("Duplicate label %s\n", label->name);
                    abort ();
                }

                idx++;
            }

            hash_table[idx & mask] = label;
        }
    }

    for (i = 0; i < n_annotations; ++i)
    {
        annotation_t *ref = &(annotations[i]);
        annotation_t *label;
        uint32_t idx;

        if (ref->type == JCC || ref->type == RIP_REF ||
            ref->type == JUMP || ref->type == CALL)
        {
            idx = hash (ref->name);

            while ((label = hash_table[idx++ & mask]))
            {
                if (strcmp (ref->name, label->name) == 0)
                    goto found;
            }

            printf ("Label \"%s\" does not exist\n", ref->name);
            abort ();

        found:
            ref->label = label;
        }
    }

    free (hash_table);

    return TRUE;
}
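/* hash() above is the classic djb2 string hash (h = h * 33 + c,
 * seeded with 5381), and gather_labels() resolves references with a
 * simple open-addressing table: the mask computation rounds twice
 * the number of annotations up to the next power of two minus one,
 * and collisions probe linearly to the next slot.
 */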
#define IS_S8(x) (((x) >= -128) && ((x) < 128))

/* This function keeps moving annotations forward until nothing changes */
static void
resolve_refs (array_t *annotations, size_t *bits, size_t *mask)
{
    annotation_t *ann;
    ssize_t displacement;
    size_t n_annotations;
    size_t initial_bits;
    size_t initial_mask;
    int i;

    initial_bits = 0;
    initial_mask = 0;

    ann = array_get_data (&annotations, &n_annotations);

    do
    {
        bool_t first_align = TRUE;

        displacement = 0;

        for (i = 0; i < n_annotations; ++i)
        {
            annotation_t *a = &ann[i];

            a->offset += displacement;
            displacement -= a->size;

            switch (a->type)
            {
            case ALIGN:
                /* The first align is absorbed into the final position
                 * of the generated code.
                 */
                if (first_align)
                {
                    initial_mask = a->align_mask;
                    initial_bits = (a->align_bits - a->offset) & a->align_mask;
                    a->size = 0;
                    first_align = FALSE;
                }
                else
                {
                    a->size = (a->align_bits - (initial_bits + a->offset)) & a->align_mask;
                    initial_mask |= a->align_mask;
                }
                break;

            case JCC:
                if (IS_S8 (a->label->offset - (a->offset + 2)))
                    a->size = 2;
                else
                    a->size = 6;
                break;

            case JUMP:
                if (IS_S8 (a->label->offset - (a->offset + 2)))
                    a->size = 2;
                else
                    a->size = 5;
                break;

            case CALL:
            case RIP_REF:
                a->size = 4;
                break;

            case FRAGMENT:
            case LABEL:
            case NONE:
                break;
            }

            displacement += a->size;
        }
    } while (displacement);

    *bits = initial_bits;
    *mask = initial_mask;
}

static uint8_t *
emit_nop (uint8_t *c, int n_bytes, bool_t multi_byte_nop)
{
    if (n_bytes >= 16)
    {
        /* For nops of 16 bytes or more, use a jump */
        n_bytes -= 5;

        *c++ = 0xe9;
        c = emit_imm32 (c, n_bytes);
    }

    if (multi_byte_nop)
    {
        while (n_bytes)
        {
            static const char multi_nop[][9] =
            {
                { 0 },
                { 0x90 },
                { 0x66, 0x90 },
                { 0x0F, 0x1F, 0x00 },
                { 0x0F, 0x1F, 0x40, 0x00 },
                { 0x0F, 0x1F, 0x44, 0x00, 0x00 },
                { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 },
                { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 },
                { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 },
                { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 },
            };
            int n;

            n = n_bytes > 9? 9 : n_bytes;

            memcpy (c, multi_nop[n], n);
            c += n;
            n_bytes -= n;
        }
    }
    else
    {
        memset (c, 0x90, n_bytes);
        c += n_bytes;
    }

    return c;
}

static void
copy_code (assembler_t *as, uint8_t *dest, array_t *annotations)
{
    annotation_t *ann;
    size_t n_annotations;
    const uint8_t *src = NULL;
    int i;

    ann = array_get_data (&annotations, &n_annotations);

    for (i = 0; ann[i].type != NONE; ++i)
    {
        annotation_t *a = &ann[i];
        size_t size;

        switch (a->type)
        {
        case NONE:
        case LABEL:
            break;

        case FRAGMENT:
            src = a->fragment;
            break;

        case JCC:
            if (a->size == 2)
            {
                *dest++ = a->opcode;
                *dest++ = a->label->offset - (a->offset + 2);
            }
            else
            {
                *dest++ = 0x0f;
                *dest++ = a->opcode + 0x10;
                dest = emit_imm32 (dest, a->label->offset - (a->offset + 6));
            }
            break;

        case JUMP:
            if (a->size == 2)
            {
                *dest++ = 0xeb;
                *dest++ = a->label->offset - (a->offset + 2);
            }
            else
            {
                *dest++ = 0xe9;
                dest = emit_imm32 (dest, a->label->offset - (a->offset + 5));
            }
            break;

        case CALL:
        case RIP_REF:
            dest = emit_imm32 (dest, a->label->offset - (a->offset + 4));
            break;

        case ALIGN:
            dest = emit_nop (dest, a->size, (as->features & F_MULTINOP));
            break;
        }

        size = ann[i + 1].offset - a->offset - a->size;

        memcpy (dest, src, size);
        dest += size;
        src += size;
    }
}

static annotation_t *
add_annotation (array_t **array, const annotation_t *ann)
{
    annotation_t *a;

    if (!array_append (array, 1, &a))
        return NULL;

    *a = *ann;

    return a;
}

static void
print_features (assembler_t *as)
{
    printf ("Machine features:\n");

#define PRINT_FEATURE(feature)                                  \
    printf ("  %14s = %d\n", #feature, !!(as->features & feature))

    PRINT_FEATURE (F_386);
    PRINT_FEATURE (F_MMX);
    PRINT_FEATURE (F_MMX_EX);
    PRINT_FEATURE (F_SSE);
    PRINT_FEATURE (F_SSE2);
    PRINT_FEATURE (F_SSE3);
    PRINT_FEATURE (F_SSSE3);
    PRINT_FEATURE (F_SSE41);
    PRINT_FEATURE (F_SSE42);
    PRINT_FEATURE (F_AVX);
    PRINT_FEATURE (F_AVX2);
    PRINT_FEATURE (F_OSXSAVE);
    PRINT_FEATURE (F_F16C);
    PRINT_FEATURE (F_MULTINOP);
    PRINT_FEATURE (F_RDTSC);
    PRINT_FEATURE (F_RDTSCP);
    PRINT_FEATURE (F_CLFLUSH);
    PRINT_FEATURE (F_CMOV);
    PRINT_FEATURE (F_CPUID);
    PRINT_FEATURE (F_MOVBE);
    PRINT_FEATURE (F_BMI1);
    PRINT_FEATURE (F_BMI2);
    PRINT_FEATURE (F_LZCNT);
    PRINT_FEATURE (F_POPCNT);
    PRINT_FEATURE (F_FMA);
    PRINT_FEATURE (F_FMA4);
    PRINT_FEATURE (F_XOP);
    PRINT_FEATURE (F_TBM);
    PRINT_FEATURE (F_FXSR);
    PRINT_FEATURE (F_DAZ);
}
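/* A minimal sketch of the intended call sequence, following the same
 * pattern detect_features() uses above ("program" here is a
 * hypothetical zero-terminated instruction stream):
 *
 *     fragment_t *frag = fragment_new (as);
 *     fragment_assemble (frag, program);
 *     code = assembler_link (as, frag, NULL);
 *     ((void (*) (void))code) ();
 *
 * assembler_link() takes a NULL-terminated list of fragments and
 * returns a pointer to the executable copy of the linked code.
 */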
uint8_t *
assembler_link (assembler_t *as, fragment_t *fragment, ...)
{
    array_t *annotations = NULL;
    annotation_t ann, *a, *sentinel;
    uint8_t *writable, *executable;
    size_t n_annotations;
    size_t bits, mask;
    size_t offset;
    va_list list;
    annotation_t *tmp;

    if (as->verbose)
        print_features (as);

    executable = NULL;

    va_start (list, fragment);

    if (!array_create (&annotations, sizeof (annotation_t)))
        goto oom;

    offset = 0;
    while (fragment)
    {
        size_t n_bytes;
        uint8_t *code = array_get_data (&fragment->code, &n_bytes);

        ann.type = FRAGMENT;
        ann.fragment = code;
        ann.offset = offset;
        ann.size = 0;

        if (!add_annotation (&annotations, &ann))
            goto oom;

        a = array_get_data (&fragment->annotations, &n_annotations);
        while (n_annotations--)
        {
            if (!(tmp = add_annotation (&annotations, a++)))
                goto oom;

            tmp->offset += offset;
        }

        offset += n_bytes;

        fragment = va_arg (list, fragment_t *);
    }

    /* Add sentinel annotation to determine size of code */
    ann.type = NONE;
    ann.offset = offset;
    if (!(sentinel = add_annotation (&annotations, &ann)))
        goto oom;

    if (!gather_labels (annotations))
        goto oom;

    resolve_refs (annotations, &bits, &mask);

    if (!code_manager_alloc (
            as->code_manager, "hello", sentinel->offset,
            bits, mask, &writable, &executable))
    {
        executable = NULL;
        goto oom;
    }

    copy_code (as, writable, annotations);

    as->last_size = sentinel->offset;

oom:
    va_end (list);
    array_free (&annotations);
    return executable;
}

size_t
assembler_get_last_size (assembler_t *as)
{
    return as->last_size;
}

void
assembler_free (assembler_t *as, uint8_t *code)
{
    /* FIXME */
}

fragment_t *
fragment_new (assembler_t *as)
{
    fragment_t *frag;

    if (!(frag = malloc (sizeof *frag)))
        goto oom_fragment;
    if (!array_create (&frag->annotations, sizeof (annotation_t)))
        goto oom_annotations;
    if (!array_create (&frag->code, sizeof (uint8_t)))
        goto oom_code;

    frag->as = as;
    frag->error = FALSE;

    return frag;

oom_code:
    array_free (&frag->annotations);
oom_annotations:
    free (frag);
oom_fragment:
    return NULL;
}

#define GET_TYPE(op)                            \
    ((op) & ((1 << 6) - 1))
#define GET_IMM(op)                             \
    (((int64_t)(op)) >> 6)
#define GET_LABEL(op)                           \
    ((const char *)(size_t)((op) >> 6))
#define GET_REGNO(op)                           \
    ((op) >> 6)

static bool_t
in_64_bit_mode (void)
{
#if defined(__amd64__) || defined(__x86_64__) || defined(_M_AMD64)
    return TRUE;
#else
    return FALSE;
#endif
}

static int
is_type (op_t op, uint32_t types)
{
    return (types & (1 << GET_TYPE (op))) != 0;
}

static int
is_reg (op_t op)
{
    return is_type (op, A_R8 | A_R | A_MMX | A_SSE | A_AVX | (1 << OP_FAKE_REG));
}

static uint8_t *
emit_address_byte (uint8_t *code, int m, int o, int r, uint8_t *e1, uint8_t *e2)
{
    if (e1)
        *e1 = (o & 0x8) >> 3;
    if (e2)
        *e2 = (r & 0x8) >> 3;

    *code++ = ((m & 0x03) << 6) | ((o & 0x07) << 3) | ((r & 0x07));

    return code;
}

static void
copy_string (char *dst, const char *src, int size)
{
    if (strlen (src) >= size)
    {
        assert (0);
        return;
    }

    strncpy (dst, src, size);
    dst[size - 1] = '\0';
}
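/* emit_reg_regm() below builds the ModRM byte (and SIB byte when
 * needed) for one register operand and one register-or-memory
 * operand. The ModRM layout is mod(2) | reg(3) | rm(3); for example
 * mov eax, ecx with opcode 0x8b encodes as 0x8b 0xc1, i.e. mod 3,
 * reg 0 (eax), rm 1 (ecx). The fourth bit of extended registers is
 * routed into the REX/VEX r, x and b flags by emit_address_byte().
 * The special cases mirror the hardware rules: ebp/r13 as a base
 * always need a displacement, and esp/r12 as a base always need a
 * SIB byte.
 */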
static uint8_t *
emit_reg_regm (uint8_t *c,
               op_t reg, op_t regm,
               uint8_t *r, uint8_t *x, uint8_t *b,
               annotation_t *ann)
{
    int reg_regno;

    assert (is_reg (reg));

    reg_regno = GET_REGNO (reg);

    if (is_type (regm,
                 (1 << OP_RIP_REL8) | (1 << OP_RIP_REL16) |
                 (1 << OP_RIP_REL32) | (1 << OP_RIP_REL64) |
                 (1 << OP_RIP_REL)))
    {
        c = emit_address_byte (c, 0, reg_regno, 5, r, NULL);

        ann->type = RIP_REF;
        ann->offset = (size_t)c;
        copy_string (ann->name, GET_LABEL (regm), sizeof ann->name);
    }
    else if (is_reg (regm))
    {
        c = emit_address_byte (c, 3, reg_regno, GET_REGNO (regm), r, b);
    }
    else
    {
        int shift = (regm >> 6) & 0x3;
        reg_t base = (regm >> 18) & 0x3ff;
        reg_t index = (regm >> 8) & 0x3ff;
        int32_t disp = (regm >> 32);
        int base_regno = GET_REGNO (base);
        int index_regno = GET_REGNO (index);
        int ebp_regno = GET_REGNO (ebp);
        int esp_regno = GET_REGNO (esp);
        int r12_regno = GET_REGNO (r12);
        int r13_regno = GET_REGNO (r13);

        if (index_regno == esp_regno)
        {
            printf ("esp/rsp can't be used as an index register\n");
            abort ();
        }

        if (index == NO_REG)
        {
            if (base == NO_REG && in_64_bit_mode())
            {
                c = emit_address_byte (c, 0, reg_regno, 4, r, NULL);
                c = emit_address_byte (c, 0, 4, 5, NULL, NULL);
                c = emit_imm32 (c, disp);
            }
            else if (base == rip || base == NO_REG)
            {
                /* In 32 bit mode this is an absolute address. In 64 bit
                 * mode, it is a RIP-based address.
                 */
                c = emit_address_byte (c, 0, reg_regno, 5, r, NULL);
                c = emit_imm32 (c, disp);
            }
            else if (base_regno == esp_regno || base_regno == r12_regno)
            {
                if (disp == 0)
                {
                    c = emit_address_byte (c, 0, reg_regno, esp_regno, r, NULL);
                    c = emit_address_byte (c, 0, esp_regno, base_regno, x, b);
                }
                else if (IS_S8 (disp))
                {
                    c = emit_address_byte (c, 1, reg_regno, esp_regno, r, NULL);
                    c = emit_address_byte (c, 0, esp_regno, base_regno, x, b);
                    c = emit_imm8 (c, disp);
                }
                else
                {
                    c = emit_address_byte (c, 2, reg_regno, esp_regno, r, NULL);
                    c = emit_address_byte (c, 0, esp_regno, base_regno, x, b);
                    c = emit_imm32 (c, disp);
                }
            }
            else if (disp == 0 && base_regno != ebp_regno && base_regno != r13_regno)
            {
                c = emit_address_byte (c, 0, reg_regno, base_regno, r, b);
            }
            else if (IS_S8 (disp))
            {
                c = emit_address_byte (c, 1, reg_regno, base_regno, r, b);
                c = emit_imm8 (c, disp);
            }
            else
            {
                c = emit_address_byte (c, 2, reg_regno, base_regno, r, b);
                c = emit_imm32 (c, disp);
            }
        }
        else if (base == NO_REG)
        {
            c = emit_address_byte (c, 0, reg_regno, 4, r, NULL);
            c = emit_address_byte (c, shift, index_regno, 5, x, b);
            c = emit_imm32 (c, disp);
        }
        else if (disp == 0 && base_regno != ebp_regno && base_regno != r13_regno)
        {
            c = emit_address_byte (c, 0, reg_regno, 4, r, NULL);
            c = emit_address_byte (c, shift, index_regno, base_regno, x, b);
        }
        else if (IS_S8 (disp))
        {
            c = emit_address_byte (c, 1, reg_regno, 4, r, NULL);
            c = emit_address_byte (c, shift, index_regno, base_regno, x, b);
            c = emit_imm8 (c, disp);
        }
        else
        {
            c = emit_address_byte (c, 2, reg_regno, 4, r, NULL);
            c = emit_address_byte (c, shift, index_regno, base_regno, x, b);
            c = emit_imm32 (c, disp);
        }
    }

    return c;
}

static uint8_t *
emit_opcode (uint8_t *c, uint32_t opc)
{
    if (opc & 0xff000000)
        *c++ = (opc & 0xff000000) >> 24;
    if (opc & 0xff0000)
        *c++ = (opc & 0xff0000) >> 16;
    if (opc & 0xff00)
        *c++ = (opc & 0xff00) >> 8;
    if (opc & 0xff)
        *c++ = (opc & 0xff) >> 0;
    if (!opc)
        *c++ = 0;

    return c;
}

static uint8_t *
emit_imm (uint8_t *c, arg_type_t size, int32_t immediate)
{
    if (size == A_I8)
        c = emit_imm8 (c, immediate);
    else if (size == A_I16)
        c = emit_imm16 (c, immediate);
    else if (size == A_I32)
        c = emit_imm32 (c, immediate);

    return c;
}
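/* Operand and address size handling, used by emit() below: in 64 bit
 * mode a 32 bit address size is requested with a 0x67 prefix, a
 * 16 bit operand size with 0x66, and a 64 bit operand size with
 * REX.W, unless the variant sets NO_REX_W because the instruction
 * defaults to 64 bits (push and pop, for example).
 */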
static void
compute_sizes (int n_ops, const op_t ops[4], int *op_size, int *address_size)
{
    int i;

    if (in_64_bit_mode())
        *address_size = 64;
    else
        *address_size = 32;

    *op_size = 0;
    for (i = 0; i < n_ops; ++i)
    {
        if (is_type (ops[i], A_R8 | A_MEM8) && *op_size < 8)
            *op_size = 8;
        if (is_type (ops[i], A_R16 | A_MEM16) && *op_size < 16)
            *op_size = 16;
        else if (is_type (ops[i], A_R32 | A_MEM32) && *op_size < 32)
            *op_size = 32;
        else if (is_type (ops[i], A_R64 | A_MEM64) && *op_size < 64)
            *op_size = 64;

        if (is_type (ops[i], (1 << OP_MEM8) | (1 << OP_MEM16) |
                     (1 << OP_MEM32) | (1 << OP_MEM64) | (1 << OP_MEM)))
        {
            reg_t base = (ops[i] >> 18) & 0x3ff;
            reg_t index = (ops[i] >> 8) & 0x3ff;

            if (is_type (base, A_R32) || is_type (index, A_R32))
                *address_size = 32;
        }
    }
}

static void
emit (fragment_t *frag, const variant_t *variant, const op_t ops[4])
{
    uint8_t code[16];
    uint8_t prefixes[16];
    uint8_t *c, *d, *p;
    uint8_t w, r, x, b;
    size_t offset;
    uint32_t opc;
    int regno;
    int op_size, address_size;
    annotation_t ann;
    int vex_regno;

    ann.type = NONE;
    c = code;
    p = prefixes;
    w = r = x = b = FALSE;

    /* Emit opcode */
    if (variant->encoding != E_ANNOTATE && variant->encoding != E_D)
    {
        if (variant->encoding == E_O)
        {
            regno = GET_REGNO (ops[0]);
            opc = variant->opcode + (regno & 0x7);
            b = regno >> 3;
        }
        else
        {
            opc = variant->opcode;
        }

        c = emit_opcode (c, opc);
    }

    /* Emit operands */
    vex_regno = 0x0;
    switch (variant->encoding)
    {
    case E_RM:
        c = emit_reg_regm (c, ops[0], ops[1], &r, &x, &b, &ann);
        break;

    case E_MR:
        c = emit_reg_regm (c, ops[1], ops[0], &r, &x, &b, &ann);
        break;

    case E_M:
        regno = GET_OP_EXTENSION (variant->info);
        c = emit_reg_regm (c, ((regno << 6) | OP_FAKE_REG), ops[0], &r, &x, &b, &ann);
        break;

    case E_VM:
        regno = GET_OP_EXTENSION (variant->info);
        c = emit_reg_regm (c, ((regno << 6) | OP_FAKE_REG), ops[1], &r, &x, &b, &ann);
        vex_regno = GET_REGNO (ops[0]);
        break;

    case E_RVM:
        c = emit_reg_regm (c, ops[0], ops[2], &r, &x, &b, &ann);
        vex_regno = GET_REGNO (ops[1]);
        break;

    case E_RVMI:
        c = emit_reg_regm (c, ops[0], ops[2], &r, &x, &b, &ann);
        c = emit_imm8 (c, GET_REGNO (ops[3]) << 4);
        vex_regno = GET_REGNO (ops[1]);
        break;

    case E_RVMX:
        c = emit_reg_regm (c, ops[0], ops[2], &r, &x, &b, &ann);
        c = emit_imm8 (c, GET_OP_EXTENSION (variant->info));
        vex_regno = GET_REGNO (ops[1]);
        break;

    case E_RVIM:
        c = emit_reg_regm (c, ops[0], ops[3], &r, &x, &b, &ann);
        c = emit_imm8 (c, GET_REGNO (ops[2]) << 4);
        vex_regno = GET_REGNO (ops[1]);
        break;

    case E_MVR:
        c = emit_reg_regm (c, ops[2], ops[0], &r, &x, &b, &ann);
        vex_regno = GET_REGNO (ops[1]);
        break;

    case E_RMV:
        c = emit_reg_regm (c, ops[0], ops[1], &r, &x, &b, &ann);
        vex_regno = GET_REGNO (ops[2]);
        break;

    case E_ANNOTATE:
        ann.type = GET_ANNOTATION_TYPE (variant->info);
        ann.offset = (size_t)c;

        if (variant->ops[0] == A_LABEL)
            copy_string (ann.name, GET_LABEL (ops[0]), sizeof ann.name);

        if (ann.type == ALIGN)
        {
            ann.align_mask = 1;
            while (ann.align_mask < GET_IMM (ops[0]))
                ann.align_mask <<= 1;
            ann.align_mask -= 1;

            ann.align_bits = GET_IMM (ops[0]) & ann.align_mask;
        }
        else if (ann.type == JCC)
        {
            ann.opcode = variant->opcode;
        }
        break;

    case E_NP:
    case E_O:
    case E_D:
        break;

    case E_N_ENCODINGS:
        assert (!"should not be reached");
        break;
    }

    /* Emit prefixes */
    compute_sizes (GET_N_OPS (variant->inst), ops, &op_size, &address_size);

    if (in_64_bit_mode() && address_size == 32)
        *p++ = 0x67;

    if (op_size == 16)
        *p++ = 0x66;
    else if (op_size == 64 && !(variant->info & NO_REX_W))
        w = TRUE;

    if (variant->info & PRE_66)
        *p++ = 0x66;
    if (variant->info & PRE_F3)
        *p++ = 0xf3;
    if (variant->info & PRE_F2)
        *p++ = 0xf2;
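    /* Two VEX forms exist: the two-byte 0xc5 form can only express
     * the 0x0f opcode map with x = b = 0 and an ignored W bit, so it
     * is used when those conditions hold; otherwise the three-byte
     * 0xc4 form (or 0x8f for XOP) carries the full m-mmmm, W, X and
     * B fields. Note that R, X, B and vvvv are stored inverted.
     */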
    if (variant->info & (NEED_VEX | NEED_XOP))
    {
        uint8_t mmmm = GET_VEX_OPCODE (variant->info);
        uint8_t l = GET_VEX_LEN (variant->info);
        uint8_t w = GET_VEX_W (variant->info);  /* VEX.W; shadows the REX.W flag above */
        uint8_t pp = GET_VEX_PREFIX (variant->info);
        uint8_t vvvv = vex_regno;

        if ((variant->info & NEED_VEX) && mmmm == 0x01 && !x && !b && w == WIG)
        {
            *p++ = 0xc5;
            *p++ = ((!r) << 7) | (((~vvvv) & 0xf) << 3) | (l << 2) | pp;
        }
        else
        {
            *p++ = (variant->info & NEED_XOP)? 0x8f : 0xc4;
            *p++ = ((!r) << 7) | ((!x) << 6) | ((!b) << 5) | mmmm;
            *p++ = (w << 7) | (((~vvvv) & 0xf) << 3) | (l << 2) | pp;
        }
    }
    else
    {
        uint8_t rex = (w << 3) | (r << 2) | (x << 1) | (b << 0);

        if (rex)
            *p++ = 0x40 | rex;
    }

    /* Emit immediates */
    if (variant->encoding != E_ANNOTATE)
    {
        int i;

        for (i = 3; i >= 0; --i)
            c = emit_imm (c, variant->ops[i], GET_IMM (ops[i]));
    }

    array_get_data (&frag->code, &offset);

    if (frag->as->verbose)
    {
        printf ("%4zu: %8s | ", offset, GET_NAME (variant->inst));

        for (d = prefixes; d < p; ++d)
            printf ("%02x ", *d);

        printf (". ");

        for (d = code; d < c; ++d)
            printf ("%02x ", *d);

        if (ann.type != NONE)
            printf ("[annotation] ");

        printf ("\n");
    }

    if (ann.type != NONE)
    {
        ann.offset += offset + (p - prefixes) - (size_t)code;
        ann.size = 0;

        add_annotation (&frag->annotations, &ann);
    }

    array_append (&frag->code, (p - prefixes) + (c - code), &d);
    memcpy (d, prefixes, p - prefixes);
    memcpy (d + (p - prefixes), code, c - code);
}

void
fragment_assemble (fragment_t *frag, const uint64_t *code)
{
    uint64_t inst;

    while ((inst = *code++) != 0)
    {
        const variant_t *variant = NULL;
        int n_ops;
        int serial;

        serial = GET_SERIAL (inst);

        if (GET_TYPE (inst) != OP_INST || serial >= I_n_instructions)
        {
            printf ("%" PRIx64 " is not a valid instruction\n", inst);
            abort ();
        }

        variant = frag->as->first_variant[serial];
        n_ops = GET_N_OPS (variant->inst);

        while (variant->inst == inst)
        {
            int j;

            for (j = 0; j < n_ops; ++j)
            {
                if (!is_type (code[j], variant->ops[j]))
                    goto next_variant;
            }

            emit (frag, variant, code);
            goto next_instruction;

        next_variant:
            variant++;
        }

        printf ("Operand mismatch for \"%s\"\n", GET_NAME (inst));
        abort ();

    next_instruction:
        code += n_ops;
    }
}
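/* A hypothetical example program for fragment_assemble(), written
 * with the operand macros used in detect_features() above:
 *
 *     static const uint64_t program[] =
 *     {
 *         I_mov, eax, IMM (42),
 *         I_ret,
 *         0
 *     };
 *
 * Each instruction word is matched against the variants table in
 * order, and the first variant whose argument types accept all of
 * the operands is emitted.
 */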