diff options
author | L. E. Segovia <amy@centricular.com> | 2023-12-28 17:05:22 -0300 |
---|---|---|
committer | L. E. Segovia <amy@centricular.com> | 2023-12-29 13:01:27 +0000 |
commit | 23c36d52df371c100311f4f790639e9a7c8af033 (patch) | |
tree | 70265d4a916712aa5f2f3353794665fefa38bdf5 | |
parent | 1fb793ea5aabb6a5b16308465f63c5722437b20b (diff) |
avx: Implement convsssql
-rw-r--r-- | orc/orcrules-avx.c | 29 | ||||
-rw-r--r-- | orc/orcx86insn.c | 87 | ||||
-rw-r--r-- | orc/orcx86insn.h | 13 | ||||
-rw-r--r-- | testsuite/orcc/test.orc | 22 |
4 files changed, 140 insertions, 11 deletions
diff --git a/orc/orcrules-avx.c b/orc/orcrules-avx.c index 4879e1f..ee3888f 100644 --- a/orc/orcrules-avx.c +++ b/orc/orcrules-avx.c @@ -5,6 +5,7 @@ #include <orc/orcavx-internal.h> +#include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -3008,6 +3009,33 @@ avx_rule_convsuslw_avx2 (OrcCompiler *p, void *user, OrcInstruction *insn) } } +static void +avx_rule_convsssql_avx2 (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + const int src = p->vars[insn->src_args[0]].alloc; + const int dest = p->vars[insn->dest_args[0]].alloc; + const int size = p->vars[insn->src_args[0]].size << p->loop_shift; + const int tmpc_max = orc_compiler_get_temp_constant (p, 8, INT32_MAX); + const int tmpc_min = orc_compiler_get_temp_constant (p, 8, INT32_MIN); + const int tmp = orc_compiler_get_temp_reg (p); + + if (size >= 32) { + orc_avx_emit_pcmpgtq (p, src, tmpc_max, tmp); + orc_avx_emit_blendvpd (p, src, tmpc_max, tmp, dest); + orc_avx_emit_pcmpgtq (p, dest, tmpc_min, tmp); + orc_avx_emit_blendvpd (p, tmpc_min, dest, tmp, dest); + // full interleave required again + orc_avx_emit_pshufd (p, ORC_AVX_SSE_SHUF (3, 1, 2, 0), dest, dest); + orc_avx_emit_permute4x64_imm (p, ORC_AVX_SSE_SHUF (3, 1, 2, 0), dest, dest); + } else { + orc_avx_sse_emit_pcmpgtq (p, src, tmpc_max, tmp); + orc_avx_sse_emit_blendvpd (p, src, tmpc_max, tmp, dest); + orc_avx_sse_emit_pcmpgtq (p, dest, tmpc_min, tmp); + orc_avx_sse_emit_blendvpd (p, tmpc_min, dest, tmp, dest); + orc_avx_sse_emit_pshufd (p, ORC_AVX_SSE_SHUF (3, 1, 2, 0), dest, dest); + } +} + void orc_compiler_avx_register_rules (OrcTarget *target) { @@ -3239,6 +3267,7 @@ orc_compiler_avx_register_rules (OrcTarget *target) REGISTER_RULE_WITH_GENERIC (convulq, convulq_avx2); REGISTER_RULE_WITH_GENERIC (convssslw, convssslw_avx2); REGISTER_RULE_WITH_GENERIC (convsuslw, convsuslw_avx2); + REGISTER_RULE_WITH_GENERIC (convsssql, convsssql_avx2); REGISTER_RULE_WITH_GENERIC (mulslq, mulslq_avx2); REGISTER_RULE_WITH_GENERIC (mulhsl, mulhsl_avx2); REGISTER_RULE_WITH_GENERIC (cmpeqq, cmpeqq_avx2); diff --git a/orc/orcx86insn.c b/orc/orcx86insn.c index a3a8485..0ae3344 100644 --- a/orc/orcx86insn.c +++ b/orc/orcx86insn.c @@ -14,6 +14,9 @@ #include <orc/orcsse.h> #include <orc/orcmmx.h> +#include <orc/orcx86insn.h> + + #define ORC_VEX_3_BIT 0xC4 #define ORC_VEX_2_BIT 0xC5 @@ -296,6 +299,7 @@ static const OrcSysOpcode orc_x86_opcodes[] = { { "pinsrd", ORC_X86_INSN_TYPE_IMM8_REGM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x22 }, { "perm2i128", ORC_X86_INSN_TYPE_IMM8_MMXM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x46 }, { "pblendd", ORC_X86_INSN_TYPE_IMM8_MMXM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x02 }, + { "blendvpd", ORC_X86_INSN_TYPE_MMXM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x4b }, }; static void @@ -412,6 +416,7 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn) char src_op[40] = { 0 }; char dst_op[40] = { 0 }; char src_2nd_op[40] = { 0 }; + char src_3rd_op[40] = { 0 }; const OrcX86OpcodePrefix is_sse = get_common_reg_type(xinsn); @@ -578,9 +583,7 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn) const int operand2 = xinsn->src[0]; if (may_have_avx_operand && xinsn->src[1] != 0) { - // TODO: implement and figure out more opcode types switch (xinsn->opcode->type) { - case ORC_X86_INSN_TYPE_MMXM_MMX: case ORC_X86_INSN_TYPE_SSEM_SSE: case ORC_X86_INSN_TYPE_IMM8_MMXM_MMX: case ORC_X86_INSN_TYPE_IMM8_MMX_SHIFT: @@ -594,6 +597,26 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn) sprintf(src_2nd_op, "%%%s, ", orc_x86_get_simd_regname (operand2, ORC_X86_AVX_VEX128_PREFIX)); break; + case ORC_X86_INSN_TYPE_MMXM_MMX: + // In all cases it can be either a pointer to, or a XMM/YMM register + // Intel Intrinsics Manual s.2.3.9 + if (xinsn->type == ORC_X86_RM_REG) { + sprintf(src_2nd_op, "%%%s, ", + orc_x86_get_simd_regname (operand2, is_sse)); + } else if (xinsn->type == ORC_X86_RM_MEMOFFSET) { + sprintf(src_2nd_op, "%d(%%%s), ", xinsn->offset, + orc_x86_get_regname_ptr (p, xinsn->src[2])); + } else if (xinsn->type == ORC_X86_RM_MEMINDEX) { + sprintf(src_2nd_op, "%d(%%%s,%%%s,%d), ", xinsn->offset, + orc_x86_get_regname_ptr (p, xinsn->src[2]), + orc_x86_get_regname_ptr (p, xinsn->index_reg), + 1 << xinsn->shift); + } else { + ORC_COMPILER_ERROR(p, "Unhandled instruction type %d for 4th operand", xinsn->type); + ORC_ASSERT(0); + return; + } + break; case ORC_X86_INSN_TYPE_REGM_MMX: case ORC_X86_INSN_TYPE_MMXM_MMX_REV: case ORC_X86_INSN_TYPE_SSEM_SSE_REV: @@ -623,6 +646,24 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn) } } + if (xinsn->src[2] != 0) { + switch (xinsn->opcode->type) { + case ORC_X86_INSN_TYPE_MMXM_MMX: + if (may_have_avx_operand) { + sprintf(src_3rd_op, "%%%s, ", + orc_x86_get_simd_regname (xinsn->src[2], is_sse)); + } else { + ORC_COMPILER_ERROR(p, "Blends on SSE require XMM0 as the mask, this cannot be guaranteed"); + ORC_ASSERT(0); + return; + } + break; + default: + ORC_COMPILER_ERROR(p, "Unhandled instruction type %d for 4th operand", xinsn->type); + ORC_ASSERT(0); + return; + } + } // Handle destinations switch (xinsn->opcode->type) { @@ -721,13 +762,8 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn) } if (xinsn->prefix == ORC_X86_AVX_VEX128_PREFIX || xinsn->prefix == ORC_X86_AVX_VEX256_PREFIX) { - if (src_2nd_op[0] != '\0') { - ORC_ASM_CODE(p," v%s %s%s%s%s\n", xinsn->opcode->name, - imm_str, src_op, src_2nd_op, dst_op); - } else { - ORC_ASM_CODE(p," v%s %s%s%s\n", xinsn->opcode->name, - imm_str, src_op, dst_op); - } + ORC_ASM_CODE(p," v%s %s%s%s%s%s\n", xinsn->opcode->name, + imm_str, src_op, src_2nd_op, src_3rd_op, dst_op); } else { ORC_ASM_CODE(p," %s %s%s%s\n", xinsn->opcode->name, imm_str, src_op, dst_op); @@ -1202,8 +1238,17 @@ static void orc_vex_insn_output_immediate (OrcCompiler *const p, const OrcX86Insn *const xinsn) { switch ((OrcX86InsnType)xinsn->opcode->type) { - case ORC_X86_INSN_TYPE_REGM_MMX: case ORC_X86_INSN_TYPE_MMXM_MMX: + switch (xinsn->opcode_index) { + // Complete here with VPBLENDVB and VBLENDVPS when implemented + // Intel Intrinsics Manual s.2.3.9 + case ORC_X86_blendvpd_avx: + *p->codeptr++ = (xinsn->src[2] & 0xF) << 4; + default: + break; + } + break; + case ORC_X86_INSN_TYPE_REGM_MMX: case ORC_X86_INSN_TYPE_SSEM_AVX: case ORC_X86_INSN_TYPE_MMXM_MMX_REV: case ORC_X86_INSN_TYPE_SSEM_SSE_REV: @@ -2014,6 +2059,7 @@ orc_vex_emit_cpuinsn_size (OrcCompiler *const p, const int index, xinsn->prefix = prefix; xinsn->src[0] = src0; xinsn->src[1] = src1; + xinsn->src[2] = 0; xinsn->dest = dest; xinsn->type = ORC_X86_RM_REG; xinsn->size = size; @@ -2031,6 +2077,7 @@ orc_vex_emit_cpuinsn_imm (OrcCompiler *const p, const int index, const int imm, xinsn->imm = imm; xinsn->src[0] = src0; xinsn->src[1] = src1; + xinsn->src[2] = 0; xinsn->dest = dest; xinsn->type = ORC_X86_RM_REG; xinsn->size = 4; @@ -2050,6 +2097,7 @@ orc_vex_emit_cpuinsn_load_memoffset (OrcCompiler *const p, const int index, xinsn->imm = imm; xinsn->src[0] = src0; xinsn->src[1] = src1; + xinsn->src[2] = 0; xinsn->dest = dest; xinsn->type = ORC_X86_RM_MEMOFFSET; xinsn->offset = offset; @@ -2094,3 +2142,22 @@ orc_vex_emit_cpuinsn_load_memindex (OrcCompiler *const p, const int index, const xinsn->shift = shift; xinsn->size = size; } + +void +orc_vex_emit_blend_size (OrcCompiler *const p, const int index, + const int size, const int src0, const int src1, const int src2, + const int dest, const OrcX86OpcodePrefix prefix) +{ + OrcX86Insn *xinsn = orc_x86_get_output_insn (p); + const OrcSysOpcode *const opcode = orc_x86_opcodes + index; + + xinsn->opcode_index = index; + xinsn->opcode = opcode; + xinsn->prefix = prefix; + xinsn->src[0] = src0; + xinsn->src[1] = src1; + xinsn->src[2] = src2; + xinsn->dest = dest; + xinsn->type = ORC_X86_RM_REG; + xinsn->size = size; +} diff --git a/orc/orcx86insn.h b/orc/orcx86insn.h index 20ae099..4db30db 100644 --- a/orc/orcx86insn.h +++ b/orc/orcx86insn.h @@ -302,6 +302,7 @@ typedef enum ORC_X86_pinsrd, ORC_X86_permute2i128_avx, ORC_X86_pblendd_avx, + ORC_X86_blendvpd_avx, } OrcX86Opcode; typedef enum { @@ -330,7 +331,7 @@ struct _OrcX86Insn { // Immediate mode operand int imm; // Source register(s)/address - int src[2]; + int src[3]; // Destination int dest; // Operand size @@ -367,6 +368,9 @@ ORC_API void orc_vex_emit_cpuinsn_store_memoffset (OrcCompiler *p, int index, ORC_API void orc_vex_emit_cpuinsn_load_memindex (OrcCompiler *p, int index, int size, int imm, int offset, int src, int src_index, int shift, int dest, OrcX86OpcodePrefix prefix); +ORC_API void orc_vex_emit_blend_size (OrcCompiler *p, int opcode, int size, + int src0, int src1, int src2, int dest, OrcX86OpcodePrefix prefix); + #define orc_sse_emit_punpcklbw(p,a,b) orc_x86_emit_cpuinsn_size(p, ORC_X86_punpcklbw, 16, a, b) #define orc_avx_sse_emit_punpcklbw(p,s1,s2,d) orc_vex_emit_cpuinsn_size(p, ORC_X86_punpcklbw, 16, s1, s2, d, ORC_X86_AVX_VEX128_PREFIX) #define orc_avx_emit_punpcklbw(p,s1,s2,d) orc_vex_emit_cpuinsn_size(p, ORC_X86_punpcklbw, 32, s1, s2, d, ORC_X86_AVX_VEX256_PREFIX) @@ -993,6 +997,13 @@ ORC_API void orc_vex_emit_cpuinsn_load_memindex (OrcCompiler *p, int index, int #define orc_avx_emit_blendpd(p,imm,s1,s2,d) orc_vex_emit_cpuinsn_imm(p, ORC_X86_blendpd_avx, imm, s1, s2, d, ORC_X86_AVX_VEX256_PREFIX) #define orc_avx_emit_pblendd(p,imm,s1,s2,d) orc_vex_emit_cpuinsn_imm(p, ORC_X86_pblendd_avx, imm, s1, s2, d, ORC_X86_AVX_VEX256_PREFIX) +#define orc_avx_sse_emit_blendvpd(p, s1, s2, mask, d) \ + orc_vex_emit_blend_size (p, ORC_X86_blendvpd_avx, 1, s1, s2, mask, d, \ + ORC_X86_AVX_VEX128_PREFIX) +#define orc_avx_emit_blendvpd(p, s1, s2, mask, d) \ + orc_vex_emit_blend_size (p, ORC_X86_blendvpd_avx, 1, s1, s2, mask, d, \ + ORC_X86_AVX_VEX256_PREFIX) + #define orc_avx_sse_emit_pinsrd_register(p, imm, s1, s2, d) \ orc_vex_emit_cpuinsn_imm (p, ORC_X86_pinsrd, imm, s1, s2, d, \ ORC_X86_AVX_VEX128_PREFIX) diff --git a/testsuite/orcc/test.orc b/testsuite/orcc/test.orc index 371130a..a565739 100644 --- a/testsuite/orcc/test.orc +++ b/testsuite/orcc/test.orc @@ -940,3 +940,25 @@ convfl d1, s1 convlf d1, s1 +.function adder_orc_volume_u32 +.dest 4 d1 guint32 +.param 4 p1 +.const 4 c1 0x80000000 +.temp 8 t1 +.temp 4 t2 + +xorl t2, d1, c1 +mulslq t1, t2, p1 +shrsq t1, t1, 27 +convsssql t2, t1 +xorl d1, t2, c1 + + +.function adder_orc_volume_s32 +.dest 4 d1 gint32 +.param 4 p1 +.temp 8 t1 + +mulslq t1, d1, p1 +shrsq t1, t1, 27 +convsssql d1, t1 |