summary refs log tree commit diff
diff options
context:
space:
mode:
author: L. E. Segovia <amy@centricular.com> 2023-12-28 17:05:22 -0300
committer: L. E. Segovia <amy@centricular.com> 2023-12-29 13:01:27 +0000
commit: 23c36d52df371c100311f4f790639e9a7c8af033 (patch)
tree: 70265d4a916712aa5f2f3353794665fefa38bdf5
parent: 1fb793ea5aabb6a5b16308465f63c5722437b20b (diff)
avx: Implement convsssql
-rw-r--r-- orc/orcrules-avx.c 29
-rw-r--r-- orc/orcx86insn.c 87
-rw-r--r-- orc/orcx86insn.h 13
-rw-r--r-- testsuite/orcc/test.orc 22
4 files changed, 140 insertions, 11 deletions
diff --git a/orc/orcrules-avx.c b/orc/orcrules-avx.c
index 4879e1f..ee3888f 100644
--- a/orc/orcrules-avx.c
+++ b/orc/orcrules-avx.c
@@ -5,6 +5,7 @@
#include <orc/orcavx-internal.h>
+#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -3008,6 +3009,33 @@ avx_rule_convsuslw_avx2 (OrcCompiler *p, void *user, OrcInstruction *insn)
}
}
+static void
+avx_rule_convsssql_avx2 (OrcCompiler *p, void *user, OrcInstruction *insn) // AVX rule registered for the convsssql opcode; `user` is not referenced in this body
+{
+ const int src = p->vars[insn->src_args[0]].alloc; // `.alloc` of the first source variable (presumably its SIMD register)
+ const int dest = p->vars[insn->dest_args[0]].alloc; // `.alloc` of the first destination variable
+ const int size = p->vars[insn->src_args[0]].size << p->loop_shift; // source variable size scaled by the loop shift; selects the branch below
+ const int tmpc_max = orc_compiler_get_temp_constant (p, 8, INT32_MAX); // temp constant requested with size 8, value INT32_MAX (upper bound)
+ const int tmpc_min = orc_compiler_get_temp_constant (p, 8, INT32_MIN); // temp constant requested with size 8, value INT32_MIN (lower bound)
+ const int tmp = orc_compiler_get_temp_reg (p); // scratch register; passed as the `mask` argument of the blends
+
+ if (size >= 32) { // wide path: orc_avx_emit_* emitters plus an extra permute4x64 step
+ orc_avx_emit_pcmpgtq (p, src, tmpc_max, tmp); // presumably tmp = per-64-bit-lane (src > INT32_MAX) — confirm operand order
+ orc_avx_emit_blendvpd (p, src, tmpc_max, tmp, dest); // blend s1=src, s2=tmpc_max under mask=tmp into dest (clamp to upper bound)
+ orc_avx_emit_pcmpgtq (p, dest, tmpc_min, tmp); // presumably tmp = per-lane (dest > INT32_MIN) — confirm operand order
+ orc_avx_emit_blendvpd (p, tmpc_min, dest, tmp, dest); // blend s1=tmpc_min, s2=dest under mask=tmp into dest (clamp to lower bound)
+ // full interleave required again
+ orc_avx_emit_pshufd (p, ORC_AVX_SSE_SHUF (3, 1, 2, 0), dest, dest); // in-place shuffle; presumably gathers the low 32 bits of each 64-bit lane
+ orc_avx_emit_permute4x64_imm (p, ORC_AVX_SSE_SHUF (3, 1, 2, 0), dest, dest); // in-place 64-bit permute with the same selector; only emitted on this path
+ } else { // narrow path: same compare/blend/shuffle sequence via the orc_avx_sse_emit_* emitters
+ orc_avx_sse_emit_pcmpgtq (p, src, tmpc_max, tmp); // compare src against the INT32_MAX constant into tmp
+ orc_avx_sse_emit_blendvpd (p, src, tmpc_max, tmp, dest); // blend src / tmpc_max under mask tmp into dest
+ orc_avx_sse_emit_pcmpgtq (p, dest, tmpc_min, tmp); // compare the partially clamped dest against the INT32_MIN constant
+ orc_avx_sse_emit_blendvpd (p, tmpc_min, dest, tmp, dest); // blend tmpc_min / dest under mask tmp into dest
+ orc_avx_sse_emit_pshufd (p, ORC_AVX_SSE_SHUF (3, 1, 2, 0), dest, dest); // final in-place shuffle; no permute4x64 on this path
+ }
+}
+
void
orc_compiler_avx_register_rules (OrcTarget *target)
{
@@ -3239,6 +3267,7 @@ orc_compiler_avx_register_rules (OrcTarget *target)
REGISTER_RULE_WITH_GENERIC (convulq, convulq_avx2);
REGISTER_RULE_WITH_GENERIC (convssslw, convssslw_avx2);
REGISTER_RULE_WITH_GENERIC (convsuslw, convsuslw_avx2);
+ REGISTER_RULE_WITH_GENERIC (convsssql, convsssql_avx2);
REGISTER_RULE_WITH_GENERIC (mulslq, mulslq_avx2);
REGISTER_RULE_WITH_GENERIC (mulhsl, mulhsl_avx2);
REGISTER_RULE_WITH_GENERIC (cmpeqq, cmpeqq_avx2);
diff --git a/orc/orcx86insn.c b/orc/orcx86insn.c
index a3a8485..0ae3344 100644
--- a/orc/orcx86insn.c
+++ b/orc/orcx86insn.c
@@ -14,6 +14,9 @@
#include <orc/orcsse.h>
#include <orc/orcmmx.h>
+#include <orc/orcx86insn.h>
+
+
#define ORC_VEX_3_BIT 0xC4
#define ORC_VEX_2_BIT 0xC5
@@ -296,6 +299,7 @@ static const OrcSysOpcode orc_x86_opcodes[] = {
{ "pinsrd", ORC_X86_INSN_TYPE_IMM8_REGM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x22 },
{ "perm2i128", ORC_X86_INSN_TYPE_IMM8_MMXM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x46 },
{ "pblendd", ORC_X86_INSN_TYPE_IMM8_MMXM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x02 },
+ { "blendvpd", ORC_X86_INSN_TYPE_MMXM_MMX, ORC_VEX_W0 | ORC_VEX_ESCAPE_3A, ORC_VEX_SIMD_PREFIX_66, 0x4b },
};
static void
@@ -412,6 +416,7 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn)
char src_op[40] = { 0 };
char dst_op[40] = { 0 };
char src_2nd_op[40] = { 0 };
+ char src_3rd_op[40] = { 0 };
const OrcX86OpcodePrefix is_sse = get_common_reg_type(xinsn);
@@ -578,9 +583,7 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn)
const int operand2 = xinsn->src[0];
if (may_have_avx_operand && xinsn->src[1] != 0) {
- // TODO: implement and figure out more opcode types
switch (xinsn->opcode->type) {
- case ORC_X86_INSN_TYPE_MMXM_MMX:
case ORC_X86_INSN_TYPE_SSEM_SSE:
case ORC_X86_INSN_TYPE_IMM8_MMXM_MMX:
case ORC_X86_INSN_TYPE_IMM8_MMX_SHIFT:
@@ -594,6 +597,26 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn)
sprintf(src_2nd_op, "%%%s, ",
orc_x86_get_simd_regname (operand2, ORC_X86_AVX_VEX128_PREFIX));
break;
+ case ORC_X86_INSN_TYPE_MMXM_MMX:
+ // In all cases it can be either a pointer to, or a XMM/YMM register
+ // Intel Intrinsics Manual s.2.3.9
+ if (xinsn->type == ORC_X86_RM_REG) {
+ sprintf(src_2nd_op, "%%%s, ",
+ orc_x86_get_simd_regname (operand2, is_sse));
+ } else if (xinsn->type == ORC_X86_RM_MEMOFFSET) {
+ sprintf(src_2nd_op, "%d(%%%s), ", xinsn->offset,
+ orc_x86_get_regname_ptr (p, xinsn->src[2]));
+ } else if (xinsn->type == ORC_X86_RM_MEMINDEX) {
+ sprintf(src_2nd_op, "%d(%%%s,%%%s,%d), ", xinsn->offset,
+ orc_x86_get_regname_ptr (p, xinsn->src[2]),
+ orc_x86_get_regname_ptr (p, xinsn->index_reg),
+ 1 << xinsn->shift);
+ } else {
+ ORC_COMPILER_ERROR(p, "Unhandled instruction type %d for 4th operand", xinsn->type);
+ ORC_ASSERT(0);
+ return;
+ }
+ break;
case ORC_X86_INSN_TYPE_REGM_MMX:
case ORC_X86_INSN_TYPE_MMXM_MMX_REV:
case ORC_X86_INSN_TYPE_SSEM_SSE_REV:
@@ -623,6 +646,24 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn)
}
}
+ if (xinsn->src[2] != 0) {
+ switch (xinsn->opcode->type) {
+ case ORC_X86_INSN_TYPE_MMXM_MMX:
+ if (may_have_avx_operand) {
+ sprintf(src_3rd_op, "%%%s, ",
+ orc_x86_get_simd_regname (xinsn->src[2], is_sse));
+ } else {
+ ORC_COMPILER_ERROR(p, "Blends on SSE require XMM0 as the mask, this cannot be guaranteed");
+ ORC_ASSERT(0);
+ return;
+ }
+ break;
+ default:
+ ORC_COMPILER_ERROR(p, "Unhandled instruction type %d for 4th operand", xinsn->type);
+ ORC_ASSERT(0);
+ return;
+ }
+ }
// Handle destinations
switch (xinsn->opcode->type) {
@@ -721,13 +762,8 @@ orc_x86_insn_output_asm (OrcCompiler *p, OrcX86Insn *xinsn)
}
if (xinsn->prefix == ORC_X86_AVX_VEX128_PREFIX || xinsn->prefix == ORC_X86_AVX_VEX256_PREFIX) {
- if (src_2nd_op[0] != '\0') {
- ORC_ASM_CODE(p," v%s %s%s%s%s\n", xinsn->opcode->name,
- imm_str, src_op, src_2nd_op, dst_op);
- } else {
- ORC_ASM_CODE(p," v%s %s%s%s\n", xinsn->opcode->name,
- imm_str, src_op, dst_op);
- }
+ ORC_ASM_CODE(p," v%s %s%s%s%s%s\n", xinsn->opcode->name,
+ imm_str, src_op, src_2nd_op, src_3rd_op, dst_op);
} else {
ORC_ASM_CODE(p," %s %s%s%s\n", xinsn->opcode->name,
imm_str, src_op, dst_op);
@@ -1202,8 +1238,17 @@ static void
orc_vex_insn_output_immediate (OrcCompiler *const p, const OrcX86Insn *const xinsn)
{
switch ((OrcX86InsnType)xinsn->opcode->type) {
- case ORC_X86_INSN_TYPE_REGM_MMX:
case ORC_X86_INSN_TYPE_MMXM_MMX:
+ switch (xinsn->opcode_index) {
+ // Complete here with VPBLENDVB and VBLENDVPS when implemented
+ // Intel Intrinsics Manual s.2.3.9
+ case ORC_X86_blendvpd_avx:
+ *p->codeptr++ = (xinsn->src[2] & 0xF) << 4;
+ default:
+ break;
+ }
+ break;
+ case ORC_X86_INSN_TYPE_REGM_MMX:
case ORC_X86_INSN_TYPE_SSEM_AVX:
case ORC_X86_INSN_TYPE_MMXM_MMX_REV:
case ORC_X86_INSN_TYPE_SSEM_SSE_REV:
@@ -2014,6 +2059,7 @@ orc_vex_emit_cpuinsn_size (OrcCompiler *const p, const int index,
xinsn->prefix = prefix;
xinsn->src[0] = src0;
xinsn->src[1] = src1;
+ xinsn->src[2] = 0;
xinsn->dest = dest;
xinsn->type = ORC_X86_RM_REG;
xinsn->size = size;
@@ -2031,6 +2077,7 @@ orc_vex_emit_cpuinsn_imm (OrcCompiler *const p, const int index, const int imm,
xinsn->imm = imm;
xinsn->src[0] = src0;
xinsn->src[1] = src1;
+ xinsn->src[2] = 0;
xinsn->dest = dest;
xinsn->type = ORC_X86_RM_REG;
xinsn->size = 4;
@@ -2050,6 +2097,7 @@ orc_vex_emit_cpuinsn_load_memoffset (OrcCompiler *const p, const int index,
xinsn->imm = imm;
xinsn->src[0] = src0;
xinsn->src[1] = src1;
+ xinsn->src[2] = 0;
xinsn->dest = dest;
xinsn->type = ORC_X86_RM_MEMOFFSET;
xinsn->offset = offset;
@@ -2094,3 +2142,22 @@ orc_vex_emit_cpuinsn_load_memindex (OrcCompiler *const p, const int index, const
xinsn->shift = shift;
xinsn->size = size;
}
+
+void
+orc_vex_emit_blend_size (OrcCompiler *const p, const int index, // fills an output instruction record that carries a third source operand
+ const int size, const int src0, const int src1, const int src2, // src2 lands in src[2]; the blendvpd macros pass their mask register here
+ const int dest, const OrcX86OpcodePrefix prefix)
+{
+ OrcX86Insn *xinsn = orc_x86_get_output_insn (p); // instruction record obtained from the compiler, populated below
+ const OrcSysOpcode *const opcode = orc_x86_opcodes + index; // `index` selects the entry of the orc_x86_opcodes table
+
+ xinsn->opcode_index = index;
+ xinsn->opcode = opcode;
+ xinsn->prefix = prefix;
+ xinsn->src[0] = src0;
+ xinsn->src[1] = src1;
+ xinsn->src[2] = src2; // the other orc_vex_emit_cpuinsn_* bodies visible in this patch store 0 here
+ xinsn->dest = dest;
+ xinsn->type = ORC_X86_RM_REG; // always the register form; no memory-operand variant is set up by this function
+ xinsn->size = size;
+}
diff --git a/orc/orcx86insn.h b/orc/orcx86insn.h
index 20ae099..4db30db 100644
--- a/orc/orcx86insn.h
+++ b/orc/orcx86insn.h
@@ -302,6 +302,7 @@ typedef enum
ORC_X86_pinsrd,
ORC_X86_permute2i128_avx,
ORC_X86_pblendd_avx,
+ ORC_X86_blendvpd_avx,
} OrcX86Opcode;
typedef enum {
@@ -330,7 +331,7 @@ struct _OrcX86Insn {
// Immediate mode operand
int imm;
// Source register(s)/address
- int src[2];
+ int src[3];
// Destination
int dest;
// Operand size
@@ -367,6 +368,9 @@ ORC_API void orc_vex_emit_cpuinsn_store_memoffset (OrcCompiler *p, int index,
ORC_API void orc_vex_emit_cpuinsn_load_memindex (OrcCompiler *p, int index, int size,
int imm, int offset, int src, int src_index, int shift, int dest, OrcX86OpcodePrefix prefix);
+ORC_API void orc_vex_emit_blend_size (OrcCompiler *p, int opcode, int size, // `opcode` is named `index` in the definition in orcx86insn.c
+ int src0, int src1, int src2, int dest, OrcX86OpcodePrefix prefix); // src2 is the extra (third) source operand stored in src[2]
+
#define orc_sse_emit_punpcklbw(p,a,b) orc_x86_emit_cpuinsn_size(p, ORC_X86_punpcklbw, 16, a, b)
#define orc_avx_sse_emit_punpcklbw(p,s1,s2,d) orc_vex_emit_cpuinsn_size(p, ORC_X86_punpcklbw, 16, s1, s2, d, ORC_X86_AVX_VEX128_PREFIX)
#define orc_avx_emit_punpcklbw(p,s1,s2,d) orc_vex_emit_cpuinsn_size(p, ORC_X86_punpcklbw, 32, s1, s2, d, ORC_X86_AVX_VEX256_PREFIX)
@@ -993,6 +997,13 @@ ORC_API void orc_vex_emit_cpuinsn_load_memindex (OrcCompiler *p, int index, int
#define orc_avx_emit_blendpd(p,imm,s1,s2,d) orc_vex_emit_cpuinsn_imm(p, ORC_X86_blendpd_avx, imm, s1, s2, d, ORC_X86_AVX_VEX256_PREFIX)
#define orc_avx_emit_pblendd(p,imm,s1,s2,d) orc_vex_emit_cpuinsn_imm(p, ORC_X86_pblendd_avx, imm, s1, s2, d, ORC_X86_AVX_VEX256_PREFIX)
+#define orc_avx_sse_emit_blendvpd(p, s1, s2, mask, d) \
+ orc_vex_emit_blend_size (p, ORC_X86_blendvpd_avx, 1, s1, s2, mask, d, \
+ ORC_X86_AVX_VEX128_PREFIX) /* VEX128-prefixed blendvpd; `mask` is forwarded as src2 */
+#define orc_avx_emit_blendvpd(p, s1, s2, mask, d) \
+ orc_vex_emit_blend_size (p, ORC_X86_blendvpd_avx, 1, s1, s2, mask, d, \
+ ORC_X86_AVX_VEX256_PREFIX) /* VEX256-prefixed blendvpd; both forms pass 1 as the size argument */
+
#define orc_avx_sse_emit_pinsrd_register(p, imm, s1, s2, d) \
orc_vex_emit_cpuinsn_imm (p, ORC_X86_pinsrd, imm, s1, s2, d, \
ORC_X86_AVX_VEX128_PREFIX)
diff --git a/testsuite/orcc/test.orc b/testsuite/orcc/test.orc
index 371130a..a565739 100644
--- a/testsuite/orcc/test.orc
+++ b/testsuite/orcc/test.orc
@@ -940,3 +940,25 @@ convfl d1, s1
convlf d1, s1
+.function adder_orc_volume_u32
+.dest 4 d1 guint32
+.param 4 p1
+.const 4 c1 0x80000000
+.temp 8 t1
+.temp 4 t2
+
+xorl t2, d1, c1
+mulslq t1, t2, p1
+shrsq t1, t1, 27
+convsssql t2, t1
+xorl d1, t2, c1
+
+
+.function adder_orc_volume_s32
+.dest 4 d1 gint32
+.param 4 p1
+.temp 8 t1
+
+mulslq t1, d1, p1
+shrsq t1, t1, 27
+convsssql d1, t1