summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jorge Zapata <jorgeluis.zapata@gmail.com>, 2024-01-20 10:32:38 +0100
committer: Jorge Zapata <jorgeluis.zapata@gmail.com>, 2024-03-12 10:03:58 +0100
commit: 7e92b5304e2d0d240a5d3557ee7f2a65956ace07 (patch)
tree: 3beda1a7b004898befdefab2b9b920b00ceb3b56
parent: adac8626bb1c317103a07d95770b2c6fbf026353 (diff)
Initial migration of AVX target
Part-of: <https://gitlab.freedesktop.org/gstreamer/orc/-/merge_requests/148>
-rw-r--r-- orc/orccpu-x86.c | 3
-rw-r--r-- orc/orcprogram-avx.c | 1000
-rw-r--r-- orc/orcprogram-x86.c | 35
-rw-r--r-- orc/orcx86.h | 5
4 files changed, 203 insertions, 840 deletions
diff --git a/orc/orccpu-x86.c b/orc/orccpu-x86.c
index defa101..c7be3c1 100644
--- a/orc/orccpu-x86.c
+++ b/orc/orccpu-x86.c
@@ -585,6 +585,3 @@ orc_mmx_get_cpu_flags(void)
orc_x86_detect_cpuid ();
return orc_x86_mmx_flags;
}
-
-
-
diff --git a/orc/orcprogram-avx.c b/orc/orcprogram-avx.c
index 52bcf01..f6b4257 100644
--- a/orc/orcprogram-avx.c
+++ b/orc/orcprogram-avx.c
@@ -14,60 +14,9 @@
#include <orc/orcinternal.h>
#include <orc/orcprogram.h>
-#define SIZE 65536
-
-#define ORC_SSE_ALIGNED_DEST_CUTOFF 64
-
-static void orc_avx_emit_loop (OrcCompiler *compiler, int offset, int update);
-static void orc_compiler_avx_init (OrcCompiler *const compiler);
-static unsigned int orc_compiler_avx_get_default_flags (void);
-static void orc_compiler_avx_assemble (OrcCompiler *compiler);
-static void avx_load_constant (OrcCompiler *compiler, int reg, int size,
- int value);
-static void avx_load_constant_long (OrcCompiler *compiler, int reg,
- OrcConstant *constant);
-static const char *avx_get_flag_name (const int shift);
-
-void
-orc_avx_init (void)
-{
- // clang-format off
- static OrcTarget target = {
- "avx",
- #if defined(HAVE_I386) || defined(HAVE_AMD64)
- TRUE,
- #else
- FALSE,
- #endif
- ORC_VEC_REG_BASE,
- orc_compiler_avx_get_default_flags,
- orc_compiler_avx_init,
- orc_compiler_avx_assemble,
- { { 0 } },
- 0,
- NULL,
- avx_load_constant,
- avx_get_flag_name,
- NULL,
- avx_load_constant_long
- };
- // clang-format on
-
-#if defined(HAVE_I386) || defined(HAVE_AMD64)
- /* initializes cache information */
- const int flags = orc_sse_get_cpu_flags ();
-
- if (!(flags & ORC_TARGET_AVX_AVX) || !(flags & ORC_TARGET_AVX_AVX2))
- target.executable = FALSE;
-#endif
-
- orc_target_register (&target);
-
- orc_compiler_avx_register_rules (&target);
-}
-
+/* X86 specific */
static unsigned int
-orc_compiler_avx_get_default_flags (void)
+avx_get_default_flags (void)
{
unsigned int flags = 0;
@@ -111,208 +60,161 @@ avx_get_flag_name (const int shift)
return NULL;
}
-static void
-orc_compiler_avx_init (OrcCompiler *const compiler)
+static int
+avx_is_executable (void)
{
- int i;
+#if defined(HAVE_I386) || defined(HAVE_AMD64)
+ /* initializes cache information */
+ const int flags = orc_sse_get_cpu_flags ();
- if (compiler->target_flags & ORC_TARGET_SSE_64BIT) {
- compiler->is_64bit = TRUE;
- }
- if (compiler->target_flags & ORC_TARGET_SSE_FRAME_POINTER) {
- compiler->use_frame_pointer = TRUE;
- }
- if (!(compiler->target_flags & ORC_TARGET_SSE_SHORT_JUMPS)) {
- compiler->long_jumps = TRUE;
+ if ((flags & ORC_TARGET_AVX_AVX) && (flags & ORC_TARGET_AVX_AVX2)) {
+ return TRUE;
}
+#endif
+ return FALSE;
+}
- if (compiler->is_64bit) {
- for (i = ORC_GP_REG_BASE; i < ORC_GP_REG_BASE + 16; i++) {
- compiler->valid_regs[i] = 1;
- }
- compiler->valid_regs[X86_ESP] = 0;
+static void
+avx_validate_registers (int *regs, int is_64bit)
+{
+ int i;
+
+ if (is_64bit) {
for (i = 0; i < ORC_AVX_REG_AMOUNT; i++) {
- compiler->valid_regs[X86_YMM0 + i] = 1;
+ regs[X86_YMM0 + i] = 1;
}
+ } else {
+ for (i = 0; i < ORC_AVX_REG_AMOUNT - 8; i++) {
+ regs[X86_YMM0 + i] = 1;
+ }
+ }
+}
- compiler->save_regs[X86_EBX] = 1;
- compiler->save_regs[X86_EBP] = 1;
- compiler->save_regs[X86_R12] = 1;
- compiler->save_regs[X86_R13] = 1;
- compiler->save_regs[X86_R14] = 1;
- compiler->save_regs[X86_R15] = 1;
+static void
+avx_saveable_registers (int *regs, int is_64bit)
+{
#ifdef HAVE_OS_WIN32
- compiler->save_regs[X86_EDI] = 1;
- compiler->save_regs[X86_ESI] = 1;
- // When present, the upper portions of YMM0-YMM15 and ZMM0-ZMM15 are also
- // volatile
- // https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170#callercallee-saved-registers
+ if (is_64bit) {
+ int i;
for (i = 6; i < ORC_AVX_REG_AMOUNT; i++) {
- compiler->save_regs[X86_YMM0 + i] = 1;
+ regs[X86_YMM0 + i] = 1;
}
+ }
#endif
+}
+
+static int
+avx_is_64bit (int flags)
+{
+ if (flags & ORC_TARGET_SSE_64BIT) {
+ return TRUE;
} else {
- for (i = ORC_GP_REG_BASE; i < ORC_GP_REG_BASE + 8; i++) {
- compiler->valid_regs[i] = 1;
- }
- compiler->valid_regs[X86_ESP] = 0;
- if (compiler->use_frame_pointer) {
- compiler->valid_regs[X86_EBP] = 0;
- }
- for (i = 0; i < ORC_AVX_REG_AMOUNT - 8; i++) {
- compiler->valid_regs[X86_YMM0 + i] = 1;
- }
- compiler->save_regs[X86_EBX] = 1;
- compiler->save_regs[X86_EDI] = 1;
- compiler->save_regs[X86_EBP] = 1;
+ return FALSE;
}
- for (i = 0; i < 128; i++) {
- compiler->alloc_regs[i] = 0;
- compiler->used_regs[i] = 0;
+}
+
+static int
+avx_use_frame_pointer (int flags)
+{
+ if (flags & ORC_TARGET_SSE_FRAME_POINTER) {
+ return TRUE;
+ } else {
+ return FALSE;
}
+}
- if (compiler->is_64bit) {
-#ifdef HAVE_OS_WIN32
- compiler->exec_reg = X86_ECX;
- compiler->gp_tmpreg = X86_EDX;
-#else
- compiler->exec_reg = X86_EDI;
- compiler->gp_tmpreg = X86_ECX;
-#endif
+static int
+avx_use_long_jumps (int flags)
+{
+ if (!(flags & ORC_TARGET_SSE_SHORT_JUMPS)) {
+ return TRUE;
} else {
- compiler->gp_tmpreg = X86_ECX;
- if (compiler->use_frame_pointer) {
- compiler->exec_reg = X86_EBX;
- } else {
- compiler->exec_reg = X86_EBP;
- }
+ return FALSE;
}
- compiler->valid_regs[compiler->gp_tmpreg] = 0;
- compiler->valid_regs[compiler->exec_reg] = 0;
+}
- switch (compiler->max_var_size) {
+static int
+avx_loop_shift (int max_var_size)
+{
+ switch (max_var_size) {
case 1:
- compiler->loop_shift = 5;
- break;
+ return 5;
case 2:
- compiler->loop_shift = 4;
- break;
+ return 4;
case 4:
- compiler->loop_shift = 3;
- break;
+ return 3;
case 8:
- compiler->loop_shift = 2;
- break;
+ return 2;
default:
- ORC_ERROR ("unhandled max var size %d", compiler->max_var_size);
+ ORC_ERROR ("unhandled max var size %d", max_var_size);
break;
}
- /* This limit is arbitrary, but some large functions run slightly
- slower when unrolled (ginger Core2 6,15,6), and only some small
- functions run faster when unrolled. Most are the same speed. */
- /* Also don't enable unrolling with loop_shift == 0, this enables
- double reading in the hot loop. */
- if (compiler->n_insns <= 10 && compiler->loop_shift > 0) {
- compiler->unroll_shift = 1;
- }
- if (!compiler->long_jumps) {
- compiler->unroll_shift = 0;
- }
- compiler->alloc_loop_counter = TRUE;
- compiler->allow_gp_on_stack = TRUE;
-
- {
- for (i = 0; i < compiler->n_insns; i++) {
- OrcInstruction *insn = compiler->insns + i;
- OrcStaticOpcode *opcode = insn->opcode;
-
- if (strcmp (opcode->name, "ldreslinb") == 0
- || strcmp (opcode->name, "ldreslinl") == 0
- || strcmp (opcode->name, "ldresnearb") == 0
- || strcmp (opcode->name, "ldresnearl") == 0) {
- compiler->vars[insn->src_args[0]].need_offset_reg = TRUE;
- }
- }
- }
+ return -1;
}
-void
-avx_save_accumulators (OrcCompiler *compiler)
+static void
+avx_init_accumulator (OrcCompiler *compiler, OrcVariable *var)
{
- for (int i = 0; i < ORC_N_COMPILER_VARIABLES; i++) {
- OrcVariable *var = compiler->vars + i;
-
- if (var->name == NULL)
- continue;
- switch (var->vartype) {
- case ORC_VAR_TYPE_ACCUMULATOR:
- {
- const int src = var->alloc;
- const int tmp = orc_compiler_get_temp_reg (compiler);
-
- // duplicate the high lane
- orc_avx_emit_extractf128_si256 (compiler, 1, src, tmp);
-
- // Pairwise summation
- if (var->size == 2) {
- orc_avx_sse_emit_paddw (compiler, src, tmp, src);
- } else {
- orc_avx_sse_emit_paddd (compiler, src, tmp, src);
- }
-
- // Duplicate the high half now
- orc_avx_sse_emit_pshufd (compiler, ORC_AVX_SSE_SHUF (3, 2, 3, 2), src,
- tmp);
-
- // Pairwise summation
- if (var->size == 2) {
- orc_avx_sse_emit_paddw (compiler, src, tmp, src);
- } else {
- orc_avx_sse_emit_paddd (compiler, src, tmp, src);
- }
-
- // Combine the remaining two pairs in the low half
- orc_avx_sse_emit_pshufd (compiler, ORC_AVX_SSE_SHUF (1, 1, 1, 1), src,
- tmp);
-
- // Pairwise summation
- if (var->size == 2) {
- orc_avx_sse_emit_paddw (compiler, src, tmp, src);
- } else {
- orc_avx_sse_emit_paddd (compiler, src, tmp, src);
- }
-
- // Reduce the last pair if it's 16-bit
- if (var->size == 2) {
- orc_avx_sse_emit_pshuflw (compiler, ORC_AVX_SSE_SHUF (1, 1, 1, 1),
- src, tmp);
- orc_avx_sse_emit_paddw (compiler, src, tmp, src);
- }
-
- if (var->size == 2) {
- orc_avx_sse_emit_pextrw_memoffset (compiler, 0,
- (int)ORC_STRUCT_OFFSET (OrcExecutor,
- accumulators[i - ORC_VAR_A1]),
- src, compiler->exec_reg);
- } else {
- orc_x86_emit_mov_avx_memoffset (compiler, 4, src,
- (int)ORC_STRUCT_OFFSET (OrcExecutor,
- accumulators[i - ORC_VAR_A1]),
- compiler->exec_reg, var->is_aligned, var->is_uncached);
- }
- }
- break;
- default:
- break;
- }
- }
+ orc_avx_emit_pxor (compiler, var->alloc, var->alloc, var->alloc);
}
static void
-avx_load_constant (OrcCompiler *compiler, int reg, int size, int value)
+avx_reduce_accumulator (OrcCompiler *compiler, int i, OrcVariable *var)
{
- orc_avx_load_constant (compiler, reg, size, value);
+ const int src = var->alloc;
+ const int tmp = orc_compiler_get_temp_reg (compiler);
+
+ // duplicate the high lane
+ orc_avx_emit_extractf128_si256 (compiler, 1, src, tmp);
+
+ // Pairwise summation
+ if (var->size == 2) {
+ orc_avx_sse_emit_paddw (compiler, src, tmp, src);
+ } else {
+ orc_avx_sse_emit_paddd (compiler, src, tmp, src);
+ }
+
+ // Duplicate the high half now
+ orc_avx_sse_emit_pshufd (compiler, ORC_AVX_SSE_SHUF (3, 2, 3, 2), src,
+ tmp);
+
+ // Pairwise summation
+ if (var->size == 2) {
+ orc_avx_sse_emit_paddw (compiler, src, tmp, src);
+ } else {
+ orc_avx_sse_emit_paddd (compiler, src, tmp, src);
+ }
+
+ // Combine the remaining two pairs in the low half
+ orc_avx_sse_emit_pshufd (compiler, ORC_AVX_SSE_SHUF (1, 1, 1, 1), src,
+ tmp);
+
+ // Pairwise summation
+ if (var->size == 2) {
+ orc_avx_sse_emit_paddw (compiler, src, tmp, src);
+ } else {
+ orc_avx_sse_emit_paddd (compiler, src, tmp, src);
+ }
+
+ // Reduce the last pair if it's 16-bit
+ if (var->size == 2) {
+ orc_avx_sse_emit_pshuflw (compiler, ORC_AVX_SSE_SHUF (1, 1, 1, 1),
+ src, tmp);
+ orc_avx_sse_emit_paddw (compiler, src, tmp, src);
+ }
+
+ if (var->size == 2) {
+ orc_avx_sse_emit_pextrw_memoffset (compiler, 0,
+ (int)ORC_STRUCT_OFFSET (OrcExecutor,
+ accumulators[i - ORC_VAR_A1]),
+ src, compiler->exec_reg);
+ } else {
+ orc_x86_emit_mov_avx_memoffset (compiler, 4, src,
+ (int)ORC_STRUCT_OFFSET (OrcExecutor,
+ accumulators[i - ORC_VAR_A1]),
+ compiler->exec_reg, var->is_aligned, var->is_uncached);
+ }
}
void
@@ -419,6 +321,7 @@ orc_avx_load_constant (OrcCompiler *compiler, int reg, int size,
orc_avx_emit_broadcast (compiler, reg, reg, 4);
}
+
static void
avx_load_constant_long (OrcCompiler *compiler, int reg, OrcConstant *constant)
{
@@ -435,168 +338,21 @@ avx_load_constant_long (OrcCompiler *compiler, int reg, OrcConstant *constant)
orc_avx_emit_broadcast (compiler, reg, reg, 16);
}
-void
-avx_load_constants_outer (OrcCompiler *compiler)
-{
- for (int i = 0; i < ORC_N_COMPILER_VARIABLES; i++) {
- if (compiler->vars[i].name == NULL)
- continue;
- switch (compiler->vars[i].vartype) {
- case ORC_VAR_TYPE_ACCUMULATOR:
- orc_avx_emit_pxor (compiler, compiler->vars[i].alloc,
- compiler->vars[i].alloc, compiler->vars[i].alloc);
- case ORC_VAR_TYPE_CONST:
- case ORC_VAR_TYPE_PARAM:
- case ORC_VAR_TYPE_SRC:
- case ORC_VAR_TYPE_DEST:
- case ORC_VAR_TYPE_TEMP:
- break;
- default:
- orc_compiler_error (compiler, "bad vartype");
- break;
- }
- }
-
- orc_compiler_emit_invariants (compiler);
-
- /* FIXME move to a better place */
- for (int i = 0; i < compiler->n_constants; i++) {
- compiler->constants[i].alloc_reg = orc_compiler_get_constant_reg (compiler);
- }
-
- for (int i = 0; i < compiler->n_constants; i++) {
- if (compiler->constants[i].alloc_reg) {
- if (compiler->constants[i].is_long) {
- avx_load_constant_long (compiler, compiler->constants[i].alloc_reg,
- compiler->constants + i);
- } else {
- avx_load_constant (compiler, compiler->constants[i].alloc_reg, 4,
- compiler->constants[i].value);
- }
- }
- }
-
- {
- for (int i = 0; i < compiler->n_insns; i++) {
- OrcInstruction *insn = compiler->insns + i;
- OrcStaticOpcode *opcode = insn->opcode;
-
- if (strcmp (opcode->name, "ldreslinb") == 0
- || strcmp (opcode->name, "ldreslinl") == 0
- || strcmp (opcode->name, "ldresnearb") == 0
- || strcmp (opcode->name, "ldresnearl") == 0) {
- if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, params[insn->src_args[1]]),
- compiler->exec_reg, compiler->vars[insn->src_args[0]].ptr_offset);
- } else {
- orc_x86_emit_mov_imm_reg (compiler, 4,
- compiler->vars[insn->src_args[1]].value.i,
- compiler->vars[insn->src_args[0]].ptr_offset);
- }
- }
- }
- }
-}
-
-void
-avx_load_constants_inner (OrcCompiler *compiler)
+static void
+avx_move_register_to_memoffset (OrcCompiler *compiler, int size, int reg1, int offset, int reg2, int aligned, int uncached)
{
- for (int i = 0; i < ORC_N_COMPILER_VARIABLES; i++) {
- if (compiler->vars[i].name == NULL)
- continue;
- switch (compiler->vars[i].vartype) {
- case ORC_VAR_TYPE_SRC:
- case ORC_VAR_TYPE_DEST:
- if (compiler->vars[i].ptr_register) {
- orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, arrays[i]),
- compiler->exec_reg, compiler->vars[i].ptr_register);
- }
- break;
- case ORC_VAR_TYPE_CONST:
- case ORC_VAR_TYPE_PARAM:
- case ORC_VAR_TYPE_ACCUMULATOR:
- case ORC_VAR_TYPE_TEMP:
- break;
- default:
- orc_compiler_error (compiler, "bad vartype");
- break;
- }
- }
+ orc_x86_emit_mov_avx_memoffset (compiler, size, reg1, offset, reg2, aligned, uncached);
}
-void
-avx_add_strides (OrcCompiler *compiler)
-{
- for (int i = 0; i < ORC_N_COMPILER_VARIABLES; i++) {
- if (compiler->vars[i].name == NULL)
- continue;
- switch (compiler->vars[i].vartype) {
- case ORC_VAR_TYPE_SRC:
- case ORC_VAR_TYPE_DEST:
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, params[i]), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_add_reg_memoffset (compiler, compiler->is_64bit ? 8 : 4,
- compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, arrays[i]),
- compiler->exec_reg);
-
- if (compiler->vars[i].ptr_register == 0) {
- orc_compiler_error (compiler,
- "unimplemented: stride on pointer stored in memory");
- }
- break;
- case ORC_VAR_TYPE_CONST:
- case ORC_VAR_TYPE_PARAM:
- case ORC_VAR_TYPE_ACCUMULATOR:
- case ORC_VAR_TYPE_TEMP:
- break;
- default:
- orc_compiler_error (compiler, "bad vartype");
- break;
- }
- }
-}
-
-static int
-get_align_var (OrcCompiler *compiler)
+static void
+avx_move_memoffset_to_register (OrcCompiler *compiler, int size, int offset, int reg1, int reg2, int is_aligned)
{
- for (int i = ORC_VAR_D1; i <= ORC_VAR_S8; i++) {
- if (compiler->vars[i].size == 0)
- continue;
- if ((compiler->vars[i].size << compiler->loop_shift) >= 32) {
- return i;
- }
- }
- for (int i = ORC_VAR_D1; i <= ORC_VAR_S8; i++) {
- if (compiler->vars[i].size == 0)
- continue;
- if ((compiler->vars[i].size << compiler->loop_shift) >= 16) {
- return i;
- }
- }
- for (int i = ORC_VAR_D1; i <= ORC_VAR_S8; i++) {
- if (compiler->vars[i].size == 0)
- continue;
- if ((compiler->vars[i].size << compiler->loop_shift) >= 8) {
- return i;
- }
- }
- for (int i = ORC_VAR_D1; i <= ORC_VAR_S8; i++) {
- if (compiler->vars[i].size == 0)
- continue;
- return i;
- }
-
- orc_compiler_error (compiler, "could not find alignment variable");
+ orc_x86_emit_mov_memoffset_avx (compiler, size, offset, reg1, reg2, is_aligned);
- return -1;
}
static int
-get_shift (int size)
+avx_get_shift (int size)
{
switch (size) {
case 1:
@@ -618,159 +374,51 @@ get_shift (int size)
}
static void
-orc_emit_split_3_regions (OrcCompiler *compiler)
+avx_set_mxcsr (OrcCompiler *c)
{
- int align_var;
- int align_shift;
- int var_size_shift;
-
- align_var = get_align_var (compiler);
- if (align_var < 0)
- return;
- var_size_shift = get_shift (compiler->vars[align_var].size);
- align_shift = var_size_shift + compiler->loop_shift;
-
- /* determine how many iterations until align array is aligned (n1) */
- orc_x86_emit_mov_imm_reg (compiler, 4, 32, X86_EAX);
- // Get the address of the array in question
- // and eax <- eax - addressof(alignment variable)
- orc_x86_emit_sub_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, arrays[align_var]),
- compiler->exec_reg, X86_EAX);
- // How many bytes are needed for alignment? (mask wise)
- orc_x86_emit_and_imm_reg (compiler, 4, (1 << align_shift) - 1, X86_EAX);
- // Undo the shift to determine number of ELEMENTS
- orc_x86_emit_sar_imm_reg (compiler, 4, var_size_shift, X86_EAX);
-
- /* check if n1 is greater than n. */
- orc_x86_emit_cmp_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, n), compiler->exec_reg);
-
- orc_x86_emit_jle (compiler, 6);
-
- /* If so, we have a standard 3-region split. */
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter1), compiler->exec_reg);
-
- /* Calculate n2 */
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, n), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_sub_reg_reg (compiler, 4, X86_EAX, compiler->gp_tmpreg);
-
- orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
-
- orc_x86_emit_sar_imm_reg (compiler, 4,
- compiler->loop_shift + compiler->unroll_shift, compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter2), compiler->exec_reg);
-
- /* Calculate n3 */
- orc_x86_emit_and_imm_reg (compiler, 4,
- (1 << (compiler->loop_shift + compiler->unroll_shift)) - 1, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter3), compiler->exec_reg);
-
- orc_x86_emit_jmp (compiler, 7);
-
- /* else, iterations are all unaligned: n1=n, n2=0, n3=0 */
- orc_x86_emit_label (compiler, 6);
-
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, n), compiler->exec_reg, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter1), compiler->exec_reg);
- orc_x86_emit_mov_imm_reg (compiler, 4, 0, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter2), compiler->exec_reg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter3), compiler->exec_reg);
-
- orc_x86_emit_label (compiler, 7);
+ orc_avx_set_mxcsr (c);
}
static void
-orc_emit_split_2_regions (OrcCompiler *compiler)
+avx_restore_mxcsr(OrcCompiler *c)
{
- int align_var;
- int align_shift ORC_GNUC_UNUSED;
- int var_size_shift;
-
- align_var = get_align_var (compiler);
- if (align_var < 0)
- return;
- var_size_shift = get_shift (compiler->vars[align_var].size);
- align_shift = var_size_shift + compiler->loop_shift;
-
- /* Calculate n2 */
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, n), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
- orc_x86_emit_sar_imm_reg (compiler, 4,
- compiler->loop_shift + compiler->unroll_shift, compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter2), compiler->exec_reg);
-
- /* Calculate n3 */
- orc_x86_emit_and_imm_reg (compiler, 4,
- (1 << (compiler->loop_shift + compiler->unroll_shift)) - 1, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter3), compiler->exec_reg);
+ orc_avx_restore_mxcsr (c);
}
-#define LABEL_REGION1_SKIP 1
-#define LABEL_INNER_LOOP_START 2
-#define LABEL_REGION2_SKIP 3
-#define LABEL_OUTER_LOOP 4
-#define LABEL_OUTER_LOOP_SKIP 5
-// XXX: For AVX-512 onwards, check that this range doesn't overlap
-// with the region 1 labels (LABEL_STEP_UP)
-#define LABEL_STEP_DOWN(x) (8 + (x))
-#define LABEL_STEP_UP(x) (16 + (x))
-
-static void
-orc_compiler_avx_save_registers (OrcCompiler *compiler)
+void
+orc_avx_init (void)
{
- orc_uint16 saved = 0;
- for (orc_uint16 i = 0; i < ORC_AVX_REG_AMOUNT; ++i) {
- if (compiler->save_regs[X86_YMM0 + i] == 1) {
- ++saved;
- }
- }
- if (saved > 0) {
- orc_x86_emit_mov_imm_reg (compiler, 4, ORC_AVX_REG_SIZE * saved,
- compiler->gp_tmpreg);
- orc_x86_emit_sub_reg_reg (compiler, compiler->is_64bit ? 8 : 4,
- compiler->gp_tmpreg, X86_ESP);
- saved = 0;
- for (orc_uint16 i = 0; i < ORC_AVX_REG_AMOUNT; ++i) {
- if (compiler->save_regs[X86_YMM0 + i] == 1) {
- orc_x86_emit_mov_avx_memoffset (compiler, ORC_AVX_REG_SIZE,
- X86_YMM0 + i, saved * ORC_AVX_REG_SIZE, X86_ESP, FALSE, FALSE);
- ++saved;
- }
- }
- }
-}
+ // clang-format off
+ static OrcX86Target target = {
+ "avx",
+ avx_get_default_flags,
+ avx_get_flag_name,
+ avx_is_executable,
+ avx_validate_registers,
+ avx_saveable_registers,
+ avx_is_64bit,
+ avx_use_frame_pointer,
+ avx_use_long_jumps,
+ avx_loop_shift,
+ avx_init_accumulator,
+ avx_reduce_accumulator,
+ orc_avx_load_constant,
+ avx_load_constant_long,
+ avx_move_register_to_memoffset,
+ avx_move_memoffset_to_register,
+ avx_get_shift,
+ avx_set_mxcsr,
+ avx_restore_mxcsr,
+ ORC_AVX_REG_SIZE,
+ X86_YMM0,
+ ORC_AVX_REG_AMOUNT,
+ 16,
+ };
+ // clang-format on
+ OrcTarget *t;
-static void
-orc_compiler_avx_restore_registers (OrcCompiler *compiler)
-{
- orc_uint16 saved = 0;
- for (orc_uint16 i = 0; i < ORC_AVX_REG_AMOUNT; ++i) {
- if (compiler->save_regs[X86_YMM0 + i] == 1) {
- orc_x86_emit_mov_memoffset_avx (compiler, ORC_AVX_REG_SIZE,
- saved * ORC_AVX_REG_SIZE, X86_ESP, X86_YMM0 + i, FALSE);
- ++saved;
- }
- }
- if (saved > 0) {
- orc_x86_emit_mov_imm_reg (compiler, 4, ORC_AVX_REG_SIZE * saved,
- compiler->gp_tmpreg);
- orc_x86_emit_add_reg_reg (compiler, compiler->is_64bit ? 8 : 4,
- compiler->gp_tmpreg, X86_ESP);
- }
+ t = orc_x86_register_target (&target);
+ orc_compiler_avx_register_rules (t);
}
/*
@@ -778,7 +426,6 @@ orc_compiler_avx_restore_registers (OrcCompiler *compiler)
* and extended to allow for store reordering and the
* multi-operand VEX syntax.
*/
-
static int
uses_in_destination_register (const OrcCompiler *const compiler,
const OrcInstruction *const insn,
@@ -921,314 +568,3 @@ get_optimised_instruction_order (OrcCompiler *compiler)
return instruction_idx;
}
-static void
-orc_compiler_avx_assemble (OrcCompiler *compiler)
-{
- int set_mxcsr = FALSE;
-
- // Adjust alignment of variables -- AVX requires 32-byte
- for (int i = ORC_VAR_D1; i <= ORC_VAR_S8; i++) {
- if (compiler->vars[i].size == 0)
- continue;
- if (compiler->vars[i].alignment >= 32) {
- compiler->vars[i].is_aligned = TRUE;
- } else {
- compiler->vars[i].is_aligned = FALSE;
- }
- }
-
- const int align_var = get_align_var (compiler);
- if (align_var < 0) {
- orc_x86_assemble_copy (compiler);
- return;
- }
- const int is_aligned = compiler->vars[align_var].is_aligned;
-
- {
- orc_avx_emit_loop (compiler, 0, 0);
-
- compiler->codeptr = compiler->code;
- free (compiler->asm_code);
- compiler->asm_code = NULL;
- compiler->asm_code_len = 0;
- memset (compiler->labels, 0, sizeof (compiler->labels));
- memset (compiler->labels_int, 0, sizeof (compiler->labels_int));
- compiler->n_fixups = 0;
- compiler->n_output_insns = 0;
- }
-
- if (compiler->error)
- return;
-
- orc_x86_emit_prologue (compiler);
-
- orc_compiler_avx_save_registers (compiler);
-
- if (orc_program_has_float (compiler)) {
- set_mxcsr = TRUE;
- orc_avx_set_mxcsr (compiler);
- }
-
- avx_load_constants_outer (compiler);
-
- if (compiler->program->is_2d) {
- if (compiler->program->constant_m > 0) {
- orc_x86_emit_mov_imm_reg (compiler, 4, compiler->program->constant_m,
- X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, params[ORC_VAR_A2]),
- compiler->exec_reg);
- } else {
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, params[ORC_VAR_A1]),
- compiler->exec_reg, X86_EAX);
- orc_x86_emit_test_reg_reg (compiler, 4, X86_EAX, X86_EAX);
- orc_x86_emit_jle (compiler, LABEL_OUTER_LOOP_SKIP);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, params[ORC_VAR_A2]),
- compiler->exec_reg);
- }
-
- orc_x86_emit_label (compiler, LABEL_OUTER_LOOP);
- }
-
- if (compiler->program->constant_n > 0
- && compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) {
- /* don't need to load n */
- } else if (compiler->loop_shift > 0) {
- if (compiler->has_iterator_opcode || is_aligned) {
- orc_emit_split_2_regions (compiler);
- } else {
- /* split n into three regions, with center region being aligned */
- orc_emit_split_3_regions (compiler);
- }
- } else {
- /* loop shift is 0, no need to split */
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, n), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter2), compiler->exec_reg);
- }
-
- avx_load_constants_inner (compiler);
-
- if (compiler->program->constant_n > 0
- && compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) {
- int n_left = compiler->program->constant_n;
- int save_loop_shift;
- int loop_shift;
-
- compiler->offset = 0;
-
- save_loop_shift = compiler->loop_shift;
- while (n_left >= (1 << compiler->loop_shift)) {
- ORC_ASM_CODE (compiler, "# AVX LOOP SHIFT %d\n", compiler->loop_shift);
- orc_avx_emit_loop (compiler, compiler->offset, 0);
-
- n_left -= 1 << compiler->loop_shift;
- compiler->offset += 1 << compiler->loop_shift;
- }
- for (loop_shift = compiler->loop_shift - 1; loop_shift >= 0; loop_shift--) {
- if (n_left >= (1 << loop_shift)) {
- compiler->loop_shift = loop_shift;
- ORC_ASM_CODE (compiler, "# AVX LOOP SHIFT %d\n", loop_shift);
- orc_avx_emit_loop (compiler, compiler->offset, 0);
- n_left -= 1 << loop_shift;
- compiler->offset += 1 << loop_shift;
- }
- }
- compiler->loop_shift = save_loop_shift;
-
- } else {
- int ui, ui_max;
- int emit_region1 = TRUE;
- int emit_region3 = TRUE;
-
- if (compiler->has_iterator_opcode || is_aligned) {
- emit_region1 = FALSE;
- }
- if (compiler->loop_shift == 0) {
- emit_region1 = FALSE;
- emit_region3 = FALSE;
- }
-
- if (emit_region1) {
- int save_loop_shift;
- int l;
-
- save_loop_shift = compiler->loop_shift;
- compiler->vars[align_var].is_aligned = FALSE;
-
- for (l = 0; l < save_loop_shift; l++) {
- compiler->loop_shift = l;
- ORC_ASM_CODE (compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
-
- orc_x86_emit_test_imm_memoffset (compiler, 4, 1 << compiler->loop_shift,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter1), compiler->exec_reg);
- orc_x86_emit_je (compiler, LABEL_STEP_UP (compiler->loop_shift));
- orc_avx_emit_loop (compiler, 0, 1 << compiler->loop_shift);
- orc_x86_emit_label (compiler, LABEL_STEP_UP (compiler->loop_shift));
- }
-
- compiler->loop_shift = save_loop_shift;
- compiler->vars[align_var].is_aligned = TRUE;
- }
-
- orc_x86_emit_label (compiler, LABEL_REGION1_SKIP);
-
- orc_x86_emit_cmp_imm_memoffset (compiler, 4, 0,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter2), compiler->exec_reg);
- orc_x86_emit_je (compiler, LABEL_REGION2_SKIP);
-
- if (compiler->loop_counter != ORC_REG_INVALID) {
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter2), compiler->exec_reg,
- compiler->loop_counter);
- }
-
- ORC_ASM_CODE (compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
- // Instruction fetch windows are 16-byte aligned
- // https://easyperf.net/blog/2018/01/18/Code_alignment_issues
- orc_x86_emit_align (compiler, 4);
- orc_x86_emit_label (compiler, LABEL_INNER_LOOP_START);
- ui_max = 1 << compiler->unroll_shift;
- for (ui = 0; ui < ui_max; ui++) {
- compiler->offset = ui << compiler->loop_shift;
- orc_avx_emit_loop (compiler, compiler->offset,
- (ui == ui_max - 1)
- << (compiler->loop_shift + compiler->unroll_shift));
- }
- compiler->offset = 0;
- if (compiler->loop_counter != ORC_REG_INVALID) {
- orc_x86_emit_add_imm_reg (compiler, 4, -1, compiler->loop_counter, TRUE);
- } else {
- orc_x86_emit_dec_memoffset (compiler, 4,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter2), compiler->exec_reg);
- }
- orc_x86_emit_jne (compiler, LABEL_INNER_LOOP_START);
- orc_x86_emit_label (compiler, LABEL_REGION2_SKIP);
-
- if (emit_region3) {
- int save_loop_shift;
- int l;
-
- save_loop_shift = compiler->loop_shift + compiler->unroll_shift;
- compiler->vars[align_var].is_aligned = FALSE;
-
- for (l = save_loop_shift - 1; l >= 0; l--) {
- compiler->loop_shift = l;
- ORC_ASM_CODE (compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
-
- orc_x86_emit_test_imm_memoffset (compiler, 4, 1 << compiler->loop_shift,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, counter3), compiler->exec_reg);
- orc_x86_emit_je (compiler, LABEL_STEP_DOWN (compiler->loop_shift));
- orc_avx_emit_loop (compiler, 0, 1 << compiler->loop_shift);
- orc_x86_emit_label (compiler, LABEL_STEP_DOWN (compiler->loop_shift));
- }
-
- compiler->loop_shift = save_loop_shift;
- }
- }
-
- if (compiler->program->is_2d && compiler->program->constant_m != 1) {
- avx_add_strides (compiler);
-
- orc_x86_emit_add_imm_memoffset (compiler, 4, -1,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, params[ORC_VAR_A2]),
- compiler->exec_reg);
- orc_x86_emit_jne (compiler, LABEL_OUTER_LOOP);
- orc_x86_emit_label (compiler, LABEL_OUTER_LOOP_SKIP);
- }
-
- avx_save_accumulators (compiler);
-
- if (set_mxcsr) {
- orc_avx_restore_mxcsr (compiler);
- }
-
- orc_compiler_avx_restore_registers (compiler);
-
- orc_x86_emit_epilogue (compiler);
-
- orc_x86_calculate_offsets (compiler);
- orc_x86_output_insns (compiler);
-
- orc_x86_do_fixups (compiler);
-}
-
-static void
-orc_avx_emit_loop (OrcCompiler *compiler, int offset, int update)
-{
- int j;
- int k;
- OrcInstruction *insn;
- OrcStaticOpcode *opcode;
- OrcRule *rule;
-
- int *const insn_idx = get_optimised_instruction_order (compiler);
-
- for (j = 0; j < compiler->n_insns; j++) {
- insn = compiler->insns + insn_idx[j];
- opcode = insn->opcode;
-
- compiler->insn_index = j;
-
- if (insn->flags & ORC_INSN_FLAG_INVARIANT)
- continue;
-
- ORC_ASM_CODE (compiler, "# %d: %s\n", j, insn->opcode->name);
-
- compiler->min_temp_reg = ORC_VEC_REG_BASE;
-
- compiler->insn_shift = compiler->loop_shift;
- if (insn->flags & ORC_INSTRUCTION_FLAG_X2) {
- compiler->insn_shift += 1;
- }
- if (insn->flags & ORC_INSTRUCTION_FLAG_X4) {
- compiler->insn_shift += 2;
- }
-
- rule = insn->rule;
- if (rule && rule->emit) {
- rule->emit (compiler, rule->emit_user, insn);
- } else {
- orc_compiler_error (compiler, "no code generation rule for %s",
- opcode->name);
- }
- }
-
- if (update) {
- for (k = 0; k < ORC_N_COMPILER_VARIABLES; k++) {
- OrcVariable *var = compiler->vars + k;
-
- if (var->name == NULL)
- continue;
- if (var->vartype == ORC_VAR_TYPE_SRC
- || var->vartype == ORC_VAR_TYPE_DEST) {
- int offset;
- if (var->update_type == 0) {
- offset = 0;
- } else if (var->update_type == 1) {
- offset = (var->size * update) >> 1;
- } else {
- offset = var->size * update;
- }
-
- if (offset != 0) {
- if (compiler->vars[k].ptr_register) {
- orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4,
- offset, compiler->vars[k].ptr_register, FALSE);
- } else {
- orc_x86_emit_add_imm_memoffset (compiler,
- compiler->is_64bit ? 8 : 4, offset,
- (int)ORC_STRUCT_OFFSET (OrcExecutor, arrays[k]),
- compiler->exec_reg);
- }
- }
- }
- }
- }
-
- free (insn_idx);
-}
diff --git a/orc/orcprogram-x86.c b/orc/orcprogram-x86.c
index a9f1965..64fc7a0 100644
--- a/orc/orcprogram-x86.c
+++ b/orc/orcprogram-x86.c
@@ -186,7 +186,7 @@ orc_x86_save_accumulators (OrcX86Target *t, OrcCompiler *c)
if (var->vartype != ORC_VAR_TYPE_ACCUMULATOR)
continue;
- t->reduce_accumulator (c, var);
+ t->reduce_accumulator (c, i, var);
}
}
@@ -623,6 +623,26 @@ orc_x86_restore_mxcsr (OrcX86Target *t, OrcCompiler *c)
t->restore_mxcsr (c);
}
+
+static void
+orc_x86_adjust_alignment (OrcX86Target *t, OrcCompiler *compiler)
+{
+ int i;
+
+ /* Adjust alignment of variables
+ * We only care of array vars, as those require memory access
+ */
+ for (i = ORC_VAR_D1; i <= ORC_VAR_S8; i++) {
+ if (compiler->vars[i].size == 0)
+ continue;
+ if ((compiler->vars[i].alignment % t->register_size) == 0) {
+ compiler->vars[i].is_aligned = TRUE;
+ } else {
+ compiler->vars[i].is_aligned = FALSE;
+ }
+ }
+}
+
static void
orc_x86_compile (OrcCompiler *compiler)
{
@@ -637,8 +657,11 @@ orc_x86_compile (OrcCompiler *compiler)
orc_x86_assemble_copy (compiler);
return;
}
- is_aligned = compiler->vars[align_var].is_aligned;
+ /* Align the compiler variables */
+ orc_x86_adjust_alignment (t, compiler);
+
+ is_aligned = compiler->vars[align_var].is_aligned;
{
orc_x86_emit_loop (compiler, 0, 0);
@@ -852,15 +875,16 @@ orc_x86_compile (OrcCompiler *compiler)
orc_x86_do_fixups (compiler);
}
-void
+OrcTarget *
orc_x86_register_target (OrcX86Target *x86t)
{
OrcTarget *t;
+ /* FIXME this needs to be freed */
t = calloc (1, sizeof(OrcTarget));
t->name = x86t->name;
#if defined(HAVE_I386) || defined(HAVE_AMD64)
- t->executable = TRUE;
+ t->executable = x86t->is_executable ();
#else
t->executable = FALSE;
#endif
@@ -871,5 +895,8 @@ orc_x86_register_target (OrcX86Target *x86t)
t->load_constant = orc_x86_load_constant;
t->get_flag_name = x86t->get_flag_name;
t->load_constant_long = x86t->load_constant_long;
+ t->target_data = x86t;
orc_target_register (t);
+
+ return t;
}
diff --git a/orc/orcx86.h b/orc/orcx86.h
index 762af96..62b4743 100644
--- a/orc/orcx86.h
+++ b/orc/orcx86.h
@@ -15,6 +15,7 @@ typedef struct _OrcX86Target
const char *name;
unsigned int (*get_default_flags)(void);
const char * (*get_flag_name)(int shift);
+ int (*is_executable)(void);
/* X86 specific */
void (*validate_registers)(int *regs, int is_64bit);
@@ -24,7 +25,7 @@ typedef struct _OrcX86Target
int (*use_long_jumps)(int flags);
int (*loop_shift)(int max_var_size);
void (*init_accumulator)(OrcCompiler *c, OrcVariable *var);
- void (*reduce_accumulator)(OrcCompiler *c, OrcVariable *var);
+ void (*reduce_accumulator)(OrcCompiler *c, int i, OrcVariable *var);
void (*load_constant)(OrcCompiler *c, int reg, int size, orc_uint64 value);
void (*load_constant_long)(OrcCompiler *c, int reg, OrcConstant *constant);
void (*move_register_to_memoffset)(OrcCompiler *compiler, int size, int reg1, int offset, int reg2, int aligned, int uncached);
@@ -219,6 +220,8 @@ ORC_API void orc_x86_emit_cpuinsn_label (OrcCompiler *p, int index, int label);
ORC_API void orc_x86_emit_cpuinsn_none (OrcCompiler *p, int index);
ORC_API void orc_x86_emit_cpuinsn_align (OrcCompiler *p, int index, int align_shift);
+ORC_API OrcTarget * orc_x86_register_target (OrcX86Target *x86t);
+
#endif
ORC_END_DECLS