summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJorge Zapata <jorgeluis.zapata@gmail.com>2024-01-21 19:52:41 +0100
committerJorge Zapata <jorgeluis.zapata@gmail.com>2024-03-12 10:03:58 +0100
commitf3c13218a8f5a8be28f0f0ea1d3a761c854673f4 (patch)
tree2be64c14e7d299270a5d563ead3ae9a24ebb90a6
parent45c2be4f3a964148f2fa25ded3c37b4dedef08d8 (diff)
Initial migration of SSE target
Part-of: <https://gitlab.freedesktop.org/gstreamer/orc/-/merge_requests/148>
-rw-r--r--orc/orcprogram-sse.c961
1 files changed, 124 insertions, 837 deletions
diff --git a/orc/orcprogram-sse.c b/orc/orcprogram-sse.c
index e7ee77d..2d89cad 100644
--- a/orc/orcprogram-sse.c
+++ b/orc/orcprogram-sse.c
@@ -1,4 +1,3 @@
-
#include "config.h"
#include <stdio.h>
@@ -14,79 +13,14 @@
#include <orc/orcdebug.h>
#include <orc/orcinternal.h>
-#undef MMX
-#ifdef MMX
-# define ORC_REG_SIZE 8
-#else
-# define ORC_REG_SIZE 16
-#endif
-#define SIZE 65536
-
-#define ORC_SSE_ALIGNED_DEST_CUTOFF 64
-
-static void orc_sse_emit_loop (OrcCompiler *compiler, int offset, int update);
-
-void orc_compiler_sse_register_rules (OrcTarget *target);
-static void orc_compiler_sse_init (OrcCompiler *compiler);
-static unsigned int orc_compiler_sse_get_default_flags (void);
-static void orc_compiler_sse_assemble (OrcCompiler *compiler);
-
-void sse_load_constant (OrcCompiler *compiler, int reg, int size, int value);
-void sse_load_constant_long (OrcCompiler *compiler, int reg,
- OrcConstant *constant);
-static const char * sse_get_flag_name (int shift);
-
-static OrcTarget sse_target = {
- "sse",
-#if defined(HAVE_I386) || defined(HAVE_AMD64)
- TRUE,
-#else
- FALSE,
-#endif
- ORC_VEC_REG_BASE,
- orc_compiler_sse_get_default_flags,
- orc_compiler_sse_init,
- orc_compiler_sse_assemble,
- { { 0 } },
- 0,
- NULL,
- sse_load_constant,
- sse_get_flag_name,
- NULL,
- sse_load_constant_long
-};
-
-
extern int orc_x86_sse_flags;
-extern int orc_x86_mmx_flags;
-
-void
-orc_sse_init (void)
-{
-#if defined(HAVE_AMD64) || defined(HAVE_I386)
- /* initializes cache information */
- orc_sse_get_cpu_flags ();
-#endif
-
-#if defined(HAVE_I386)
-#ifndef MMX
- if (!(orc_x86_sse_flags & ORC_TARGET_SSE_SSE2)) {
- sse_target.executable = FALSE;
- }
-#else
- if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMX)) {
- mmx_target.executable = FALSE;
- }
-#endif
-#endif
-
- orc_target_register (&sse_target);
- orc_compiler_sse_register_rules (&sse_target);
-}
+/* TODO To be placed in a common header for private stuff */
+void orc_compiler_sse_register_rules (OrcTarget *target);
+/* X86 specific */
static unsigned int
-orc_compiler_sse_get_default_flags (void)
+sse_get_default_flags (void)
{
unsigned int flags = 0;
@@ -98,20 +32,11 @@ orc_compiler_sse_get_default_flags (void)
}
#if defined(HAVE_AMD64) || defined(HAVE_I386)
-#ifndef MMX
flags |= orc_x86_sse_flags;
#else
- flags |= orc_x86_mmx_flags;
-#endif
-#else
-#ifndef MMX
flags |= ORC_TARGET_SSE_SSE2;
flags |= ORC_TARGET_SSE_SSE3;
flags |= ORC_TARGET_SSE_SSSE3;
-#else
- flags |= ORC_TARGET_MMX_MMX;
- flags |= ORC_TARGET_MMX_3DNOW;
-#endif
#endif
return flags;
@@ -121,13 +46,8 @@ static const char *
sse_get_flag_name (int shift)
{
static const char *flags[] = {
-#ifndef MMX
"sse2", "sse3", "ssse3", "sse41", "sse42", "sse4a", "sse5",
"frame_pointer", "short_jumps", "64bit"
-#else
- "mmx", "mmxext", "3dnow", "3dnowext", "ssse3", "sse41", "",
- "frame_pointer", "short_jumps", "64bit"
-#endif
};
if (shift >= 0 && shift < sizeof(flags)/sizeof(flags[0])) {
@@ -137,163 +57,117 @@ sse_get_flag_name (int shift)
return NULL;
}
-static void
-orc_compiler_sse_init (OrcCompiler *compiler)
+/* Reports whether SSE code generated by this target can run here.
+ *
+ * On x86/amd64 builds, orc_sse_get_cpu_flags () is called for its side
+ * effect and the result is TRUE only when ORC_TARGET_SSE_SSE2 is set in
+ * orc_x86_sse_flags. On every other build the function returns FALSE.
+ *
+ * NOTE(review): the call is presumably what fills orc_x86_sse_flags;
+ * confirm against the CPU detection code.
+ */
+static int
+sse_is_executable (void)
{
- int i;
+#if defined(HAVE_AMD64) || defined(HAVE_I386)
+ /* initializes cache information */
+ orc_sse_get_cpu_flags ();
- if (compiler->target_flags & ORC_TARGET_SSE_64BIT) {
- compiler->is_64bit = TRUE;
- }
- if (compiler->target_flags & ORC_TARGET_SSE_FRAME_POINTER) {
- compiler->use_frame_pointer = TRUE;
- }
- if (!(compiler->target_flags & ORC_TARGET_SSE_SHORT_JUMPS)) {
- compiler->long_jumps = TRUE;
+ if (orc_x86_sse_flags & ORC_TARGET_SSE_SSE2) {
+ return TRUE;
}
-
+#endif
+ return FALSE;
+}
- if (compiler->is_64bit) {
- for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){
- compiler->valid_regs[i] = 1;
+/* Marks the XMM registers this target may allocate.
+ *
+ * regs is indexed by register id. Entries X86_XMM0 .. X86_XMM0 + 15 are
+ * set to 1 when is_64bit is non-zero, otherwise only
+ * X86_XMM0 .. X86_XMM0 + 7. Other entries are left untouched.
+ */
+static void
+sse_validate_registers (int *regs, int is_64bit)
+{
+ int i;
+
+ if (is_64bit) {
+ for(i = X86_XMM0;i < X86_XMM0 + 16; i++){
+ regs[i] = 1;
}
- compiler->valid_regs[X86_ESP] = 0;
- for(i=X86_XMM0;i<X86_XMM0+ORC_REG_SIZE;i++){
- compiler->valid_regs[i] = 1;
+ } else {
+ for(i = X86_XMM0; i < X86_XMM0 + 8; i++){
+ regs[i] = 1;
}
- compiler->save_regs[X86_EBX] = 1;
- compiler->save_regs[X86_EBP] = 1;
- compiler->save_regs[X86_R12] = 1;
- compiler->save_regs[X86_R13] = 1;
- compiler->save_regs[X86_R14] = 1;
- compiler->save_regs[X86_R15] = 1;
+ }
+}
+
+/* Marks the XMM registers that generated code must save and restore.
+ *
+ * Only Windows builds (HAVE_OS_WIN32) in 64-bit mode mark anything:
+ * entries X86_XMM0 + 6 .. X86_XMM0 + 15 of regs are set to 1 (the
+ * Microsoft x64 calling convention treats XMM6-XMM15 as nonvolatile).
+ * In every other configuration regs is left untouched.
+ */
+static void
+sse_saveable_registers (int *regs, int is_64bit)
+{
#ifdef HAVE_OS_WIN32
- compiler->save_regs[X86_EDI] = 1;
- compiler->save_regs[X86_ESI] = 1;
- for(i=X86_XMM0+6;i<X86_XMM0+ORC_REG_SIZE;i++){
- compiler->save_regs[i] = 1;
+ if (is_64bit) {
+ int i;
+ for(i = X86_XMM0 + 6; i < X86_XMM0 + 16; i++){
+ regs[i] = 1;
}
+ }
#endif
+}
+
+/* Returns TRUE when the ORC_TARGET_SSE_64BIT bit is set in the target
+ * flags, FALSE otherwise. */
+static int
+sse_is_64bit (int flags)
+{
+ if (flags & ORC_TARGET_SSE_64BIT) {
+ return TRUE;
} else {
- for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+8;i++){
- compiler->valid_regs[i] = 1;
- }
- compiler->valid_regs[X86_ESP] = 0;
- if (compiler->use_frame_pointer) {
- compiler->valid_regs[X86_EBP] = 0;
- }
- for(i=X86_XMM0;i<X86_XMM0+8;i++){
- compiler->valid_regs[i] = 1;
- }
- compiler->save_regs[X86_EBX] = 1;
- compiler->save_regs[X86_EDI] = 1;
- compiler->save_regs[X86_EBP] = 1;
+ return FALSE;
}
- for(i=0;i<128;i++){
- compiler->alloc_regs[i] = 0;
- compiler->used_regs[i] = 0;
+}
+
+/* Returns TRUE when the ORC_TARGET_SSE_FRAME_POINTER bit is set in the
+ * target flags, FALSE otherwise. */
+static int
+sse_use_frame_pointer (int flags)
+{
+ if (flags & ORC_TARGET_SSE_FRAME_POINTER) {
+ return TRUE;
+ } else {
+ return FALSE;
}
+}
- if (compiler->is_64bit) {
-#ifdef HAVE_OS_WIN32
- compiler->exec_reg = X86_ECX;
- compiler->gp_tmpreg = X86_EDX;
-#else
- compiler->exec_reg = X86_EDI;
- compiler->gp_tmpreg = X86_ECX;
-#endif
+/* Returns TRUE unless the ORC_TARGET_SSE_SHORT_JUMPS bit is set in the
+ * target flags, i.e. long jumps are the default. */
+static int
+sse_use_long_jumps (int flags)
+{
+ if (!(flags & ORC_TARGET_SSE_SHORT_JUMPS)) {
+ return TRUE;
} else {
- compiler->gp_tmpreg = X86_ECX;
- if (compiler->use_frame_pointer) {
- compiler->exec_reg = X86_EBX;
- } else {
- compiler->exec_reg = X86_EBP;
- }
+ return FALSE;
}
- compiler->valid_regs[compiler->gp_tmpreg] = 0;
- compiler->valid_regs[compiler->exec_reg] = 0;
+}
- switch (compiler->max_var_size) {
+/* Maps the largest variable size in bytes to the loop shift.
+ *
+ * Sizes 1, 2, 4 and 8 give 4, 3, 2 and 1 respectively, which equals
+ * log2 (16 / max_var_size). Any other size is reported through
+ * ORC_ERROR and yields -1.
+ */
+static int
+sse_loop_shift (int max_var_size)
+{
+ switch (max_var_size) {
case 1:
- compiler->loop_shift = 4;
- break;
+ return 4;
case 2:
- compiler->loop_shift = 3;
- break;
+ return 3;
case 4:
- compiler->loop_shift = 2;
- break;
+ return 2;
case 8:
- compiler->loop_shift = 1;
- break;
+ return 1;
default:
- ORC_ERROR("unhandled max var size %d", compiler->max_var_size);
+ ORC_ERROR ("unhandled max var size %d", max_var_size);
break;
}
-#ifdef MMX
- compiler->loop_shift--;
-#endif
- /* This limit is arbitrary, but some large functions run slightly
- slower when unrolled (ginger Core2 6,15,6), and only some small
- functions run faster when unrolled. Most are the same speed. */
- if (compiler->n_insns <= 10) {
- compiler->unroll_shift = 1;
- }
- if (!compiler->long_jumps) {
- compiler->unroll_shift = 0;
- }
- if (compiler->loop_shift == 0) {
- /* FIXME something is broken with loop_shift=0, unroll_shift=1 */
- compiler->unroll_shift = 0;
- }
- compiler->alloc_loop_counter = TRUE;
- compiler->allow_gp_on_stack = TRUE;
-
- {
- for(i=0;i<compiler->n_insns;i++){
- OrcInstruction *insn = compiler->insns + i;
- OrcStaticOpcode *opcode = insn->opcode;
-
- if (strcmp (opcode->name, "ldreslinb") == 0 ||
- strcmp (opcode->name, "ldreslinl") == 0 ||
- strcmp (opcode->name, "ldresnearb") == 0 ||
- strcmp (opcode->name, "ldresnearl") == 0) {
- compiler->vars[insn->src_args[0]].need_offset_reg = TRUE;
- }
- }
- }
+ return -1;
}
-void
-sse_save_accumulators (OrcCompiler *compiler)
+/* Emits a pxor of the accumulator's allocated register (var->alloc)
+ * with itself, which clears that register. */
+static void
+sse_init_accumulator (OrcCompiler *compiler, OrcVariable *var)
{
- int i;
- int src;
- int tmp;
-
- for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
- OrcVariable *var = compiler->vars + i;
+ orc_sse_emit_pxor (compiler, var->alloc, var->alloc);
+}
- if (var->name == NULL) continue;
- switch (var->vartype) {
- case ORC_VAR_TYPE_ACCUMULATOR:
- src = var->alloc;
- tmp = orc_compiler_get_temp_reg (compiler);
+static void
+sse_reduce_accumulator (OrcCompiler *compiler, int i, OrcVariable *var) {
+ const int src = var->alloc;
+ const int tmp = orc_compiler_get_temp_reg (compiler);
-#ifndef MMX
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(3,2,3,2), src, tmp);
-#else
- orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp);
-#endif
-
if (var->size == 2) {
orc_sse_emit_paddw (compiler, tmp, src);
} else {
orc_sse_emit_paddd (compiler, tmp, src);
}
-#ifndef MMX
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,1,1,1), src, tmp);
if (var->size == 2) {
@@ -301,15 +175,9 @@ sse_save_accumulators (OrcCompiler *compiler)
} else {
orc_sse_emit_paddd (compiler, tmp, src);
}
-#endif
if (var->size == 2) {
-#ifndef MMX
orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(1,1,1,1), src, tmp);
-#else
- orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp);
-#endif
-
orc_sse_emit_paddw (compiler, tmp, src);
}
@@ -325,19 +193,8 @@ sse_save_accumulators (OrcCompiler *compiler)
compiler->exec_reg,
var->is_aligned, var->is_uncached);
}
-
- break;
- default:
- break;
- }
- }
}
-void
-sse_load_constant (OrcCompiler *compiler, int reg, int size, int value)
-{
- orc_sse_load_constant (compiler, reg, size, value);
-}
void
orc_sse_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 value)
@@ -360,9 +217,7 @@ orc_sse_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 valu
orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, compiler->exec_reg,
reg, FALSE);
-#ifndef MMX
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,0,1,0), reg, reg);
-#endif
return;
}
@@ -426,11 +281,7 @@ orc_sse_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 valu
orc_x86_emit_mov_imm_reg (compiler, 4, value, compiler->gp_tmpreg);
orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, reg);
-#ifndef MMX
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(0,0,0,0), reg, reg);
-#else
- orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg);
-#endif
}
void
@@ -452,175 +303,25 @@ sse_load_constant_long (OrcCompiler *compiler, int reg,
orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
offset + 4*i, compiler->exec_reg);
}
- orc_x86_emit_mov_memoffset_sse (compiler, ORC_REG_SIZE, offset, compiler->exec_reg,
+ orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, compiler->exec_reg,
reg, FALSE);
}
-void
-sse_load_constants_outer (OrcCompiler *compiler)
-{
- int i;
- for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
- if (compiler->vars[i].name == NULL) continue;
- switch (compiler->vars[i].vartype) {
- case ORC_VAR_TYPE_CONST:
- break;
- case ORC_VAR_TYPE_PARAM:
- break;
- case ORC_VAR_TYPE_SRC:
- case ORC_VAR_TYPE_DEST:
- break;
- case ORC_VAR_TYPE_ACCUMULATOR:
- orc_sse_emit_pxor (compiler,
- compiler->vars[i].alloc, compiler->vars[i].alloc);
- break;
- case ORC_VAR_TYPE_TEMP:
- break;
- default:
- orc_compiler_error(compiler,"bad vartype");
- break;
- }
- }
-
- orc_compiler_emit_invariants (compiler);
-
- /* FIXME move to a better place */
- for(i=0;i<compiler->n_constants;i++){
- compiler->constants[i].alloc_reg =
- orc_compiler_get_constant_reg (compiler);
- }
-
- for(i=0;i<compiler->n_constants;i++){
- if (compiler->constants[i].alloc_reg) {
- if (compiler->constants[i].is_long) {
- sse_load_constant_long (compiler, compiler->constants[i].alloc_reg,
- compiler->constants + i);
- } else {
- sse_load_constant (compiler, compiler->constants[i].alloc_reg,
- 4, compiler->constants[i].value);
- }
- }
- }
-
- {
- for(i=0;i<compiler->n_insns;i++){
- OrcInstruction *insn = compiler->insns + i;
- OrcStaticOpcode *opcode = insn->opcode;
-
- if (strcmp (opcode->name, "ldreslinb") == 0 ||
- strcmp (opcode->name, "ldreslinl") == 0 ||
- strcmp (opcode->name, "ldresnearb") == 0 ||
- strcmp (opcode->name, "ldresnearl") == 0) {
- if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]),
- compiler->exec_reg,
- compiler->vars[insn->src_args[0]].ptr_offset);
- } else {
- orc_x86_emit_mov_imm_reg (compiler, 4,
- compiler->vars[insn->src_args[1]].value.i,
- compiler->vars[insn->src_args[0]].ptr_offset);
- }
- }
- }
- }
-}
-
-void
-sse_load_constants_inner (OrcCompiler *compiler)
+/* Store callback for the SSE target: forwards all arguments, in the
+ * same order, to orc_x86_emit_mov_sse_memoffset ().
+ *
+ * NOTE(review): judging by the other call sites in this file, reg1 is
+ * the source SSE register and offset/reg2 form the destination address;
+ * confirm against the orc_x86_emit_mov_sse_memoffset declaration.
+ */
+static void
+sse_move_register_to_memoffset (OrcCompiler *compiler, int size, int reg1, int offset, int reg2, int aligned, int uncached)
{
- int i;
- for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
- if (compiler->vars[i].name == NULL) continue;
- switch (compiler->vars[i].vartype) {
- case ORC_VAR_TYPE_CONST:
- break;
- case ORC_VAR_TYPE_PARAM:
- break;
- case ORC_VAR_TYPE_SRC:
- case ORC_VAR_TYPE_DEST:
- if (compiler->vars[i].ptr_register) {
- orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg,
- compiler->vars[i].ptr_register);
- }
- break;
- case ORC_VAR_TYPE_ACCUMULATOR:
- break;
- case ORC_VAR_TYPE_TEMP:
- break;
- default:
- orc_compiler_error(compiler,"bad vartype");
- break;
- }
- }
+ orc_x86_emit_mov_sse_memoffset (compiler, size, reg1, offset, reg2, aligned, uncached);
}
-void
-sse_add_strides (OrcCompiler *compiler)
-{
- int i;
-
- for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
- if (compiler->vars[i].name == NULL) continue;
- switch (compiler->vars[i].vartype) {
- case ORC_VAR_TYPE_CONST:
- break;
- case ORC_VAR_TYPE_PARAM:
- break;
- case ORC_VAR_TYPE_SRC:
- case ORC_VAR_TYPE_DEST:
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, params[i]), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_add_reg_memoffset (compiler, compiler->is_64bit ? 8 : 4,
- compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg);
-
- if (compiler->vars[i].ptr_register == 0) {
- orc_compiler_error (compiler, "unimplemented: stride on pointer stored in memory");
- }
- break;
- case ORC_VAR_TYPE_ACCUMULATOR:
- break;
- case ORC_VAR_TYPE_TEMP:
- break;
- default:
- orc_compiler_error(compiler,"bad vartype");
- break;
- }
- }
-}
-
-static int
-get_align_var (OrcCompiler *compiler)
+/* Load callback for the SSE target: forwards to
+ * orc_x86_emit_mov_memoffset_sse ().
+ *
+ * The emitter takes the memory offset before the base register, as at
+ * its other call sites in this file, e.g.
+ * (compiler, 16, offset, compiler->exec_reg, reg, FALSE). The arguments
+ * must therefore be passed as offset, reg1, reg2. Passing reg1 first
+ * would use the register id as the displacement and the offset as the
+ * base register.
+ */
+static void
+sse_move_memoffset_to_register (OrcCompiler *compiler, int size, int offset, int reg1, int reg2, int is_aligned)
{
- int i;
- for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
- if (compiler->vars[i].size == 0) continue;
- if ((compiler->vars[i].size << compiler->loop_shift) >= 16) {
- return i;
- }
- }
- for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
- if (compiler->vars[i].size == 0) continue;
- if ((compiler->vars[i].size << compiler->loop_shift) >= 8) {
- return i;
- }
- }
- for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
- if (compiler->vars[i].size == 0) continue;
- return i;
- }
-
- orc_compiler_error(compiler, "could not find alignment variable");
-
- return -1;
+ orc_x86_emit_mov_memoffset_sse (compiler, size, offset, reg1, reg2, is_aligned);
}
static int
-get_shift (int size)
+sse_get_shift (int size)
{
switch (size) {
case 1:
@@ -632,469 +333,55 @@ get_shift (int size)
case 8:
return 3;
default:
- ORC_ERROR("bad size %d", size);
+ ORC_ERROR ("bad size %d", size);
}
return -1;
}
-
-static void
-orc_emit_split_3_regions (OrcCompiler *compiler)
-{
- int align_var;
- int align_shift;
- int var_size_shift;
-
- align_var = get_align_var (compiler);
- if (align_var < 0)
- return;
- var_size_shift = get_shift (compiler->vars[align_var].size);
- align_shift = var_size_shift + compiler->loop_shift;
-
- /* determine how many iterations until align array is aligned (n1) */
- orc_x86_emit_mov_imm_reg (compiler, 4, 16, X86_EAX);
- orc_x86_emit_sub_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[align_var]),
- compiler->exec_reg, X86_EAX);
- orc_x86_emit_and_imm_reg (compiler, 4, (1<<align_shift) - 1, X86_EAX);
- orc_x86_emit_sar_imm_reg (compiler, 4, var_size_shift, X86_EAX);
-
- /* check if n1 is greater than n. */
- orc_x86_emit_cmp_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg);
-
- orc_x86_emit_jle (compiler, 6);
-
- /* If so, we have a standard 3-region split. */
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
-
- /* Calculate n2 */
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_sub_reg_reg (compiler, 4, X86_EAX, compiler->gp_tmpreg);
-
- orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
-
- orc_x86_emit_sar_imm_reg (compiler, 4,
- compiler->loop_shift + compiler->unroll_shift,
- compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
-
- /* Calculate n3 */
- orc_x86_emit_and_imm_reg (compiler, 4,
- (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
-
- orc_x86_emit_jmp (compiler, 7);
-
- /* else, iterations are all unaligned: n1=n, n2=0, n3=0 */
- orc_x86_emit_label (compiler, 6);
-
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
- orc_x86_emit_mov_imm_reg (compiler, 4, 0, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
-
- orc_x86_emit_label (compiler, 7);
-}
-
-static void
-orc_emit_split_2_regions (OrcCompiler *compiler)
-{
- int align_var;
- int align_shift ORC_GNUC_UNUSED;
- int var_size_shift;
-
- align_var = get_align_var (compiler);
- if (align_var < 0)
- return;
- var_size_shift = get_shift (compiler->vars[align_var].size);
- align_shift = var_size_shift + compiler->loop_shift;
-
- /* Calculate n2 */
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
- orc_x86_emit_sar_imm_reg (compiler, 4,
- compiler->loop_shift + compiler->unroll_shift,
- compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
-
- /* Calculate n3 */
- orc_x86_emit_and_imm_reg (compiler, 4,
- (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
-}
-
-#define LABEL_REGION1_SKIP 1
-#define LABEL_INNER_LOOP_START 2
-#define LABEL_REGION2_SKIP 3
-#define LABEL_OUTER_LOOP 4
-#define LABEL_OUTER_LOOP_SKIP 5
-#define LABEL_STEP_DOWN(x) (8+(x))
-#define LABEL_STEP_UP(x) (13+(x))
-
+/* MXCSR setup callback for the SSE target: forwards the compiler to
+ * orc_sse_set_mxcsr (). */
static void
-orc_compiler_sse_save_registers (OrcCompiler *compiler)
+sse_set_mxcsr (OrcCompiler *c)
{
- int i;
- int saved = 0;
- for (i = 0; i < ORC_REG_SIZE; ++i) {
- if (compiler->save_regs[X86_XMM0 + i] == 1) {
- ++saved;
- }
- }
- if (saved > 0) {
- orc_x86_emit_mov_imm_reg (compiler, 4, ORC_REG_SIZE * saved, compiler->gp_tmpreg);
- orc_x86_emit_sub_reg_reg (compiler, compiler->is_64bit ? 8 : 4,
- compiler->gp_tmpreg, X86_ESP);
- saved = 0;
- for (i = 0; i < ORC_REG_SIZE; ++i) {
- if (compiler->save_regs[X86_XMM0 + i] == 1) {
- orc_x86_emit_mov_sse_memoffset (compiler, ORC_REG_SIZE, X86_XMM0 + i,
- saved * ORC_REG_SIZE, X86_ESP, FALSE, FALSE);
- ++saved;
- }
- }
- }
+ orc_sse_set_mxcsr (c);
}
static void
-orc_compiler_sse_restore_registers (OrcCompiler *compiler)
+sse_restore_mxcsr(OrcCompiler *c)
{
- int i;
- int saved = 0;
- for (i = 0; i < ORC_REG_SIZE; ++i) {
- if (compiler->save_regs[X86_XMM0 + i] == 1) {
- orc_x86_emit_mov_memoffset_sse (compiler, ORC_REG_SIZE, saved * ORC_REG_SIZE, X86_ESP,
- X86_XMM0 + i, FALSE);
- ++saved;
- }
- }
- if (saved > 0) {
- orc_x86_emit_mov_imm_reg (compiler, 4, ORC_REG_SIZE * saved, compiler->gp_tmpreg);
- orc_x86_emit_add_reg_reg (compiler, compiler->is_64bit ? 8 : 4,
- compiler->gp_tmpreg, X86_ESP);
- }
-}
-
-static void
-orc_compiler_sse_assemble (OrcCompiler *compiler)
-{
-#ifndef MMX
- int set_mxcsr = FALSE;
-#endif
- int align_var;
- int is_aligned;
-
- if (0 && orc_x86_assemble_copy_check (compiler)) {
- /* The rep movs implementation isn't faster most of the time */
- orc_x86_assemble_copy (compiler);
- return;
- }
-
- align_var = get_align_var (compiler);
- if (align_var < 0) {
- orc_x86_assemble_copy (compiler);
- return;
- }
- is_aligned = compiler->vars[align_var].is_aligned;
-
- {
- orc_sse_emit_loop (compiler, 0, 0);
-
- compiler->codeptr = compiler->code;
- free (compiler->asm_code);
- compiler->asm_code = NULL;
- compiler->asm_code_len = 0;
- memset (compiler->labels, 0, sizeof (compiler->labels));
- memset (compiler->labels_int, 0, sizeof (compiler->labels_int));
- compiler->n_fixups = 0;
- compiler->n_output_insns = 0;
- }
-
- if (compiler->error) return;
-
- orc_x86_emit_prologue (compiler);
-
- orc_compiler_sse_save_registers (compiler);
-
-#ifndef MMX
- if (orc_program_has_float (compiler)) {
- set_mxcsr = TRUE;
- orc_sse_set_mxcsr (compiler);
- }
-#endif
-
- sse_load_constants_outer (compiler);
-
- if (compiler->program->is_2d) {
- if (compiler->program->constant_m > 0) {
- orc_x86_emit_mov_imm_reg (compiler, 4, compiler->program->constant_m,
- X86_EAX);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]),
- compiler->exec_reg);
- } else {
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]),
- compiler->exec_reg, X86_EAX);
- orc_x86_emit_test_reg_reg (compiler, 4, X86_EAX, X86_EAX);
- orc_x86_emit_jle (compiler, LABEL_OUTER_LOOP_SKIP);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]),
- compiler->exec_reg);
- }
-
- orc_x86_emit_label (compiler, LABEL_OUTER_LOOP);
- }
-
- if (compiler->program->constant_n > 0 &&
- compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) {
- /* don't need to load n */
- } else if (compiler->loop_shift > 0) {
- if (compiler->has_iterator_opcode || is_aligned) {
- orc_emit_split_2_regions (compiler);
- } else {
- /* split n into three regions, with center region being aligned */
- orc_emit_split_3_regions (compiler);
- }
- } else {
- /* loop shift is 0, no need to split */
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
- compiler->gp_tmpreg);
- orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
- }
-
- sse_load_constants_inner (compiler);
-
- if (compiler->program->constant_n > 0 &&
- compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) {
- int n_left = compiler->program->constant_n;
- int save_loop_shift;
- int loop_shift;
-
- compiler->offset = 0;
-
- save_loop_shift = compiler->loop_shift;
- while (n_left >= (1<<compiler->loop_shift)) {
- ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
- orc_sse_emit_loop (compiler, compiler->offset, 0);
-
- n_left -= 1<<compiler->loop_shift;
- compiler->offset += 1<<compiler->loop_shift;
- }
- for(loop_shift = compiler->loop_shift-1; loop_shift>=0; loop_shift--) {
- if (n_left >= (1<<loop_shift)) {
- compiler->loop_shift = loop_shift;
- ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", loop_shift);
- orc_sse_emit_loop (compiler, compiler->offset, 0);
- n_left -= 1<<loop_shift;
- compiler->offset += 1<<loop_shift;
- }
- }
- compiler->loop_shift = save_loop_shift;
-
- } else {
- int ui, ui_max;
- int emit_region1 = TRUE;
- int emit_region3 = TRUE;
-
- if (compiler->has_iterator_opcode || is_aligned) {
- emit_region1 = FALSE;
- }
- if (compiler->loop_shift == 0) {
- emit_region1 = FALSE;
- emit_region3 = FALSE;
- }
-
- if (emit_region1) {
- int save_loop_shift;
- int l;
-
- save_loop_shift = compiler->loop_shift;
- compiler->vars[align_var].is_aligned = FALSE;
-
- for (l=0;l<save_loop_shift;l++){
- compiler->loop_shift = l;
- ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
-
- orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
- orc_x86_emit_je (compiler, LABEL_STEP_UP(compiler->loop_shift));
- orc_sse_emit_loop (compiler, 0, 1<<compiler->loop_shift);
- orc_x86_emit_label (compiler, LABEL_STEP_UP(compiler->loop_shift));
- }
-
- compiler->loop_shift = save_loop_shift;
- compiler->vars[align_var].is_aligned = TRUE;
- }
-
- orc_x86_emit_label (compiler, LABEL_REGION1_SKIP);
-
- orc_x86_emit_cmp_imm_memoffset (compiler, 4, 0,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
- orc_x86_emit_je (compiler, LABEL_REGION2_SKIP);
-
- if (compiler->loop_counter != ORC_REG_INVALID) {
- orc_x86_emit_mov_memoffset_reg (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, counter2), compiler->exec_reg,
- compiler->loop_counter);
- }
-
- ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
- orc_x86_emit_align (compiler, 4);
- orc_x86_emit_label (compiler, LABEL_INNER_LOOP_START);
- ui_max = 1<<compiler->unroll_shift;
- for(ui=0;ui<ui_max;ui++) {
- compiler->offset = ui<<compiler->loop_shift;
- orc_sse_emit_loop (compiler, compiler->offset,
- (ui==ui_max-1) << (compiler->loop_shift + compiler->unroll_shift));
- }
- compiler->offset = 0;
- if (compiler->loop_counter != ORC_REG_INVALID) {
- orc_x86_emit_add_imm_reg (compiler, 4, -1, compiler->loop_counter, TRUE);
- } else {
- orc_x86_emit_dec_memoffset (compiler, 4,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2),
- compiler->exec_reg);
- }
- orc_x86_emit_jne (compiler, LABEL_INNER_LOOP_START);
- orc_x86_emit_label (compiler, LABEL_REGION2_SKIP);
-
- if (emit_region3) {
- int save_loop_shift;
- int l;
-
- save_loop_shift = compiler->loop_shift + compiler->unroll_shift;
- compiler->vars[align_var].is_aligned = FALSE;
-
- for(l=save_loop_shift - 1; l >= 0; l--) {
- compiler->loop_shift = l;
- ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
-
- orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
- orc_x86_emit_je (compiler, LABEL_STEP_DOWN(compiler->loop_shift));
- orc_sse_emit_loop (compiler, 0, 1<<compiler->loop_shift);
- orc_x86_emit_label (compiler, LABEL_STEP_DOWN(compiler->loop_shift));
- }
-
- compiler->loop_shift = save_loop_shift;
- }
- }
-
- if (compiler->program->is_2d && compiler->program->constant_m != 1) {
- sse_add_strides (compiler);
-
- orc_x86_emit_add_imm_memoffset (compiler, 4, -1,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]),
- compiler->exec_reg);
- orc_x86_emit_jne (compiler, LABEL_OUTER_LOOP);
- orc_x86_emit_label (compiler, LABEL_OUTER_LOOP_SKIP);
- }
-
- sse_save_accumulators (compiler);
-
-#ifndef MMX
- if (set_mxcsr) {
- orc_sse_restore_mxcsr (compiler);
- }
-#else
- orc_x86_emit_emms (compiler);
-#endif
-
- orc_compiler_sse_restore_registers (compiler);
-
- orc_x86_emit_epilogue (compiler);
-
- orc_x86_calculate_offsets (compiler);
- orc_x86_output_insns (compiler);
-
- orc_x86_do_fixups (compiler);
+ orc_sse_restore_mxcsr (c);
}
-static void
-orc_sse_emit_loop (OrcCompiler *compiler, int offset, int update)
+/* Registers the SSE backend.
+ *
+ * Builds a static OrcX86Target descriptor named "sse" from the callbacks
+ * defined in this file, hands it to orc_x86_register_target () and then
+ * registers the SSE rules on the OrcTarget that call returns.
+ *
+ * NOTE(review): the four trailing initializers (16, X86_XMM0, 16, 13)
+ * are presumably the register size in bytes, the first vector register,
+ * the register count and the base label for the step-up loops; confirm
+ * against the OrcX86Target definition.
+ */
+void
+orc_sse_init (void)
{
- int j;
- int k;
- OrcInstruction *insn;
- OrcStaticOpcode *opcode;
- OrcRule *rule;
-
- for(j=0;j<compiler->n_insns;j++){
- insn = compiler->insns + j;
- opcode = insn->opcode;
-
- compiler->insn_index = j;
-
- if (insn->flags & ORC_INSN_FLAG_INVARIANT) continue;
-
- ORC_ASM_CODE(compiler,"# %d: %s\n", j, insn->opcode->name);
-
- compiler->min_temp_reg = ORC_VEC_REG_BASE;
-
- compiler->insn_shift = compiler->loop_shift;
- if (insn->flags & ORC_INSTRUCTION_FLAG_X2) {
- compiler->insn_shift += 1;
- }
- if (insn->flags & ORC_INSTRUCTION_FLAG_X4) {
- compiler->insn_shift += 2;
- }
-
- rule = insn->rule;
- if (rule && rule->emit) {
- rule->emit (compiler, rule->emit_user, insn);
- } else {
- orc_compiler_error (compiler, "no code generation rule for %s",
- opcode->name);
- }
- }
-
- if (update) {
- for(k=0;k<ORC_N_COMPILER_VARIABLES;k++){
- OrcVariable *var = compiler->vars + k;
-
- if (var->name == NULL) continue;
- if (var->vartype == ORC_VAR_TYPE_SRC ||
- var->vartype == ORC_VAR_TYPE_DEST) {
- int offset;
- if (var->update_type == 0) {
- offset = 0;
- } else if (var->update_type == 1) {
- offset = (var->size * update) >> 1;
- } else {
- offset = var->size * update;
- }
+ // clang-format off
+ static OrcX86Target target = {
+ "sse",
+ sse_get_default_flags,
+ sse_get_flag_name,
+ sse_is_executable,
+ sse_validate_registers,
+ sse_saveable_registers,
+ sse_is_64bit,
+ sse_use_frame_pointer,
+ sse_use_long_jumps,
+ sse_loop_shift,
+ sse_init_accumulator,
+ sse_reduce_accumulator,
+ orc_sse_load_constant,
+ sse_load_constant_long,
+ sse_move_register_to_memoffset,
+ sse_move_memoffset_to_register,
+ sse_get_shift,
+ sse_set_mxcsr,
+ sse_restore_mxcsr,
+ 16,
+ X86_XMM0,
+ 16,
+ 13,
+ };
+ // clang-format on
+ OrcTarget *t;
- if (offset != 0) {
- if (compiler->vars[k].ptr_register) {
- orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4,
- offset,
- compiler->vars[k].ptr_register, FALSE);
- } else {
- orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4,
- offset,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]),
- compiler->exec_reg);
- }
- }
- }
- }
- }
+ t = orc_x86_register_target (&target);
+ orc_compiler_sse_register_rules (t);
}