diff options
author | Jorge Zapata <jorgeluis.zapata@gmail.com> | 2024-01-21 19:52:41 +0100 |
---|---|---|
committer | Jorge Zapata <jorgeluis.zapata@gmail.com> | 2024-03-12 10:03:58 +0100 |
commit | f3c13218a8f5a8be28f0f0ea1d3a761c854673f4 (patch) | |
tree | 2be64c14e7d299270a5d563ead3ae9a24ebb90a6 | |
parent | 45c2be4f3a964148f2fa25ded3c37b4dedef08d8 (diff) |
Initial migration of SSE target
Part-of: <https://gitlab.freedesktop.org/gstreamer/orc/-/merge_requests/148>
-rw-r--r-- | orc/orcprogram-sse.c | 961 |
1 files changed, 124 insertions, 837 deletions
diff --git a/orc/orcprogram-sse.c b/orc/orcprogram-sse.c index e7ee77d..2d89cad 100644 --- a/orc/orcprogram-sse.c +++ b/orc/orcprogram-sse.c @@ -1,4 +1,3 @@ - #include "config.h" #include <stdio.h> @@ -14,79 +13,14 @@ #include <orc/orcdebug.h> #include <orc/orcinternal.h> -#undef MMX -#ifdef MMX -# define ORC_REG_SIZE 8 -#else -# define ORC_REG_SIZE 16 -#endif -#define SIZE 65536 - -#define ORC_SSE_ALIGNED_DEST_CUTOFF 64 - -static void orc_sse_emit_loop (OrcCompiler *compiler, int offset, int update); - -void orc_compiler_sse_register_rules (OrcTarget *target); -static void orc_compiler_sse_init (OrcCompiler *compiler); -static unsigned int orc_compiler_sse_get_default_flags (void); -static void orc_compiler_sse_assemble (OrcCompiler *compiler); - -void sse_load_constant (OrcCompiler *compiler, int reg, int size, int value); -void sse_load_constant_long (OrcCompiler *compiler, int reg, - OrcConstant *constant); -static const char * sse_get_flag_name (int shift); - -static OrcTarget sse_target = { - "sse", -#if defined(HAVE_I386) || defined(HAVE_AMD64) - TRUE, -#else - FALSE, -#endif - ORC_VEC_REG_BASE, - orc_compiler_sse_get_default_flags, - orc_compiler_sse_init, - orc_compiler_sse_assemble, - { { 0 } }, - 0, - NULL, - sse_load_constant, - sse_get_flag_name, - NULL, - sse_load_constant_long -}; - - extern int orc_x86_sse_flags; -extern int orc_x86_mmx_flags; - -void -orc_sse_init (void) -{ -#if defined(HAVE_AMD64) || defined(HAVE_I386) - /* initializes cache information */ - orc_sse_get_cpu_flags (); -#endif - -#if defined(HAVE_I386) -#ifndef MMX - if (!(orc_x86_sse_flags & ORC_TARGET_SSE_SSE2)) { - sse_target.executable = FALSE; - } -#else - if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMX)) { - mmx_target.executable = FALSE; - } -#endif -#endif - - orc_target_register (&sse_target); - orc_compiler_sse_register_rules (&sse_target); -} +/* TODO To be placed in a common header for private stuff */ +void orc_compiler_sse_register_rules (OrcTarget *target); +/* X86 specific */ static unsigned int -orc_compiler_sse_get_default_flags (void) +sse_get_default_flags (void) { unsigned int flags = 0; @@ -98,20 +32,11 @@ orc_compiler_sse_get_default_flags (void) } #if defined(HAVE_AMD64) || defined(HAVE_I386) -#ifndef MMX flags |= orc_x86_sse_flags; #else - flags |= orc_x86_mmx_flags; -#endif -#else -#ifndef MMX flags |= ORC_TARGET_SSE_SSE2; flags |= ORC_TARGET_SSE_SSE3; flags |= ORC_TARGET_SSE_SSSE3; -#else - flags |= ORC_TARGET_MMX_MMX; - flags |= ORC_TARGET_MMX_3DNOW; -#endif #endif return flags; @@ -121,13 +46,8 @@ static const char * sse_get_flag_name (int shift) { static const char *flags[] = { -#ifndef MMX "sse2", "sse3", "ssse3", "sse41", "sse42", "sse4a", "sse5", "frame_pointer", "short_jumps", "64bit" -#else - "mmx", "mmxext", "3dnow", "3dnowext", "ssse3", "sse41", "", - "frame_pointer", "short_jumps", "64bit" -#endif }; if (shift >= 0 && shift < sizeof(flags)/sizeof(flags[0])) { @@ -137,163 +57,117 @@ sse_get_flag_name (int shift) return NULL; } -static void -orc_compiler_sse_init (OrcCompiler *compiler) +static int +sse_is_executable (void) { - int i; +#if defined(HAVE_AMD64) || defined(HAVE_I386) + /* initializes cache information */ + const int flags = orc_sse_get_cpu_flags (); - if (compiler->target_flags & ORC_TARGET_SSE_64BIT) { - compiler->is_64bit = TRUE; - } - if (compiler->target_flags & ORC_TARGET_SSE_FRAME_POINTER) { - compiler->use_frame_pointer = TRUE; - } - if (!(compiler->target_flags & ORC_TARGET_SSE_SHORT_JUMPS)) { - compiler->long_jumps = TRUE; + if (orc_x86_sse_flags & ORC_TARGET_SSE_SSE2) { + return TRUE; } - +#endif + return FALSE; +} - if (compiler->is_64bit) { - for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){ - compiler->valid_regs[i] = 1; +static void +sse_validate_registers (int *regs, int is_64bit) +{ + int i; + + if (is_64bit) { + for(i = X86_XMM0;i < X86_XMM0 + 16; i++){ + regs[i] = 1; } - compiler->valid_regs[X86_ESP] = 0; - for(i=X86_XMM0;i<X86_XMM0+ORC_REG_SIZE;i++){ - compiler->valid_regs[i] = 1; + } else { + for(i = X86_XMM0; i < X86_XMM0 + 8; i++){ + regs[i] = 1; } - compiler->save_regs[X86_EBX] = 1; - compiler->save_regs[X86_EBP] = 1; - compiler->save_regs[X86_R12] = 1; - compiler->save_regs[X86_R13] = 1; - compiler->save_regs[X86_R14] = 1; - compiler->save_regs[X86_R15] = 1; + } +} + +static void +sse_saveable_registers (int *regs, int is_64bit) +{ #ifdef HAVE_OS_WIN32 - compiler->save_regs[X86_EDI] = 1; - compiler->save_regs[X86_ESI] = 1; - for(i=X86_XMM0+6;i<X86_XMM0+ORC_REG_SIZE;i++){ - compiler->save_regs[i] = 1; + if (is_64bit) { + int i; + for(i = X86_XMM0 + 6; i < X86_XMM0 + 16; i++){ + regs[i] = 1; } + } #endif +} + +static int +sse_is_64bit (int flags) +{ + if (flags & ORC_TARGET_SSE_64BIT) { + return TRUE; } else { - for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+8;i++){ - compiler->valid_regs[i] = 1; - } - compiler->valid_regs[X86_ESP] = 0; - if (compiler->use_frame_pointer) { - compiler->valid_regs[X86_EBP] = 0; - } - for(i=X86_XMM0;i<X86_XMM0+8;i++){ - compiler->valid_regs[i] = 1; - } - compiler->save_regs[X86_EBX] = 1; - compiler->save_regs[X86_EDI] = 1; - compiler->save_regs[X86_EBP] = 1; + return FALSE; } - for(i=0;i<128;i++){ - compiler->alloc_regs[i] = 0; - compiler->used_regs[i] = 0; +} + +static int +sse_use_frame_pointer (int flags) +{ + if (flags & ORC_TARGET_SSE_FRAME_POINTER) { + return TRUE; + } else { + return FALSE; } +} - if (compiler->is_64bit) { -#ifdef HAVE_OS_WIN32 - compiler->exec_reg = X86_ECX; - compiler->gp_tmpreg = X86_EDX; -#else - compiler->exec_reg = X86_EDI; - compiler->gp_tmpreg = X86_ECX; -#endif +static int +sse_use_long_jumps (int flags) +{ + if (!(flags & ORC_TARGET_SSE_SHORT_JUMPS)) { + return TRUE; } else { - compiler->gp_tmpreg = X86_ECX; - if (compiler->use_frame_pointer) { - compiler->exec_reg = X86_EBX; - } else { - compiler->exec_reg = X86_EBP; - } + return FALSE; } - compiler->valid_regs[compiler->gp_tmpreg] = 0; - compiler->valid_regs[compiler->exec_reg] = 0; +} - switch (compiler->max_var_size) { +static int +sse_loop_shift (int max_var_size) +{ + switch (max_var_size) { case 1: - compiler->loop_shift = 4; - break; + return 4; case 2: - compiler->loop_shift = 3; - break; + return 3; case 4: - compiler->loop_shift = 2; - break; + return 2; case 8: - compiler->loop_shift = 1; - break; + return 1; default: - ORC_ERROR("unhandled max var size %d", compiler->max_var_size); + ORC_ERROR ("unhandled max var size %d", max_var_size); break; } -#ifdef MMX - compiler->loop_shift--; -#endif - /* This limit is arbitrary, but some large functions run slightly - slower when unrolled (ginger Core2 6,15,6), and only some small - functions run faster when unrolled. Most are the same speed. */ - if (compiler->n_insns <= 10) { - compiler->unroll_shift = 1; - } - if (!compiler->long_jumps) { - compiler->unroll_shift = 0; - } - if (compiler->loop_shift == 0) { - /* FIXME something is broken with loop_shift=0, unroll_shift=1 */ - compiler->unroll_shift = 0; - } - compiler->alloc_loop_counter = TRUE; - compiler->allow_gp_on_stack = TRUE; - - { - for(i=0;i<compiler->n_insns;i++){ - OrcInstruction *insn = compiler->insns + i; - OrcStaticOpcode *opcode = insn->opcode; - - if (strcmp (opcode->name, "ldreslinb") == 0 || - strcmp (opcode->name, "ldreslinl") == 0 || - strcmp (opcode->name, "ldresnearb") == 0 || - strcmp (opcode->name, "ldresnearl") == 0) { - compiler->vars[insn->src_args[0]].need_offset_reg = TRUE; - } - } - } + return -1; } -void -sse_save_accumulators (OrcCompiler *compiler) +static void +sse_init_accumulator (OrcCompiler *compiler, OrcVariable *var) { - int i; - int src; - int tmp; - - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - OrcVariable *var = compiler->vars + i; + orc_sse_emit_pxor (compiler, var->alloc, var->alloc); +} - if (var->name == NULL) continue; - switch (var->vartype) { - case ORC_VAR_TYPE_ACCUMULATOR: - src = var->alloc; - tmp = orc_compiler_get_temp_reg (compiler); +static void +sse_reduce_accumulator (OrcCompiler *compiler, int i, OrcVariable *var) { + const int src = var->alloc; + const int tmp = orc_compiler_get_temp_reg (compiler); -#ifndef MMX orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(3,2,3,2), src, tmp); -#else - orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp); -#endif - if (var->size == 2) { orc_sse_emit_paddw (compiler, tmp, src); } else { orc_sse_emit_paddd (compiler, tmp, src); } -#ifndef MMX orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,1,1,1), src, tmp); if (var->size == 2) { @@ -301,15 +175,9 @@ sse_save_accumulators (OrcCompiler *compiler) } else { orc_sse_emit_paddd (compiler, tmp, src); } -#endif if (var->size == 2) { -#ifndef MMX orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(1,1,1,1), src, tmp); -#else - orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp); -#endif - orc_sse_emit_paddw (compiler, tmp, src); } @@ -325,19 +193,8 @@ sse_save_accumulators (OrcCompiler *compiler) compiler->exec_reg, var->is_aligned, var->is_uncached); } - - break; - default: - break; - } - } } -void -sse_load_constant (OrcCompiler *compiler, int reg, int size, int value) -{ - orc_sse_load_constant (compiler, reg, size, value); -} void orc_sse_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 value) @@ -360,9 +217,7 @@ orc_sse_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 valu orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, compiler->exec_reg, reg, FALSE); -#ifndef MMX orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,0,1,0), reg, reg); -#endif return; } @@ -426,11 +281,7 @@ orc_sse_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 valu orc_x86_emit_mov_imm_reg (compiler, 4, value, compiler->gp_tmpreg); orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, reg); -#ifndef MMX orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(0,0,0,0), reg, reg); -#else - orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg); -#endif } void @@ -452,175 +303,25 @@ sse_load_constant_long (OrcCompiler *compiler, int reg, orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, offset + 4*i, compiler->exec_reg); } - orc_x86_emit_mov_memoffset_sse (compiler, ORC_REG_SIZE, offset, compiler->exec_reg, + orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, compiler->exec_reg, reg, FALSE); } -void -sse_load_constants_outer (OrcCompiler *compiler) -{ - int i; - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - if (compiler->vars[i].name == NULL) continue; - switch (compiler->vars[i].vartype) { - case ORC_VAR_TYPE_CONST: - break; - case ORC_VAR_TYPE_PARAM: - break; - case ORC_VAR_TYPE_SRC: - case ORC_VAR_TYPE_DEST: - break; - case ORC_VAR_TYPE_ACCUMULATOR: - orc_sse_emit_pxor (compiler, - compiler->vars[i].alloc, compiler->vars[i].alloc); - break; - case ORC_VAR_TYPE_TEMP: - break; - default: - orc_compiler_error(compiler,"bad vartype"); - break; - } - } - - orc_compiler_emit_invariants (compiler); - - /* FIXME move to a better place */ - for(i=0;i<compiler->n_constants;i++){ - compiler->constants[i].alloc_reg = - orc_compiler_get_constant_reg (compiler); - } - - for(i=0;i<compiler->n_constants;i++){ - if (compiler->constants[i].alloc_reg) { - if (compiler->constants[i].is_long) { - sse_load_constant_long (compiler, compiler->constants[i].alloc_reg, - compiler->constants + i); - } else { - sse_load_constant (compiler, compiler->constants[i].alloc_reg, - 4, compiler->constants[i].value); - } - } - } - - { - for(i=0;i<compiler->n_insns;i++){ - OrcInstruction *insn = compiler->insns + i; - OrcStaticOpcode *opcode = insn->opcode; - - if (strcmp (opcode->name, "ldreslinb") == 0 || - strcmp (opcode->name, "ldreslinl") == 0 || - strcmp (opcode->name, "ldresnearb") == 0 || - strcmp (opcode->name, "ldresnearl") == 0) { - if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) { - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]), - compiler->exec_reg, - compiler->vars[insn->src_args[0]].ptr_offset); - } else { - orc_x86_emit_mov_imm_reg (compiler, 4, - compiler->vars[insn->src_args[1]].value.i, - compiler->vars[insn->src_args[0]].ptr_offset); - } - } - } - } -} - -void -sse_load_constants_inner (OrcCompiler *compiler) +static void +sse_move_register_to_memoffset (OrcCompiler *compiler, int size, int reg1, int offset, int reg2, int aligned, int uncached) { - int i; - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - if (compiler->vars[i].name == NULL) continue; - switch (compiler->vars[i].vartype) { - case ORC_VAR_TYPE_CONST: - break; - case ORC_VAR_TYPE_PARAM: - break; - case ORC_VAR_TYPE_SRC: - case ORC_VAR_TYPE_DEST: - if (compiler->vars[i].ptr_register) { - orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, - compiler->vars[i].ptr_register); - } - break; - case ORC_VAR_TYPE_ACCUMULATOR: - break; - case ORC_VAR_TYPE_TEMP: - break; - default: - orc_compiler_error(compiler,"bad vartype"); - break; - } - } + orc_x86_emit_mov_sse_memoffset (compiler, size, reg1, offset, reg2, aligned, uncached); } -void -sse_add_strides (OrcCompiler *compiler) -{ - int i; - - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - if (compiler->vars[i].name == NULL) continue; - switch (compiler->vars[i].vartype) { - case ORC_VAR_TYPE_CONST: - break; - case ORC_VAR_TYPE_PARAM: - break; - case ORC_VAR_TYPE_SRC: - case ORC_VAR_TYPE_DEST: - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[i]), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_add_reg_memoffset (compiler, compiler->is_64bit ? 8 : 4, - compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg); - - if (compiler->vars[i].ptr_register == 0) { - orc_compiler_error (compiler, "unimplemented: stride on pointer stored in memory"); - } - break; - case ORC_VAR_TYPE_ACCUMULATOR: - break; - case ORC_VAR_TYPE_TEMP: - break; - default: - orc_compiler_error(compiler,"bad vartype"); - break; - } - } -} - -static int -get_align_var (OrcCompiler *compiler) +static void +sse_move_memoffset_to_register (OrcCompiler *compiler, int size, int offset, int reg1, int reg2, int is_aligned) { - int i; - for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){ - if (compiler->vars[i].size == 0) continue; - if ((compiler->vars[i].size << compiler->loop_shift) >= 16) { - return i; - } - } - for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){ - if (compiler->vars[i].size == 0) continue; - if ((compiler->vars[i].size << compiler->loop_shift) >= 8) { - return i; - } - } - for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){ - if (compiler->vars[i].size == 0) continue; - return i; - } - - orc_compiler_error(compiler, "could not find alignment variable"); - - return -1; + orc_x86_emit_mov_memoffset_sse (compiler, size, reg1, offset, reg2, is_aligned); } static int -get_shift (int size) +sse_get_shift (int size) { switch (size) { case 1: @@ -632,469 +333,55 @@ get_shift (int size) case 8: return 3; default: - ORC_ERROR("bad size %d", size); + ORC_ERROR ("bad size %d", size); } return -1; } - -static void -orc_emit_split_3_regions (OrcCompiler *compiler) -{ - int align_var; - int align_shift; - int var_size_shift; - - align_var = get_align_var (compiler); - if (align_var < 0) - return; - var_size_shift = get_shift (compiler->vars[align_var].size); - align_shift = var_size_shift + compiler->loop_shift; - - /* determine how many iterations until align array is aligned (n1) */ - orc_x86_emit_mov_imm_reg (compiler, 4, 16, X86_EAX); - orc_x86_emit_sub_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[align_var]), - compiler->exec_reg, X86_EAX); - orc_x86_emit_and_imm_reg (compiler, 4, (1<<align_shift) - 1, X86_EAX); - orc_x86_emit_sar_imm_reg (compiler, 4, var_size_shift, X86_EAX); - - /* check if n1 is greater than n. */ - orc_x86_emit_cmp_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg); - - orc_x86_emit_jle (compiler, 6); - - /* If so, we have a standard 3-region split. */ - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg); - - /* Calculate n2 */ - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_sub_reg_reg (compiler, 4, X86_EAX, compiler->gp_tmpreg); - - orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX); - - orc_x86_emit_sar_imm_reg (compiler, 4, - compiler->loop_shift + compiler->unroll_shift, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - - /* Calculate n3 */ - orc_x86_emit_and_imm_reg (compiler, 4, - (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); - - orc_x86_emit_jmp (compiler, 7); - - /* else, iterations are all unaligned: n1=n, n2=0, n3=0 */ - orc_x86_emit_label (compiler, 6); - - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg); - orc_x86_emit_mov_imm_reg (compiler, 4, 0, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); - - orc_x86_emit_label (compiler, 7); -} - -static void -orc_emit_split_2_regions (OrcCompiler *compiler) -{ - int align_var; - int align_shift ORC_GNUC_UNUSED; - int var_size_shift; - - align_var = get_align_var (compiler); - if (align_var < 0) - return; - var_size_shift = get_shift (compiler->vars[align_var].size); - align_shift = var_size_shift + compiler->loop_shift; - - /* Calculate n2 */ - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX); - orc_x86_emit_sar_imm_reg (compiler, 4, - compiler->loop_shift + compiler->unroll_shift, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - - /* Calculate n3 */ - orc_x86_emit_and_imm_reg (compiler, 4, - (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); -} - -#define LABEL_REGION1_SKIP 1 -#define LABEL_INNER_LOOP_START 2 -#define LABEL_REGION2_SKIP 3 -#define LABEL_OUTER_LOOP 4 -#define LABEL_OUTER_LOOP_SKIP 5 -#define LABEL_STEP_DOWN(x) (8+(x)) -#define LABEL_STEP_UP(x) (13+(x)) - static void -orc_compiler_sse_save_registers (OrcCompiler *compiler) +sse_set_mxcsr (OrcCompiler *c) { - int i; - int saved = 0; - for (i = 0; i < ORC_REG_SIZE; ++i) { - if (compiler->save_regs[X86_XMM0 + i] == 1) { - ++saved; - } - } - if (saved > 0) { - orc_x86_emit_mov_imm_reg (compiler, 4, ORC_REG_SIZE * saved, compiler->gp_tmpreg); - orc_x86_emit_sub_reg_reg (compiler, compiler->is_64bit ? 8 : 4, - compiler->gp_tmpreg, X86_ESP); - saved = 0; - for (i = 0; i < ORC_REG_SIZE; ++i) { - if (compiler->save_regs[X86_XMM0 + i] == 1) { - orc_x86_emit_mov_sse_memoffset (compiler, ORC_REG_SIZE, X86_XMM0 + i, - saved * ORC_REG_SIZE, X86_ESP, FALSE, FALSE); - ++saved; - } - } - } + orc_sse_set_mxcsr (c); } static void -orc_compiler_sse_restore_registers (OrcCompiler *compiler) +sse_restore_mxcsr(OrcCompiler *c) { - int i; - int saved = 0; - for (i = 0; i < ORC_REG_SIZE; ++i) { - if (compiler->save_regs[X86_XMM0 + i] == 1) { - orc_x86_emit_mov_memoffset_sse (compiler, ORC_REG_SIZE, saved * ORC_REG_SIZE, X86_ESP, - X86_XMM0 + i, FALSE); - ++saved; - } - } - if (saved > 0) { - orc_x86_emit_mov_imm_reg (compiler, 4, ORC_REG_SIZE * saved, compiler->gp_tmpreg); - orc_x86_emit_add_reg_reg (compiler, compiler->is_64bit ? 8 : 4, - compiler->gp_tmpreg, X86_ESP); - } -} - -static void -orc_compiler_sse_assemble (OrcCompiler *compiler) -{ -#ifndef MMX - int set_mxcsr = FALSE; -#endif - int align_var; - int is_aligned; - - if (0 && orc_x86_assemble_copy_check (compiler)) { - /* The rep movs implementation isn't faster most of the time */ - orc_x86_assemble_copy (compiler); - return; - } - - align_var = get_align_var (compiler); - if (align_var < 0) { - orc_x86_assemble_copy (compiler); - return; - } - is_aligned = compiler->vars[align_var].is_aligned; - - { - orc_sse_emit_loop (compiler, 0, 0); - - compiler->codeptr = compiler->code; - free (compiler->asm_code); - compiler->asm_code = NULL; - compiler->asm_code_len = 0; - memset (compiler->labels, 0, sizeof (compiler->labels)); - memset (compiler->labels_int, 0, sizeof (compiler->labels_int)); - compiler->n_fixups = 0; - compiler->n_output_insns = 0; - } - - if (compiler->error) return; - - orc_x86_emit_prologue (compiler); - - orc_compiler_sse_save_registers (compiler); - -#ifndef MMX - if (orc_program_has_float (compiler)) { - set_mxcsr = TRUE; - orc_sse_set_mxcsr (compiler); - } -#endif - - sse_load_constants_outer (compiler); - - if (compiler->program->is_2d) { - if (compiler->program->constant_m > 0) { - orc_x86_emit_mov_imm_reg (compiler, 4, compiler->program->constant_m, - X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]), - compiler->exec_reg); - } else { - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]), - compiler->exec_reg, X86_EAX); - orc_x86_emit_test_reg_reg (compiler, 4, X86_EAX, X86_EAX); - orc_x86_emit_jle (compiler, LABEL_OUTER_LOOP_SKIP); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]), - compiler->exec_reg); - } - - orc_x86_emit_label (compiler, LABEL_OUTER_LOOP); - } - - if (compiler->program->constant_n > 0 && - compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) { - /* don't need to load n */ - } else if (compiler->loop_shift > 0) { - if (compiler->has_iterator_opcode || is_aligned) { - orc_emit_split_2_regions (compiler); - } else { - /* split n into three regions, with center region being aligned */ - orc_emit_split_3_regions (compiler); - } - } else { - /* loop shift is 0, no need to split */ - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - } - - sse_load_constants_inner (compiler); - - if (compiler->program->constant_n > 0 && - compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) { - int n_left = compiler->program->constant_n; - int save_loop_shift; - int loop_shift; - - compiler->offset = 0; - - save_loop_shift = compiler->loop_shift; - while (n_left >= (1<<compiler->loop_shift)) { - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - orc_sse_emit_loop (compiler, compiler->offset, 0); - - n_left -= 1<<compiler->loop_shift; - compiler->offset += 1<<compiler->loop_shift; - } - for(loop_shift = compiler->loop_shift-1; loop_shift>=0; loop_shift--) { - if (n_left >= (1<<loop_shift)) { - compiler->loop_shift = loop_shift; - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", loop_shift); - orc_sse_emit_loop (compiler, compiler->offset, 0); - n_left -= 1<<loop_shift; - compiler->offset += 1<<loop_shift; - } - } - compiler->loop_shift = save_loop_shift; - - } else { - int ui, ui_max; - int emit_region1 = TRUE; - int emit_region3 = TRUE; - - if (compiler->has_iterator_opcode || is_aligned) { - emit_region1 = FALSE; - } - if (compiler->loop_shift == 0) { - emit_region1 = FALSE; - emit_region3 = FALSE; - } - - if (emit_region1) { - int save_loop_shift; - int l; - - save_loop_shift = compiler->loop_shift; - compiler->vars[align_var].is_aligned = FALSE; - - for (l=0;l<save_loop_shift;l++){ - compiler->loop_shift = l; - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - - orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg); - orc_x86_emit_je (compiler, LABEL_STEP_UP(compiler->loop_shift)); - orc_sse_emit_loop (compiler, 0, 1<<compiler->loop_shift); - orc_x86_emit_label (compiler, LABEL_STEP_UP(compiler->loop_shift)); - } - - compiler->loop_shift = save_loop_shift; - compiler->vars[align_var].is_aligned = TRUE; - } - - orc_x86_emit_label (compiler, LABEL_REGION1_SKIP); - - orc_x86_emit_cmp_imm_memoffset (compiler, 4, 0, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - orc_x86_emit_je (compiler, LABEL_REGION2_SKIP); - - if (compiler->loop_counter != ORC_REG_INVALID) { - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, counter2), compiler->exec_reg, - compiler->loop_counter); - } - - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - orc_x86_emit_align (compiler, 4); - orc_x86_emit_label (compiler, LABEL_INNER_LOOP_START); - ui_max = 1<<compiler->unroll_shift; - for(ui=0;ui<ui_max;ui++) { - compiler->offset = ui<<compiler->loop_shift; - orc_sse_emit_loop (compiler, compiler->offset, - (ui==ui_max-1) << (compiler->loop_shift + compiler->unroll_shift)); - } - compiler->offset = 0; - if (compiler->loop_counter != ORC_REG_INVALID) { - orc_x86_emit_add_imm_reg (compiler, 4, -1, compiler->loop_counter, TRUE); - } else { - orc_x86_emit_dec_memoffset (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), - compiler->exec_reg); - } - orc_x86_emit_jne (compiler, LABEL_INNER_LOOP_START); - orc_x86_emit_label (compiler, LABEL_REGION2_SKIP); - - if (emit_region3) { - int save_loop_shift; - int l; - - save_loop_shift = compiler->loop_shift + compiler->unroll_shift; - compiler->vars[align_var].is_aligned = FALSE; - - for(l=save_loop_shift - 1; l >= 0; l--) { - compiler->loop_shift = l; - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - - orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); - orc_x86_emit_je (compiler, LABEL_STEP_DOWN(compiler->loop_shift)); - orc_sse_emit_loop (compiler, 0, 1<<compiler->loop_shift); - orc_x86_emit_label (compiler, LABEL_STEP_DOWN(compiler->loop_shift)); - } - - compiler->loop_shift = save_loop_shift; - } - } - - if (compiler->program->is_2d && compiler->program->constant_m != 1) { - sse_add_strides (compiler); - - orc_x86_emit_add_imm_memoffset (compiler, 4, -1, - (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]), - compiler->exec_reg); - orc_x86_emit_jne (compiler, LABEL_OUTER_LOOP); - orc_x86_emit_label (compiler, LABEL_OUTER_LOOP_SKIP); - } - - sse_save_accumulators (compiler); - -#ifndef MMX - if (set_mxcsr) { - orc_sse_restore_mxcsr (compiler); - } -#else - orc_x86_emit_emms (compiler); -#endif - - orc_compiler_sse_restore_registers (compiler); - - orc_x86_emit_epilogue (compiler); - - orc_x86_calculate_offsets (compiler); - orc_x86_output_insns (compiler); - - orc_x86_do_fixups (compiler); + orc_sse_restore_mxcsr (c); } -static void -orc_sse_emit_loop (OrcCompiler *compiler, int offset, int update) +void +orc_sse_init (void) { - int j; - int k; - OrcInstruction *insn; - OrcStaticOpcode *opcode; - OrcRule *rule; - - for(j=0;j<compiler->n_insns;j++){ - insn = compiler->insns + j; - opcode = insn->opcode; - - compiler->insn_index = j; - - if (insn->flags & ORC_INSN_FLAG_INVARIANT) continue; - - ORC_ASM_CODE(compiler,"# %d: %s\n", j, insn->opcode->name); - - compiler->min_temp_reg = ORC_VEC_REG_BASE; - - compiler->insn_shift = compiler->loop_shift; - if (insn->flags & ORC_INSTRUCTION_FLAG_X2) { - compiler->insn_shift += 1; - } - if (insn->flags & ORC_INSTRUCTION_FLAG_X4) { - compiler->insn_shift += 2; - } - - rule = insn->rule; - if (rule && rule->emit) { - rule->emit (compiler, rule->emit_user, insn); - } else { - orc_compiler_error (compiler, "no code generation rule for %s", - opcode->name); - } - } - - if (update) { - for(k=0;k<ORC_N_COMPILER_VARIABLES;k++){ - OrcVariable *var = compiler->vars + k; - - if (var->name == NULL) continue; - if (var->vartype == ORC_VAR_TYPE_SRC || - var->vartype == ORC_VAR_TYPE_DEST) { - int offset; - if (var->update_type == 0) { - offset = 0; - } else if (var->update_type == 1) { - offset = (var->size * update) >> 1; - } else { - offset = var->size * update; - } + // clang-format off + static OrcX86Target target = { + "sse", + sse_get_default_flags, + sse_get_flag_name, + sse_is_executable, + sse_validate_registers, + sse_saveable_registers, + sse_is_64bit, + sse_use_frame_pointer, + sse_use_long_jumps, + sse_loop_shift, + sse_init_accumulator, + sse_reduce_accumulator, + orc_sse_load_constant, + sse_load_constant_long, + sse_move_register_to_memoffset, + sse_move_memoffset_to_register, + sse_get_shift, + sse_set_mxcsr, + sse_restore_mxcsr, + 16, + X86_XMM0, + 16, + 13, + }; + // clang-format on + OrcTarget *t; - if (offset != 0) { - if (compiler->vars[k].ptr_register) { - orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4, - offset, - compiler->vars[k].ptr_register, FALSE); - } else { - orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4, - offset, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]), - compiler->exec_reg); - } - } - } - } - } + t = orc_x86_register_target (&target); + orc_compiler_sse_register_rules (t); } |