diff options
author | Jorge Zapata <jorgeluis.zapata@gmail.com> | 2024-01-21 19:52:30 +0100 |
---|---|---|
committer | Jorge Zapata <jorgeluis.zapata@gmail.com> | 2024-03-12 10:03:58 +0100 |
commit | 45c2be4f3a964148f2fa25ded3c37b4dedef08d8 (patch) | |
tree | db4271709825ded525486b7c51edfec64292660b | |
parent | 59f2bb1554798ca36e4973e28cc1b1d65aff1b31 (diff) |
Initial migration of MMX target
Part-of: <https://gitlab.freedesktop.org/gstreamer/orc/-/merge_requests/148>
-rw-r--r-- | orc/orcprogram-mmx.c | 1022 |
1 files changed, 148 insertions, 874 deletions
diff --git a/orc/orcprogram-mmx.c b/orc/orcprogram-mmx.c index a456e93..0893996 100644 --- a/orc/orcprogram-mmx.c +++ b/orc/orcprogram-mmx.c @@ -1,4 +1,3 @@ - #include "config.h" #include <stdio.h> @@ -14,79 +13,16 @@ #include <orc/orcdebug.h> #include <orc/orcinternal.h> -#define MMX 1 -#ifdef MMX -# define ORC_REG_SIZE 8 -#else -# define ORC_REG_SIZE 16 -#endif -#define SIZE 65536 - -#define ORC_MMX_ALIGNED_DEST_CUTOFF 64 - -static void orc_mmx_emit_loop (OrcCompiler *compiler, int offset, int update); - -void orc_compiler_mmx_register_rules (OrcTarget *target); -static void orc_compiler_mmx_init (OrcCompiler *compiler); -static unsigned int orc_compiler_mmx_get_default_flags (void); -static void orc_compiler_mmx_assemble (OrcCompiler *compiler); - -void mmx_load_constant (OrcCompiler *compiler, int reg, int size, int value); -void mmx_load_constant_long (OrcCompiler *compiler, int reg, - OrcConstant *constant); -static const char * mmx_get_flag_name (int shift); - -static OrcTarget mmx_target = { - "mmx", -#if defined(HAVE_I386) || defined(HAVE_AMD64) - TRUE, -#else - FALSE, -#endif - ORC_VEC_REG_BASE, - orc_compiler_mmx_get_default_flags, - orc_compiler_mmx_init, - orc_compiler_mmx_assemble, - { { 0 } }, - 0, - NULL, - mmx_load_constant, - mmx_get_flag_name, - NULL, - mmx_load_constant_long -}; - +#define ORC_REG_SIZE 8 extern int orc_x86_mmx_flags; -extern int orc_x86_mmx_flags; - -void -orc_mmx_init (void) -{ -#if defined(HAVE_AMD64) || defined(HAVE_I386) - /* initializes cache information */ - orc_mmx_get_cpu_flags (); -#endif -#if defined(HAVE_I386) -#ifndef MMX - if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMXEXT)) { - mmx_target.executable = FALSE; - } -#else - if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMX)) { - mmx_target.executable = FALSE; - } -#endif -#endif - - orc_target_register (&mmx_target); - - orc_compiler_mmx_register_rules (&mmx_target); -} +/* TODO To be placed in a common header for private stuff */ +void orc_compiler_mmx_register_rules (OrcTarget *target); +/* X86 specific */ static unsigned int -orc_compiler_mmx_get_default_flags (void) +mmx_get_default_flags (void) { unsigned int flags = 0; @@ -98,21 +34,11 @@ orc_compiler_mmx_get_default_flags (void) } #if defined(HAVE_AMD64) || defined(HAVE_I386) -#ifndef MMX flags |= orc_x86_mmx_flags; #else - flags |= orc_x86_mmx_flags; -#endif -#else -#ifndef MMX - flags |= ORC_TARGET_MMX_MMXEXT; - flags |= ORC_TARGET_MMX_SSE3; - flags |= ORC_TARGET_MMX_SSSE3; -#else flags |= ORC_TARGET_MMX_MMX; flags |= ORC_TARGET_MMX_3DNOW; #endif -#endif return flags; } @@ -121,13 +47,8 @@ static const char * mmx_get_flag_name (int shift) { static const char *flags[] = { -#ifndef MMX - "sse2", "sse3", "ssse3", "sse41", "sse42", "sse4a", "sse5", + "mmx", "mmxext", "3dnow", "3dnowext", "smmx3", "mmx41", "", "frame_pointer", "short_jumps", "64bit" -#else - "mmx", "mmxext", "3dnow", "3dnowext", "ssse3", "sse41", "", - "frame_pointer", "short_jumps", "64bit" -#endif }; if (shift >= 0 && shift < sizeof(flags)/sizeof(flags[0])) { @@ -137,210 +58,140 @@ mmx_get_flag_name (int shift) return NULL; } -static void -orc_compiler_mmx_init (OrcCompiler *compiler) +static int +mmx_is_executable (void) { - int i; +#if defined(HAVE_AMD64) || defined(HAVE_I386) + /* initializes cache information */ + const int flags = orc_mmx_get_cpu_flags (); - if (compiler->target_flags & ORC_TARGET_MMX_64BIT) { - compiler->is_64bit = TRUE; - } - if (compiler->target_flags & ORC_TARGET_MMX_FRAME_POINTER) { - compiler->use_frame_pointer = TRUE; - } - if (!(compiler->target_flags & ORC_TARGET_MMX_SHORT_JUMPS)) { - compiler->long_jumps = TRUE; + if (orc_x86_mmx_flags & ORC_TARGET_MMX_MMX) { + return TRUE; } - +#endif + return FALSE; +} - if (compiler->is_64bit) { - for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){ - compiler->valid_regs[i] = 1; +static void +mmx_validate_registers (int *regs, int is_64bit) +{ + int i; + + if (is_64bit) { + for (i = X86_MM0; i < X86_MM0 + ORC_REG_SIZE; i++) { + regs[i] = 1; } - compiler->valid_regs[X86_ESP] = 0; - for(i=X86_MM0;i<X86_MM0+ORC_REG_SIZE;i++){ - compiler->valid_regs[i] = 1; + } else { + for (i = X86_MM0; i < X86_MM0 + ORC_REG_SIZE; i++) { + regs[i] = 1; } - compiler->save_regs[X86_EBX] = 1; - compiler->save_regs[X86_EBP] = 1; - compiler->save_regs[X86_R12] = 1; - compiler->save_regs[X86_R13] = 1; - compiler->save_regs[X86_R14] = 1; - compiler->save_regs[X86_R15] = 1; + } +} + +static void +mmx_saveable_registers (int *regs, int is_64bit) +{ #ifdef HAVE_OS_WIN32 - compiler->save_regs[X86_EDI] = 1; - compiler->save_regs[X86_ESI] = 1; - for(i=X86_MM0+6;i<X86_MM0+ORC_REG_SIZE;i++){ - compiler->save_regs[i] = 1; + if (is_64bit) { + int i; + for(i = X86_MM0 + 6; i < X86_MM0 + ORC_REG_SIZE; i++){ + regs[i] = 1; } + } #endif +} + +static int +mmx_is_64bit (int flags) +{ + if (flags & ORC_TARGET_SSE_64BIT) { + return TRUE; } else { - for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+8;i++){ - compiler->valid_regs[i] = 1; - } - compiler->valid_regs[X86_ESP] = 0; - if (compiler->use_frame_pointer) { - compiler->valid_regs[X86_EBP] = 0; - } - for(i=X86_MM0;i<X86_MM0+8;i++){ - compiler->valid_regs[i] = 1; - } - compiler->save_regs[X86_EBX] = 1; - compiler->save_regs[X86_EDI] = 1; - compiler->save_regs[X86_EBP] = 1; + return FALSE; } - for(i=0;i<128;i++){ - compiler->alloc_regs[i] = 0; - compiler->used_regs[i] = 0; +} + +static int +mmx_use_frame_pointer (int flags) +{ + if (flags & ORC_TARGET_SSE_FRAME_POINTER) { + return TRUE; + } else { + return FALSE; } +} - if (compiler->is_64bit) { -#ifdef HAVE_OS_WIN32 - compiler->exec_reg = X86_ECX; - compiler->gp_tmpreg = X86_EDX; -#else - compiler->exec_reg = X86_EDI; - compiler->gp_tmpreg = X86_ECX; -#endif +static int +mmx_use_long_jumps (int flags) +{ + if (!(flags & ORC_TARGET_SSE_SHORT_JUMPS)) { + return TRUE; } else { - compiler->gp_tmpreg = X86_ECX; - if (compiler->use_frame_pointer) { - compiler->exec_reg = X86_EBX; - } else { - compiler->exec_reg = X86_EBP; - } + return FALSE; } - compiler->valid_regs[compiler->gp_tmpreg] = 0; - compiler->valid_regs[compiler->exec_reg] = 0; +} - switch (compiler->max_var_size) { +static int +mmx_loop_shift (int max_var_size) +{ + switch (max_var_size) { case 1: - compiler->loop_shift = 4; - break; + return 3; case 2: - compiler->loop_shift = 3; - break; + return 2; case 4: - compiler->loop_shift = 2; - break; + return 1; case 8: - compiler->loop_shift = 1; - break; + return 0; default: - ORC_ERROR("unhandled max var size %d", compiler->max_var_size); + ORC_ERROR ("unhandled max var size %d", max_var_size); break; } -#ifdef MMX - compiler->loop_shift--; -#endif - /* This limit is arbitrary, but some large functions run slightly - slower when unrolled (ginger Core2 6,15,6), and only some small - functions run faster when unrolled. Most are the same speed. */ - if (compiler->n_insns <= 10) { - compiler->unroll_shift = 1; - } - if (!compiler->long_jumps) { - compiler->unroll_shift = 0; - } - if (compiler->loop_shift == 0) { - /* FIXME something is broken with loop_shift=0, unroll_shift=1 */ - compiler->unroll_shift = 0; - } - compiler->alloc_loop_counter = TRUE; - compiler->allow_gp_on_stack = TRUE; - - { - for(i=0;i<compiler->n_insns;i++){ - OrcInstruction *insn = compiler->insns + i; - OrcStaticOpcode *opcode = insn->opcode; - - if (strcmp (opcode->name, "ldreslinb") == 0 || - strcmp (opcode->name, "ldreslinl") == 0 || - strcmp (opcode->name, "ldresnearb") == 0 || - strcmp (opcode->name, "ldresnearl") == 0) { - compiler->vars[insn->src_args[0]].need_offset_reg = TRUE; - } - } - } + return -1; } -void -mmx_save_accumulators (OrcCompiler *compiler) +static void +mmx_init_accumulator (OrcCompiler *compiler, OrcVariable *var) { - int i; - int src; - int tmp; - - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - OrcVariable *var = compiler->vars + i; - - if (var->name == NULL) continue; - switch (var->vartype) { - case ORC_VAR_TYPE_ACCUMULATOR: - src = var->alloc; - tmp = orc_compiler_get_temp_reg (compiler); - -#ifndef MMX - orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp); -#else - orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp); -#endif - - if (var->size == 2) { - orc_mmx_emit_paddw (compiler, tmp, src); - } else { - orc_mmx_emit_paddd (compiler, tmp, src); - } - -#ifndef MMX - orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp); - - if (var->size == 2) { - orc_mmx_emit_paddw (compiler, tmp, src); - } else { - orc_mmx_emit_paddd (compiler, tmp, src); - } -#endif - - if (var->size == 2) { -#ifndef MMX - orc_mmx_emit_pshuflw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp); -#else - orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp); -#endif + orc_mmx_emit_pxor (compiler, var->alloc, var->alloc); +} - orc_mmx_emit_paddw (compiler, tmp, src); - } +static void +mmx_reduce_accumulator (OrcCompiler *compiler, int i, OrcVariable *var) +{ + const int src = var->alloc; + const int tmp = orc_compiler_get_temp_reg (compiler); - if (var->size == 2) { - orc_mmx_emit_movd_store_register (compiler, src, compiler->gp_tmpreg); - orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, compiler->gp_tmpreg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]), - compiler->exec_reg); - } else { - orc_x86_emit_mov_mmx_memoffset (compiler, 4, src, - (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]), - compiler->exec_reg, - var->is_aligned, var->is_uncached); - } + orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp); - break; - default: - break; - } + if (var->size == 2) { + orc_mmx_emit_paddw (compiler, tmp, src); + } else { + orc_mmx_emit_paddd (compiler, tmp, src); } -} -void -mmx_load_constant (OrcCompiler *compiler, int reg, int size, int value) -{ - orc_mmx_load_constant (compiler, reg, size, value); + if (var->size == 2) { + orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp); + orc_mmx_emit_paddw (compiler, tmp, src); + } + if (var->size == 2) { + orc_mmx_emit_movd_store_register (compiler, src, compiler->gp_tmpreg); + orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, compiler->gp_tmpreg); + orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, + (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]), + compiler->exec_reg); + } else { + orc_x86_emit_mov_mmx_memoffset (compiler, 4, src, + (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]), + compiler->exec_reg, + var->is_aligned, var->is_uncached); + } } void -orc_mmx_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 value) +orc_mmx_load_constant (OrcCompiler *compiler, int reg, int size, + orc_uint64 value) { int i; @@ -360,9 +211,6 @@ orc_mmx_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 valu orc_x86_emit_mov_memoffset_mmx (compiler, 8, offset, compiler->exec_reg, reg, FALSE); -#ifndef MMX - orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg); -#endif return; } @@ -426,11 +274,7 @@ orc_mmx_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 valu orc_x86_emit_mov_imm_reg (compiler, 4, value, compiler->gp_tmpreg); orc_mmx_emit_movd_load_register (compiler, compiler->gp_tmpreg, reg); -#ifndef MMX - orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(0,0,0,0), reg, reg); -#else orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg); -#endif } void @@ -454,173 +298,22 @@ mmx_load_constant_long (OrcCompiler *compiler, int reg, } orc_x86_emit_mov_memoffset_mmx (compiler, ORC_REG_SIZE, offset, compiler->exec_reg, reg, FALSE); - -} - -void -mmx_load_constants_outer (OrcCompiler *compiler) -{ - int i; - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - if (compiler->vars[i].name == NULL) continue; - switch (compiler->vars[i].vartype) { - case ORC_VAR_TYPE_CONST: - break; - case ORC_VAR_TYPE_PARAM: - break; - case ORC_VAR_TYPE_SRC: - case ORC_VAR_TYPE_DEST: - break; - case ORC_VAR_TYPE_ACCUMULATOR: - orc_mmx_emit_pxor (compiler, - compiler->vars[i].alloc, compiler->vars[i].alloc); - break; - case ORC_VAR_TYPE_TEMP: - break; - default: - orc_compiler_error(compiler,"bad vartype"); - break; - } - } - - orc_compiler_emit_invariants (compiler); - - /* FIXME move to a better place */ - for(i=0;i<compiler->n_constants;i++){ - compiler->constants[i].alloc_reg = - orc_compiler_get_constant_reg (compiler); - } - - for(i=0;i<compiler->n_constants;i++){ - if (compiler->constants[i].alloc_reg) { - if (compiler->constants[i].is_long) { - mmx_load_constant_long (compiler, compiler->constants[i].alloc_reg, - compiler->constants + i); - } else { - mmx_load_constant (compiler, compiler->constants[i].alloc_reg, - 4, compiler->constants[i].value); - } - } - } - - { - for(i=0;i<compiler->n_insns;i++){ - OrcInstruction *insn = compiler->insns + i; - OrcStaticOpcode *opcode = insn->opcode; - - if (strcmp (opcode->name, "ldreslinb") == 0 || - strcmp (opcode->name, "ldreslinl") == 0 || - strcmp (opcode->name, "ldresnearb") == 0 || - strcmp (opcode->name, "ldresnearl") == 0) { - if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) { - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]), - compiler->exec_reg, - compiler->vars[insn->src_args[0]].ptr_offset); - } else { - orc_x86_emit_mov_imm_reg (compiler, 4, - compiler->vars[insn->src_args[1]].value.i, - compiler->vars[insn->src_args[0]].ptr_offset); - } - } - } - } -} - -void -mmx_load_constants_inner (OrcCompiler *compiler) -{ - int i; - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - if (compiler->vars[i].name == NULL) continue; - switch (compiler->vars[i].vartype) { - case ORC_VAR_TYPE_CONST: - break; - case ORC_VAR_TYPE_PARAM: - break; - case ORC_VAR_TYPE_SRC: - case ORC_VAR_TYPE_DEST: - if (compiler->vars[i].ptr_register) { - orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, - compiler->vars[i].ptr_register); - } - break; - case ORC_VAR_TYPE_ACCUMULATOR: - break; - case ORC_VAR_TYPE_TEMP: - break; - default: - orc_compiler_error(compiler,"bad vartype"); - break; - } - } } -void -mmx_add_strides (OrcCompiler *compiler) +static void +mmx_move_register_to_memoffset (OrcCompiler *compiler, int size, int reg1, int offset, int reg2, int aligned, int uncached) { - int i; - - for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){ - if (compiler->vars[i].name == NULL) continue; - switch (compiler->vars[i].vartype) { - case ORC_VAR_TYPE_CONST: - break; - case ORC_VAR_TYPE_PARAM: - break; - case ORC_VAR_TYPE_SRC: - case ORC_VAR_TYPE_DEST: - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[i]), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_add_reg_memoffset (compiler, compiler->is_64bit ? 8 : 4, - compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg); - - if (compiler->vars[i].ptr_register == 0) { - orc_compiler_error (compiler, "unimplemented: stride on pointer stored in memory"); - } - break; - case ORC_VAR_TYPE_ACCUMULATOR: - break; - case ORC_VAR_TYPE_TEMP: - break; - default: - orc_compiler_error(compiler,"bad vartype"); - break; - } - } + orc_x86_emit_mov_mmx_memoffset (compiler, size, reg1, offset, reg2, aligned, uncached); } -static int -get_align_var (OrcCompiler *compiler) +static void +mmx_move_memoffset_to_register (OrcCompiler *compiler, int size, int offset, int reg1, int reg2, int is_aligned) { - int i; - for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){ - if (compiler->vars[i].size == 0) continue; - if ((compiler->vars[i].size << compiler->loop_shift) >= 16) { - return i; - } - } - for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){ - if (compiler->vars[i].size == 0) continue; - if ((compiler->vars[i].size << compiler->loop_shift) >= 8) { - return i; - } - } - for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){ - if (compiler->vars[i].size == 0) continue; - return i; - } - - orc_compiler_error(compiler, "could not find alignment variable"); - - return -1; + orc_x86_emit_mov_memoffset_mmx (compiler, size, reg1, offset, reg2, is_aligned); } static int -get_shift (int size) +mmx_get_shift (int size) { switch (size) { case 1: @@ -632,469 +325,50 @@ get_shift (int size) case 8: return 3; default: - ORC_ERROR("bad size %d", size); + ORC_ERROR ("bad size %d", size); } return -1; } - static void -orc_emit_split_3_regions (OrcCompiler *compiler) +mmx_restore_mxcsr(OrcCompiler *c) { - int align_var; - int align_shift; - int var_size_shift; - - align_var = get_align_var (compiler); - if (align_var < 0) - return; - var_size_shift = get_shift (compiler->vars[align_var].size); - align_shift = var_size_shift + compiler->loop_shift; - - /* determine how many iterations until align array is aligned (n1) */ - orc_x86_emit_mov_imm_reg (compiler, 4, 16, X86_EAX); - orc_x86_emit_sub_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[align_var]), - compiler->exec_reg, X86_EAX); - orc_x86_emit_and_imm_reg (compiler, 4, (1<<align_shift) - 1, X86_EAX); - orc_x86_emit_sar_imm_reg (compiler, 4, var_size_shift, X86_EAX); - - /* check if n1 is greater than n. */ - orc_x86_emit_cmp_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg); - - orc_x86_emit_jle (compiler, 6); - - /* If so, we have a standard 3-region split. */ - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg); - - /* Calculate n2 */ - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_sub_reg_reg (compiler, 4, X86_EAX, compiler->gp_tmpreg); - - orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX); - - orc_x86_emit_sar_imm_reg (compiler, 4, - compiler->loop_shift + compiler->unroll_shift, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - - /* Calculate n3 */ - orc_x86_emit_and_imm_reg (compiler, 4, - (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); - - orc_x86_emit_jmp (compiler, 7); - - /* else, iterations are all unaligned: n1=n, n2=0, n3=0 */ - orc_x86_emit_label (compiler, 6); - - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg); - orc_x86_emit_mov_imm_reg (compiler, 4, 0, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); - - orc_x86_emit_label (compiler, 7); + /* FIXME */ + orc_x86_emit_emms (c); } -static void -orc_emit_split_2_regions (OrcCompiler *compiler) -{ - int align_var; - int align_shift ORC_GNUC_UNUSED; - int var_size_shift; - - align_var = get_align_var (compiler); - if (align_var < 0) - return; - var_size_shift = get_shift (compiler->vars[align_var].size); - align_shift = var_size_shift + compiler->loop_shift; - - /* Calculate n2 */ - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX); - orc_x86_emit_sar_imm_reg (compiler, 4, - compiler->loop_shift + compiler->unroll_shift, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - - /* Calculate n3 */ - orc_x86_emit_and_imm_reg (compiler, 4, - (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); -} - -#define LABEL_REGION1_SKIP 1 -#define LABEL_INNER_LOOP_START 2 -#define LABEL_REGION2_SKIP 3 -#define LABEL_OUTER_LOOP 4 -#define LABEL_OUTER_LOOP_SKIP 5 -#define LABEL_STEP_DOWN(x) (8+(x)) -#define LABEL_STEP_UP(x) (13+(x)) - -static void -orc_compiler_mmx_save_registers (OrcCompiler *compiler) -{ - int i; - int saved = 0; - for (i = 0; i < ORC_REG_SIZE; ++i) { - if (compiler->save_regs[X86_MM0 + i] == 1) { - ++saved; - } - } - if (saved > 0) { - orc_x86_emit_mov_imm_reg (compiler, 4, ORC_REG_SIZE * saved, compiler->gp_tmpreg); - orc_x86_emit_sub_reg_reg (compiler, compiler->is_64bit ? 8 : 4, - compiler->gp_tmpreg, X86_ESP); - saved = 0; - for (i = 0; i < ORC_REG_SIZE; ++i) { - if (compiler->save_regs[X86_MM0 + i] == 1) { - orc_x86_emit_mov_mmx_memoffset (compiler, ORC_REG_SIZE, X86_MM0 + i, - saved * ORC_REG_SIZE, X86_ESP, FALSE, FALSE); - ++saved; - } - } - } -} - -static void -orc_compiler_mmx_restore_registers (OrcCompiler *compiler) -{ - int i; - int saved = 0; - for (i = 0; i < ORC_REG_SIZE; ++i) { - if (compiler->save_regs[X86_MM0 + i] == 1) { - orc_x86_emit_mov_memoffset_mmx (compiler, ORC_REG_SIZE, saved * ORC_REG_SIZE, X86_ESP, - X86_MM0 + i, FALSE); - ++saved; - } - } - if (saved > 0) { - orc_x86_emit_mov_imm_reg (compiler, 4, ORC_REG_SIZE * saved, compiler->gp_tmpreg); - orc_x86_emit_add_reg_reg (compiler, compiler->is_64bit ? 8 : 4, - compiler->gp_tmpreg, X86_ESP); - } -} - -static void -orc_compiler_mmx_assemble (OrcCompiler *compiler) -{ -#ifndef MMX - int set_mxcsr = FALSE; -#endif - int align_var; - int is_aligned; - - if (0 && orc_x86_assemble_copy_check (compiler)) { - /* The rep movs implementation isn't faster most of the time */ - orc_x86_assemble_copy (compiler); - return; - } - - align_var = get_align_var (compiler); - if (align_var < 0) { - orc_x86_assemble_copy (compiler); - return; - } - is_aligned = compiler->vars[align_var].is_aligned; - - { - orc_mmx_emit_loop (compiler, 0, 0); - - compiler->codeptr = compiler->code; - free (compiler->asm_code); - compiler->asm_code = NULL; - compiler->asm_code_len = 0; - memset (compiler->labels, 0, sizeof (compiler->labels)); - memset (compiler->labels_int, 0, sizeof (compiler->labels_int)); - compiler->n_fixups = 0; - compiler->n_output_insns = 0; - } - - if (compiler->error) return; - - orc_x86_emit_prologue (compiler); - - orc_compiler_mmx_save_registers (compiler); - -#ifndef MMX - if (orc_program_has_float (compiler)) { - set_mxcsr = TRUE; - orc_mmx_set_mxcsr (compiler); - } -#endif - - mmx_load_constants_outer (compiler); - - if (compiler->program->is_2d) { - if (compiler->program->constant_m > 0) { - orc_x86_emit_mov_imm_reg (compiler, 4, compiler->program->constant_m, - X86_EAX); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]), - compiler->exec_reg); - } else { - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]), - compiler->exec_reg, X86_EAX); - orc_x86_emit_test_reg_reg (compiler, 4, X86_EAX, X86_EAX); - orc_x86_emit_jle (compiler, LABEL_OUTER_LOOP_SKIP); - orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]), - compiler->exec_reg); - } - - orc_x86_emit_label (compiler, LABEL_OUTER_LOOP); - } - - if (compiler->program->constant_n > 0 && - compiler->program->constant_n <= ORC_MMX_ALIGNED_DEST_CUTOFF) { - /* don't need to load n */ - } else if (compiler->loop_shift > 0) { - if (compiler->has_iterator_opcode || is_aligned) { - orc_emit_split_2_regions (compiler); - } else { - /* split n into three regions, with center region being aligned */ - orc_emit_split_3_regions (compiler); - } - } else { - /* loop shift is 0, no need to split */ - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, - compiler->gp_tmpreg); - orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - } - - mmx_load_constants_inner (compiler); - - if (compiler->program->constant_n > 0 && - compiler->program->constant_n <= ORC_MMX_ALIGNED_DEST_CUTOFF) { - int n_left = compiler->program->constant_n; - int save_loop_shift; - int loop_shift; - - compiler->offset = 0; - - save_loop_shift = compiler->loop_shift; - while (n_left >= (1<<compiler->loop_shift)) { - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - orc_mmx_emit_loop (compiler, compiler->offset, 0); - - n_left -= 1<<compiler->loop_shift; - compiler->offset += 1<<compiler->loop_shift; - } - for(loop_shift = compiler->loop_shift-1; loop_shift>=0; loop_shift--) { - if (n_left >= (1<<loop_shift)) { - compiler->loop_shift = loop_shift; - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", loop_shift); - orc_mmx_emit_loop (compiler, compiler->offset, 0); - n_left -= 1<<loop_shift; - compiler->offset += 1<<loop_shift; - } - } - compiler->loop_shift = save_loop_shift; - - } else { - int ui, ui_max; - int emit_region1 = TRUE; - int emit_region3 = TRUE; - - if (compiler->has_iterator_opcode || is_aligned) { - emit_region1 = FALSE; - } - if (compiler->loop_shift == 0) { - emit_region1 = FALSE; - emit_region3 = FALSE; - } - - if (emit_region1) { - int save_loop_shift; - int l; - - save_loop_shift = compiler->loop_shift; - compiler->vars[align_var].is_aligned = FALSE; - - for (l=0;l<save_loop_shift;l++){ - compiler->loop_shift = l; - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - - orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg); - orc_x86_emit_je (compiler, LABEL_STEP_UP(compiler->loop_shift)); - orc_mmx_emit_loop (compiler, 0, 1<<compiler->loop_shift); - orc_x86_emit_label (compiler, LABEL_STEP_UP(compiler->loop_shift)); - } - - compiler->loop_shift = save_loop_shift; - compiler->vars[align_var].is_aligned = TRUE; - } - - orc_x86_emit_label (compiler, LABEL_REGION1_SKIP); - - orc_x86_emit_cmp_imm_memoffset (compiler, 4, 0, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg); - orc_x86_emit_je (compiler, LABEL_REGION2_SKIP); - - if (compiler->loop_counter != ORC_REG_INVALID) { - orc_x86_emit_mov_memoffset_reg (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor, counter2), compiler->exec_reg, - compiler->loop_counter); - } - - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - orc_x86_emit_align (compiler, 4); - orc_x86_emit_label (compiler, LABEL_INNER_LOOP_START); - ui_max = 1<<compiler->unroll_shift; - for(ui=0;ui<ui_max;ui++) { - compiler->offset = ui<<compiler->loop_shift; - orc_mmx_emit_loop (compiler, compiler->offset, - (ui==ui_max-1) << (compiler->loop_shift + compiler->unroll_shift)); - } - compiler->offset = 0; - if (compiler->loop_counter != ORC_REG_INVALID) { - orc_x86_emit_add_imm_reg (compiler, 4, -1, compiler->loop_counter, TRUE); - } else { - orc_x86_emit_dec_memoffset (compiler, 4, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), - compiler->exec_reg); - } - orc_x86_emit_jne (compiler, LABEL_INNER_LOOP_START); - orc_x86_emit_label (compiler, LABEL_REGION2_SKIP); - - if (emit_region3) { - int save_loop_shift; - int l; - - save_loop_shift = compiler->loop_shift + compiler->unroll_shift; - compiler->vars[align_var].is_aligned = FALSE; - - for(l=save_loop_shift - 1; l >= 0; l--) { - compiler->loop_shift = l; - ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift); - - orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg); - orc_x86_emit_je (compiler, LABEL_STEP_DOWN(compiler->loop_shift)); - orc_mmx_emit_loop (compiler, 0, 1<<compiler->loop_shift); - orc_x86_emit_label (compiler, LABEL_STEP_DOWN(compiler->loop_shift)); - } - - compiler->loop_shift = save_loop_shift; - } - } - - if (compiler->program->is_2d && compiler->program->constant_m != 1) { - mmx_add_strides (compiler); - - orc_x86_emit_add_imm_memoffset (compiler, 4, -1, - (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]), - compiler->exec_reg); - orc_x86_emit_jne (compiler, LABEL_OUTER_LOOP); - orc_x86_emit_label (compiler, LABEL_OUTER_LOOP_SKIP); - } - - mmx_save_accumulators (compiler); - -#ifndef MMX - if (set_mxcsr) { - orc_mmx_restore_mxcsr (compiler); - } -#else - orc_x86_emit_emms (compiler); -#endif - - orc_compiler_mmx_restore_registers (compiler); - - orc_x86_emit_epilogue (compiler); - - orc_x86_calculate_offsets (compiler); - orc_x86_output_insns (compiler); - - orc_x86_do_fixups (compiler); -} - -static void -orc_mmx_emit_loop (OrcCompiler *compiler, int offset, int update) +void +orc_mmx_init (void) { - int j; - int k; - OrcInstruction *insn; - OrcStaticOpcode *opcode; - OrcRule *rule; - - for(j=0;j<compiler->n_insns;j++){ - insn = compiler->insns + j; - opcode = insn->opcode; - - compiler->insn_index = j; - - if (insn->flags & ORC_INSN_FLAG_INVARIANT) continue; - - ORC_ASM_CODE(compiler,"# %d: %s\n", j, insn->opcode->name); - - compiler->min_temp_reg = ORC_VEC_REG_BASE; - - compiler->insn_shift = compiler->loop_shift; - if (insn->flags & ORC_INSTRUCTION_FLAG_X2) { - compiler->insn_shift += 1; - } - if (insn->flags & ORC_INSTRUCTION_FLAG_X4) { - compiler->insn_shift += 2; - } - - rule = insn->rule; - if (rule && rule->emit) { - rule->emit (compiler, rule->emit_user, insn); - } else { - orc_compiler_error (compiler, "no code generation rule for %s", - opcode->name); - } - } - - if (update) { - for(k=0;k<ORC_N_COMPILER_VARIABLES;k++){ - OrcVariable *var = compiler->vars + k; - - if (var->name == NULL) continue; - if (var->vartype == ORC_VAR_TYPE_SRC || - var->vartype == ORC_VAR_TYPE_DEST) { - int offset; - if (var->update_type == 0) { - offset = 0; - } else if (var->update_type == 1) { - offset = (var->size * update) >> 1; - } else { - offset = var->size * update; - } + // clang-format off + static OrcX86Target target = { + "mmx", + mmx_get_default_flags, + mmx_get_flag_name, + mmx_is_executable, + mmx_validate_registers, + mmx_saveable_registers, + mmx_is_64bit, + mmx_use_frame_pointer, + mmx_use_long_jumps, + mmx_loop_shift, + mmx_init_accumulator, + mmx_reduce_accumulator, + orc_mmx_load_constant, + mmx_load_constant_long, + mmx_move_register_to_memoffset, + mmx_move_memoffset_to_register, + mmx_get_shift, + NULL, + NULL, + 8, + X86_MM0, + ORC_REG_SIZE, + 13, + }; + // clang-format on + OrcTarget *t; - if (offset != 0) { - if (compiler->vars[k].ptr_register) { - orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4, - offset, - compiler->vars[k].ptr_register, FALSE); - } else { - orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4, - offset, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]), - compiler->exec_reg); - } - } - } - } - } + t = orc_x86_register_target (&target); + orc_compiler_mmx_register_rules (t); } |