From 02083f367e50bcb2025a280b9046acd404777fed Mon Sep 17 00:00:00 2001 From: David Schleef Date: Wed, 8 Sep 2010 13:11:34 -0700 Subject: mmx: Update from sse --- orc/orcmmx.c | 29 +++++ orc/orcmmx.h | 2 + orc/orcprogram-mmx.c | 149 +++++++++++++++++++---- orc/orcrules-mmx.c | 327 +++++++++++++++++++++++++++------------------------ orc/orcrules-sse.c | 6 + 5 files changed, 338 insertions(+), 175 deletions(-) diff --git a/orc/orcmmx.c b/orc/orcmmx.c index 85ef53d..a82678f 100644 --- a/orc/orcmmx.c +++ b/orc/orcmmx.c @@ -149,6 +149,35 @@ orc_mmx_emit_shiftimm (OrcCompiler *p, const char *insn_name, int code, *p->codeptr++ = shift; } +void +orc_x86_emit_mov_memindex_mmx (OrcCompiler *compiler, int size, int offset, + int reg1, int regindex, int shift, int reg2, int is_aligned) +{ + switch (size) { + case 4: + ORC_ASM_CODE(compiler," movd %d(%%%s,%%%s,%d), %%%s\n", offset, + orc_x86_get_regname_ptr(compiler, reg1), + orc_x86_get_regname_ptr(compiler, regindex), 1<codeptr++ = 0x0f; + *compiler->codeptr++ = 0x6e; + break; + case 8: + ORC_ASM_CODE(compiler," movq %d(%%%s,%%%s,%d), %%%s\n", offset, orc_x86_get_regname_ptr(compiler, reg1), + orc_x86_get_regname_ptr(compiler, regindex), 1<codeptr++ = 0x0f; + *compiler->codeptr++ = 0x7e; + break; + default: + ORC_COMPILER_ERROR(compiler, "bad size"); + break; + } + orc_x86_emit_modrm_memindex (compiler, reg2, offset, reg1, regindex, shift); +} + void orc_x86_emit_mov_memoffset_mmx (OrcCompiler *compiler, int size, int offset, int reg1, int reg2, int is_aligned) diff --git a/orc/orcmmx.h b/orc/orcmmx.h index 23eca1c..4606eaf 100644 --- a/orc/orcmmx.h +++ b/orc/orcmmx.h @@ -32,6 +32,8 @@ typedef enum { const char * orc_x86_get_regname_mmx(int i); void orc_x86_emit_mov_memoffset_mmx (OrcCompiler *compiler, int size, int offset, int reg1, int reg2, int is_aligned); +void orc_x86_emit_mov_memindex_mmx (OrcCompiler *compiler, int size, int offset, + int reg1, int regindex, int shift, int reg2, int is_aligned); void orc_x86_emit_mov_mmx_memoffset (OrcCompiler *compiler, int size, int reg1, int offset, int reg2, int aligned, int uncached); void orc_x86_emit_mov_mmx_reg_reg (OrcCompiler *compiler, int reg1, int reg2); diff --git a/orc/orcprogram-mmx.c b/orc/orcprogram-mmx.c index 872f108..15af7cf 100644 --- a/orc/orcprogram-mmx.c +++ b/orc/orcprogram-mmx.c @@ -30,6 +30,8 @@ void orc_mmx_emit_invariants (OrcCompiler *compiler); void orc_compiler_rewrite_vars (OrcCompiler *compiler); void orc_compiler_dump (OrcCompiler *compiler); void mmx_load_constant (OrcCompiler *compiler, int reg, int size, int value); +void mmx_load_constant_long (OrcCompiler *compiler, int reg, + OrcConstant *constant); static const char * mmx_get_flag_name (int shift); static OrcTarget mmx_target = { @@ -48,10 +50,13 @@ static OrcTarget mmx_target = { NULL, mmx_load_constant, mmx_get_flag_name, - NULL + NULL, + mmx_load_constant_long }; +extern int orc_x86_mmx_flags; +extern int orc_x86_mmx_flags; void orc_mmx_init (void) @@ -63,11 +68,11 @@ orc_mmx_init (void) #if defined(HAVE_I386) #ifndef MMX - if (!(orc_mmx_get_cpu_flags () & ORC_TARGET_MMX_MMXEXT)) { + if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMXEXT)) { mmx_target.executable = FALSE; } #else - if (!(orc_mmx_get_cpu_flags () & ORC_TARGET_MMX_MMX)) { + if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMX)) { mmx_target.executable = FALSE; } #endif @@ -91,7 +96,11 @@ orc_compiler_mmx_get_default_flags (void) } #if defined(HAVE_AMD64) || defined(HAVE_I386) - flags |= orc_mmx_get_cpu_flags (); +#ifndef MMX + flags |= orc_x86_mmx_flags; +#else + flags |= orc_x86_mmx_flags; +#endif #else #ifndef MMX flags |= ORC_TARGET_MMX_MMXEXT; @@ -146,7 +155,6 @@ orc_compiler_mmx_init (OrcCompiler *compiler) for(i=ORC_GP_REG_BASE;ivalid_regs[i] = 1; } - compiler->valid_regs[X86_EDI] = 0; compiler->valid_regs[X86_ESP] = 0; #ifndef MMX for(i=X86_MM0;isave_regs[X86_R13] = 1; compiler->save_regs[X86_R14] = 1; compiler->save_regs[X86_R15] = 1; +#ifdef HAVE_OS_WIN32 + compiler->save_regs[X86_EDI] = 1; + compiler->save_regs[X86_ESI] = 1; + for(i=X86_MM0+6;isave_regs[i] = 1; + } +#endif } else { for(i=ORC_GP_REG_BASE;ivalid_regs[i] = 1; @@ -183,18 +198,23 @@ orc_compiler_mmx_init (OrcCompiler *compiler) compiler->used_regs[i] = 0; } - compiler->gp_tmpreg = X86_ECX; - compiler->valid_regs[compiler->gp_tmpreg] = 0; - if (compiler->is_64bit) { +#ifdef HAVE_OS_WIN32 + compiler->exec_reg = X86_ECX; + compiler->gp_tmpreg = X86_EDX; +#else compiler->exec_reg = X86_EDI; + compiler->gp_tmpreg = X86_ECX; +#endif } else { + compiler->gp_tmpreg = X86_ECX; if (compiler->use_frame_pointer) { compiler->exec_reg = X86_EBX; } else { compiler->exec_reg = X86_EBP; } } + compiler->valid_regs[compiler->gp_tmpreg] = 0; compiler->valid_regs[compiler->exec_reg] = 0; switch (orc_program_get_max_var_size (compiler->program)) { @@ -234,6 +254,20 @@ orc_compiler_mmx_init (OrcCompiler *compiler) } compiler->alloc_loop_counter = TRUE; compiler->allow_gp_on_stack = TRUE; + + { + for(i=0;in_insns;i++){ + OrcInstruction *insn = compiler->insns + i; + OrcStaticOpcode *opcode = insn->opcode; + + if (strcmp (opcode->name, "ldreslinb") == 0 || + strcmp (opcode->name, "ldreslinl") == 0 || + strcmp (opcode->name, "ldresnearb") == 0 || + strcmp (opcode->name, "ldresnearl") == 0) { + compiler->vars[insn->src_args[0]].need_offset_reg = TRUE; + } + } + } } void @@ -376,6 +410,30 @@ mmx_load_constant (OrcCompiler *compiler, int reg, int size, int value) #endif } +void +mmx_load_constant_long (OrcCompiler *compiler, int reg, + OrcConstant *constant) +{ + int i; + int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); + + /* FIXME this is slower than it could be */ + + ORC_ASM_CODE(compiler, "# loading constant %08x %08x %08x %08x\n", + constant->full_value[0], constant->full_value[1], + constant->full_value[2], constant->full_value[3]); + + for(i=0;i<4;i++){ + orc_x86_emit_mov_imm_reg (compiler, 4, constant->full_value[i], + compiler->gp_tmpreg); + orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg, + offset + 4*i, compiler->exec_reg); + } + orc_x86_emit_mov_memoffset_mmx (compiler, 16, offset, compiler->exec_reg, + reg, FALSE); + +} + void mmx_load_constants_outer (OrcCompiler *compiler) { @@ -412,8 +470,36 @@ mmx_load_constants_outer (OrcCompiler *compiler) for(i=0;in_constants;i++){ if (compiler->constants[i].alloc_reg) { - mmx_load_constant (compiler, compiler->constants[i].alloc_reg, - 4, compiler->constants[i].value); + if (compiler->constants[i].is_long) { + mmx_load_constant_long (compiler, compiler->constants[i].alloc_reg, + compiler->constants + i); + } else { + mmx_load_constant (compiler, compiler->constants[i].alloc_reg, + 4, compiler->constants[i].value); + } + } + } + + { + for(i=0;in_insns;i++){ + OrcInstruction *insn = compiler->insns + i; + OrcStaticOpcode *opcode = insn->opcode; + + if (strcmp (opcode->name, "ldreslinb") == 0 || + strcmp (opcode->name, "ldreslinl") == 0 || + strcmp (opcode->name, "ldresnearb") == 0 || + strcmp (opcode->name, "ldresnearl") == 0) { + if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) { + orc_x86_emit_mov_memoffset_reg (compiler, 4, + (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]), + compiler->exec_reg, + compiler->vars[insn->src_args[0]].ptr_offset); + } else { + orc_x86_emit_mov_imm_reg (compiler, 4, + compiler->vars[insn->src_args[1]].value.i, + compiler->vars[insn->src_args[0]].ptr_offset); + } + } } } } @@ -464,10 +550,14 @@ mmx_add_strides (OrcCompiler *compiler) case ORC_VAR_TYPE_DEST: orc_x86_emit_mov_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[i]), compiler->exec_reg, - X86_ECX); + compiler->gp_tmpreg); orc_x86_emit_add_reg_memoffset (compiler, compiler->is_64bit ? 8 : 4, - X86_ECX, + compiler->gp_tmpreg, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg); + + if (compiler->vars[i].ptr_register == 0) { + ORC_COMPILER_ERROR(compiler, "unimplemented: stride on mem pointer"); + } break; case ORC_VAR_TYPE_ACCUMULATOR: break; @@ -906,18 +996,31 @@ orc_mmx_emit_loop (OrcCompiler *compiler, int offset, int update) if (update) { for(k=0;kvars[k].name == NULL) continue; - if (compiler->vars[k].vartype == ORC_VAR_TYPE_SRC || - compiler->vars[k].vartype == ORC_VAR_TYPE_DEST) { - if (compiler->vars[k].ptr_register) { - orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4, - compiler->vars[k].size * update, - compiler->vars[k].ptr_register, FALSE); + OrcVariable *var = compiler->vars + k; + + if (var->name == NULL) continue; + if (var->vartype == ORC_VAR_TYPE_SRC || + var->vartype == ORC_VAR_TYPE_DEST) { + int offset; + if (var->update_type == 0) { + offset = 0; + } else if (var->update_type == 1) { + offset = (var->size * update) >> 1; } else { - orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4, - compiler->vars[k].size * update, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]), - compiler->exec_reg); + offset = var->size * update; + } + + if (offset != 0) { + if (compiler->vars[k].ptr_register) { + orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4, + offset, + compiler->vars[k].ptr_register, FALSE); + } else { + orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4, + offset, + (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]), + compiler->exec_reg); + } } } } diff --git a/orc/orcrules-mmx.c b/orc/orcrules-mmx.c index e41908f..a4dcda1 100644 --- a/orc/orcrules-mmx.c +++ b/orc/orcrules-mmx.c @@ -97,6 +97,8 @@ mmx_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn) src->size << compiler->loop_shift); break; } + + src->update_type = 2; } static void @@ -150,6 +152,8 @@ mmx_rule_loadoffX (OrcCompiler *compiler, void *user, OrcInstruction *insn) src->size << compiler->loop_shift); break; } + + src->update_type = 2; } static void @@ -161,7 +165,7 @@ mmx_rule_loadupib (OrcCompiler *compiler, void *user, OrcInstruction *insn) int offset = 0; int tmp = orc_compiler_get_temp_reg (compiler); - offset = compiler->offset * src->size; + offset = (compiler->offset * src->size) >> 1; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, @@ -209,17 +213,7 @@ mmx_rule_loadupib (OrcCompiler *compiler, void *user, OrcInstruction *insn) orc_mmx_emit_pavgb (compiler, dest->alloc, tmp); orc_mmx_emit_punpcklbw (compiler, tmp, dest->alloc); - /* FIXME hack */ - if (src->ptr_register) { - orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4, - -(src->size << compiler->loop_shift)>>1, - src->ptr_register, FALSE); - } else { - orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4, - -(src->size << compiler->loop_shift)>>1, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[insn->src_args[0]]), - compiler->exec_reg); - } + src->update_type = 1; } static void @@ -230,7 +224,7 @@ mmx_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) int ptr_reg; int offset = 0; - offset = compiler->offset * src->size; + offset = (compiler->offset * src->size) >> 1; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, @@ -278,17 +272,8 @@ mmx_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) orc_mmx_emit_punpckldq (compiler, dest->alloc, dest->alloc); break; } - /* FIXME hack */ - if (src->ptr_register) { - orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4, - -(src->size << compiler->loop_shift)>>1, - src->ptr_register, FALSE); - } else { - orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4, - -(src->size << compiler->loop_shift)>>1, - (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[insn->src_args[0]]), - compiler->exec_reg); - } + + src->update_type = 1; } static void @@ -347,6 +332,116 @@ mmx_rule_storeX (OrcCompiler *compiler, void *user, OrcInstruction *insn) ORC_COMPILER_ERROR(compiler,"bad size"); break; } + + dest->update_type = 2; +} + +#if try1 +static void +mmx_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn) +{ + OrcVariable *src = compiler->vars + insn->src_args[0]; + OrcVariable *dest = compiler->vars + insn->dest_args[0]; + int tmp = orc_compiler_get_temp_reg (compiler); + int tmp2 = orc_compiler_get_temp_reg (compiler); + int tmpc; + + orc_x86_emit_mov_mmx_reg (compiler, X86_MM6, compiler->gp_tmpreg); + orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); + + ORC_ASM_CODE(compiler," movdqu 0(%%%s,%%%s,4), %%%s\n", + orc_x86_get_regname_ptr(compiler, src->ptr_register), + orc_x86_get_regname_ptr(compiler, compiler->gp_tmpreg), + orc_x86_get_regname_mmx(dest->alloc)); + *compiler->codeptr++ = 0xf3; + orc_x86_emit_rex(compiler, 0, dest->ptr_register, 0, dest->alloc); + *compiler->codeptr++ = 0x0f; + *compiler->codeptr++ = 0x6f; + orc_x86_emit_modrm_memindex (compiler, dest->alloc, 0, + src->ptr_register, compiler->gp_tmpreg, 2); + +#if 0 + orc_mmx_emit_movq (compiler, X86_MM6, tmp); + orc_mmx_emit_pslld (compiler, 10, tmp); + orc_mmx_emit_psrld (compiler, 26, tmp); + orc_mmx_emit_pslld (compiler, 2, tmp); + + orc_mmx_emit_movq (compiler, tmp, tmp2); + orc_mmx_emit_pslld (compiler, 8, tmp2); + orc_mmx_emit_por (compiler, tmp2, tmp); + orc_mmx_emit_movq (compiler, tmp, tmp2); + orc_mmx_emit_pslld (compiler, 16, tmp2); + orc_mmx_emit_por (compiler, tmp2, tmp); +#else + orc_mmx_emit_movq (compiler, X86_MM6, tmp); + tmpc = orc_compiler_get_constant_long (compiler, 0x02020202, + 0x06060606, 0x0a0a0a0a, 0x0e0e0e0e); + orc_mmx_emit_pshufb (compiler, tmpc, tmp); + orc_mmx_emit_paddb (compiler, tmp, tmp); + orc_mmx_emit_paddb (compiler, tmp, tmp); +#endif + + orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(0,0,0,0), tmp, tmp2); + orc_mmx_emit_psubd (compiler, tmp2, tmp); + tmpc = orc_compiler_get_constant (compiler, 4, 0x03020100); + orc_mmx_emit_paddd (compiler, tmpc, tmp); + + orc_mmx_emit_pshufb (compiler, tmp, dest->alloc); + + orc_mmx_emit_movq (compiler, X86_MM7, tmp); + orc_mmx_emit_pslld (compiler, compiler->loop_shift, tmp); + + orc_mmx_emit_paddd (compiler, tmp, X86_MM6); + + src->update_type = 0; +} +#endif + +static void +mmx_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn) +{ + OrcVariable *src = compiler->vars + insn->src_args[0]; + int increment_var = insn->src_args[2]; + OrcVariable *dest = compiler->vars + insn->dest_args[0]; + int tmp = orc_compiler_get_temp_reg (compiler); + int i; + + for(i=0;i<(1<loop_shift);i++){ + if (i == 0) { + orc_x86_emit_mov_memoffset_mmx (compiler, 4, 0, + src->ptr_register, dest->alloc, FALSE); + } else { + orc_x86_emit_mov_memindex_mmx (compiler, 4, 0, + src->ptr_register, compiler->gp_tmpreg, 2, tmp, FALSE); +#ifdef MMX + //orc_mmx_emit_punpckldq (compiler, tmp, dest->alloc); + orc_mmx_emit_psllq (compiler, 8*4*i, tmp); + orc_mmx_emit_por (compiler, tmp, dest->alloc); +#else + orc_mmx_emit_pslldq (compiler, 4*i, tmp); + orc_mmx_emit_por (compiler, tmp, dest->alloc); +#endif + } + + if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { + orc_x86_emit_add_memoffset_reg (compiler, 4, + (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), + compiler->exec_reg, src->ptr_offset); + } else { + orc_x86_emit_add_imm_reg (compiler, 4, + compiler->vars[increment_var].value.i, + src->ptr_offset, FALSE); + } + + orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); + orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); + } + + orc_x86_emit_add_reg_reg_shift (compiler, 4, compiler->gp_tmpreg, + src->ptr_register, 2); + orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); + + src->update_type = 0; } static void @@ -997,64 +1092,6 @@ mmx_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) orc_mmx_emit_movq (p, a, dest); } - -static void -mmx_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) -{ - /* About 40.7 cycles per array member on ginger. I.e., really slow */ - int i; - int regsize = p->is_64bit ? 8 : 4; - int stackframe; - - stackframe = 32 + 2*regsize; - stackframe = (stackframe + 0xf) & (~0xf); - - orc_x86_emit_add_imm_reg (p, regsize, -stackframe, X86_ESP, FALSE); - orc_x86_emit_mov_mmx_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, - 0, X86_ESP, FALSE, FALSE); - orc_x86_emit_mov_mmx_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, - 16, X86_ESP, FALSE, FALSE); - orc_x86_emit_mov_reg_memoffset (p, 4, X86_EAX, 32, X86_ESP); - orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, 32 + regsize, X86_ESP); - - for(i=0;i<(1<loop_shift);i++) { - int label = p->label_index++; - - orc_x86_emit_mov_memoffset_reg (p, 2, 16 + 2*i, X86_ESP, X86_ECX); - orc_x86_emit_mov_imm_reg (p, 4, 0, X86_EDX); - orc_x86_emit_mov_imm_reg (p, 2, 0x00ff, X86_EAX); - orc_x86_emit_and_imm_reg (p, 2, 0x00ff, X86_ECX); - orc_x86_emit_je (p, label); - orc_x86_emit_mov_memoffset_reg (p, 2, 2*i, X86_ESP, X86_EAX); - - ORC_ASM_CODE(p," div %%cx\n"); - *p->codeptr++ = 0x66; - *p->codeptr++ = 0xf7; - orc_x86_emit_modrm_reg (p, X86_ECX, 6); - - ORC_ASM_CODE(p," testw $0xff00, %%ax\n"); - *p->codeptr++ = 0x66; - *p->codeptr++ = 0xa9; - //*p->codeptr++ = 0xf7; - //orc_x86_emit_modrm_reg (p, X86_EAX, 0); - *p->codeptr++ = 0x00; - *p->codeptr++ = 0xff; - orc_x86_emit_je (p, label); - - orc_x86_emit_mov_imm_reg (p, 2, 0x00ff, X86_EAX); - - orc_x86_emit_label (p, label); - - orc_x86_emit_mov_reg_memoffset (p, 2, X86_EAX, 2*i, X86_ESP); - } - - orc_x86_emit_mov_memoffset_mmx (p, 16, 0, X86_ESP, - p->vars[insn->dest_args[0]].alloc, FALSE); - orc_x86_emit_mov_memoffset_reg (p, 4, 32, X86_ESP, X86_EAX); - orc_x86_emit_mov_memoffset_reg (p, 4, 32 + regsize, X86_ESP, X86_EDX); - - orc_x86_emit_add_imm_reg (p, regsize, stackframe, X86_ESP, FALSE); -} #endif static void @@ -1194,29 +1231,24 @@ static void mmx_rule_mulll_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int i; - int stackframe; - - stackframe = 32; - stackframe = (stackframe + 0xf) & (~0xf); + int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); - orc_x86_emit_add_imm_reg (p, p->is_64bit ? 8 : 4, -stackframe, X86_ESP, - FALSE); orc_x86_emit_mov_mmx_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, - 0, X86_ESP, FALSE, FALSE); + offset, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_mmx_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, - 16, X86_ESP, FALSE, FALSE); + offset + 16, p->exec_reg, FALSE, FALSE); for(i=0;i<(1<loop_shift);i++) { - orc_x86_emit_mov_memoffset_reg (p, 4, 4*i, X86_ESP, X86_ECX); - orc_x86_emit_imul_memoffset_reg (p, 4, 16+4*i, X86_ESP, X86_ECX); - orc_x86_emit_mov_reg_memoffset (p, 4, X86_ECX, 4*i, X86_ESP); + orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, + p->gp_tmpreg); + orc_x86_emit_imul_memoffset_reg (p, 4, offset + 16+4*i, p->exec_reg, + p->gp_tmpreg); + orc_x86_emit_mov_reg_memoffset (p, 4, p->gp_tmpreg, offset + 4*i, + p->exec_reg); } - orc_x86_emit_mov_memoffset_mmx (p, 16, 0, X86_ESP, + orc_x86_emit_mov_memoffset_mmx (p, 16, offset, p->exec_reg, p->vars[insn->dest_args[0]].alloc, FALSE); - - orc_x86_emit_add_imm_reg (p, p->is_64bit ? 8 : 4, stackframe, X86_ESP, - FALSE); } #ifndef MMX @@ -1243,35 +1275,31 @@ mmx_rule_mulhsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int i; int regsize = p->is_64bit ? 8 : 4; - int stackframe; - - stackframe = 32 + 2*regsize; - stackframe = (stackframe + 0xf) & (~0xf); + int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); - orc_x86_emit_add_imm_reg (p, regsize, -stackframe, X86_ESP, FALSE); orc_x86_emit_mov_mmx_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, - 0, X86_ESP, FALSE, FALSE); + offset, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_mmx_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, - 16, X86_ESP, FALSE, FALSE); - orc_x86_emit_mov_reg_memoffset (p, 4, X86_EAX, 32, X86_ESP); - orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, 32 + regsize, X86_ESP); + offset + 16, p->exec_reg, FALSE, FALSE); + orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EAX, offset + 32, + p->exec_reg); + orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EDX, offset + 40, + p->exec_reg); for(i=0;i<(1<loop_shift);i++) { - orc_x86_emit_mov_memoffset_reg (p, 4, 4*i, X86_ESP, X86_EAX); - ORC_ASM_CODE(p," imull %d(%%%s)\n", 16+4*i, - orc_x86_get_regname_ptr(p, X86_ESP)); - orc_x86_emit_rex(p, 4, 0, 0, X86_ESP); + orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, X86_EAX); + ORC_ASM_CODE(p," imull %d(%%%s)\n", offset + 16 + 4*i, + orc_x86_get_regname_ptr(p, p->exec_reg)); + orc_x86_emit_rex(p, 4, 0, 0, p->exec_reg); *p->codeptr++ = 0xf7; - orc_x86_emit_modrm_memoffset (p, 5, 16+4*i, X86_ESP); - orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, 4*i, X86_ESP); + orc_x86_emit_modrm_memoffset (p, 5, offset + 16 + 4*i, p->exec_reg); + orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, offset + 4*i, p->exec_reg); } - orc_x86_emit_mov_memoffset_mmx (p, 16, 0, X86_ESP, + orc_x86_emit_mov_memoffset_mmx (p, 16, offset, p->exec_reg, p->vars[insn->dest_args[0]].alloc, FALSE); - orc_x86_emit_mov_memoffset_reg (p, 4, 32, X86_ESP, X86_EAX); - orc_x86_emit_mov_memoffset_reg (p, 4, 32 + regsize, X86_ESP, X86_EDX); - - orc_x86_emit_add_imm_reg (p, regsize, stackframe, X86_ESP, FALSE); + orc_x86_emit_mov_memoffset_reg (p, 8, offset + 32, p->exec_reg, X86_EAX); + orc_x86_emit_mov_memoffset_reg (p, 8, offset + 40, p->exec_reg, X86_EDX); } #ifndef MMX @@ -1478,45 +1506,38 @@ mmx_rule_swapq (OrcCompiler *p, void *user, OrcInstruction *insn) orc_mmx_emit_por (p, tmp, dest); } -#define LOAD_MASK_IS_SLOW -#ifndef LOAD_MASK_IS_SLOW +#ifndef MMX static void -mmx_emit_load_mask (OrcCompiler *p, unsigned int mask1, unsigned int mask2) +mmx_rule_swapw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { - int tmp = orc_compiler_get_temp_reg (p); - int gptmp = p->gp_tmpreg; - int tmp2 = orc_compiler_get_temp_reg (p); + int dest = p->vars[insn->dest_args[0]].alloc; + int tmp; - orc_x86_emit_mov_imm_reg (p, 4, mask1, gptmp); - orc_x86_emit_mov_reg_mmx (p, gptmp, tmp); - orc_mmx_emit_pshufd (p, 0, tmp, tmp); - orc_x86_emit_mov_imm_reg (p, 4, mask2, gptmp); - orc_x86_emit_mov_reg_mmx (p, gptmp, tmp2); - orc_mmx_emit_punpcklbw (p, tmp2, tmp2); - orc_mmx_emit_punpcklwd (p, tmp2, tmp2); - orc_mmx_emit_paddb (p, tmp2, tmp); + tmp = orc_compiler_get_constant_long (p, + 0x02030001, 0x06070405, 0x0a0b0809, 0x0e0f0c0d); + orc_mmx_emit_pshufb (p, tmp, dest); } static void -mmx_rule_swapw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) +mmx_rule_swapl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { - int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; - int tmp = orc_compiler_get_temp_reg (p); + int tmp; - mmx_emit_load_mask (p, 0x02030001, 0x0c080400); + tmp = orc_compiler_get_constant_long (p, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); orc_mmx_emit_pshufb (p, tmp, dest); } static void -mmx_rule_swapl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) +mmx_rule_swapq_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { - int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; - int tmp = orc_compiler_get_temp_reg (p); + int tmp; - mmx_emit_load_mask (p, 0x00010203, 0x0c080400); + tmp = orc_compiler_get_constant_long (p, + 0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b); orc_mmx_emit_pshufb (p, tmp, dest); } @@ -1524,11 +1545,11 @@ mmx_rule_swapl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) static void mmx_rule_select0lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { - int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; - int tmp = orc_compiler_get_temp_reg (p); + int tmp; - mmx_emit_load_mask (p, 0x05040100, 0x08000800); + tmp = orc_compiler_get_constant_long (p, + 0x05040100, 0x0d0c0908, 0x05040100, 0x0d0c0908); orc_mmx_emit_pshufb (p, tmp, dest); } @@ -1536,11 +1557,11 @@ mmx_rule_select0lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) static void mmx_rule_select1lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { - int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; - int tmp = orc_compiler_get_temp_reg (p); + int tmp; - mmx_emit_load_mask (p, 0x07060302, 0x08000800); + tmp = orc_compiler_get_constant_long (p, + 0x07060302, 0x0f0e0b0a, 0x07060302, 0x0f0e0b0a); orc_mmx_emit_pshufb (p, tmp, dest); } @@ -1548,11 +1569,11 @@ mmx_rule_select1lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) static void mmx_rule_select0wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { - int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; - int tmp = orc_compiler_get_temp_reg (p); + int tmp; - mmx_emit_load_mask (p, 0x06040200, 0x08000800); + tmp = orc_compiler_get_constant_long (p, + 0x06040200, 0x0e0c0a08, 0x06040200, 0x0e0c0a08); orc_mmx_emit_pshufb (p, tmp, dest); } @@ -1560,11 +1581,11 @@ mmx_rule_select0wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) static void mmx_rule_select1wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { - int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; - int tmp = orc_compiler_get_temp_reg (p); + int tmp; - mmx_emit_load_mask (p, 0x07050301, 0x08000800); + tmp = orc_compiler_get_constant_long (p, + 0x07050301, 0x0f0d0b09, 0x07050301, 0x0f0d0b09); orc_mmx_emit_pshufb (p, tmp, dest); } @@ -2261,6 +2282,7 @@ orc_compiler_mmx_register_rules (OrcTarget *target) orc_rule_register (rule_set, "loadpw", mmx_rule_loadpX, (void *)2); orc_rule_register (rule_set, "loadpl", mmx_rule_loadpX, (void *)4); orc_rule_register (rule_set, "loadpq", mmx_rule_loadpX, (void *)8); + orc_rule_register (rule_set, "ldresnearl", mmx_rule_ldresnearl, NULL); orc_rule_register (rule_set, "storeb", mmx_rule_storeX, NULL); orc_rule_register (rule_set, "storew", mmx_rule_storeX, NULL); @@ -2445,9 +2467,10 @@ orc_compiler_mmx_register_rules (OrcTarget *target) REG(absb); REG(absw); REG(absl); -#ifndef LOAD_MASK_IS_SLOW +#ifndef MMX orc_rule_register (rule_set, "swapw", mmx_rule_swapw_ssse3, NULL); orc_rule_register (rule_set, "swapl", mmx_rule_swapl_ssse3, NULL); + orc_rule_register (rule_set, "swapq", mmx_rule_swapq_ssse3, NULL); orc_rule_register (rule_set, "select0lw", mmx_rule_select0lw_ssse3, NULL); orc_rule_register (rule_set, "select1lw", mmx_rule_select1lw_ssse3, NULL); orc_rule_register (rule_set, "select0wb", mmx_rule_select0wb_ssse3, NULL); diff --git a/orc/orcrules-sse.c b/orc/orcrules-sse.c index c66511f..8445a6d 100644 --- a/orc/orcrules-sse.c +++ b/orc/orcrules-sse.c @@ -413,8 +413,14 @@ sse_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn) } else { orc_x86_emit_mov_memindex_sse (compiler, 4, 0, src->ptr_register, compiler->gp_tmpreg, 2, tmp, FALSE); +#ifdef MMX + //orc_mmx_emit_punpckldq (compiler, tmp, dest->alloc); + orc_sse_emit_psllq (compiler, 8*4*i, tmp); + orc_sse_emit_por (compiler, tmp, dest->alloc); +#else orc_sse_emit_pslldq (compiler, 4*i, tmp); orc_sse_emit_por (compiler, tmp, dest->alloc); +#endif } if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { -- cgit v1.2.3