#include "config.h" #include #include #include #include #include #include #include #undef MMX #define SIZE 65536 /* sse rules */ static void sse_rule_loadpX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int reg; int size = ORC_PTR_TO_INT(user); if (src->vartype == ORC_VAR_TYPE_PARAM) { reg = dest->alloc; if (size == 8 && src->size == 8) { orc_x86_emit_mov_memoffset_sse (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0]]), compiler->exec_reg, reg, FALSE); #ifndef MMX orc_sse_emit_movhps_load_memoffset (compiler, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0] + (ORC_VAR_T1 - ORC_VAR_P1)]), compiler->exec_reg, reg); orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(2,0,2,0), reg, reg); #else /* FIXME yes, I understand this is terrible */ orc_sse_emit_pinsrw_memoffset (compiler, 2, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0] + (ORC_VAR_T1 - ORC_VAR_P1)]) + 0, compiler->exec_reg, reg); orc_sse_emit_pinsrw_memoffset (compiler, 3, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0] + (ORC_VAR_T1 - ORC_VAR_P1)]) + 2, compiler->exec_reg, reg); #ifndef MMX orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,0,1,0), reg, reg); #endif #endif } else { orc_x86_emit_mov_memoffset_sse (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0]]), compiler->exec_reg, reg, FALSE); if (size < 8) { if (size == 1) { orc_sse_emit_punpcklbw (compiler, reg, reg); } #ifndef MMX if (size <= 2) { orc_sse_emit_pshuflw (compiler, 0, reg, reg); } orc_sse_emit_pshufd (compiler, 0, reg, reg); #else if (size <= 2) { orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(0,0,0,0), reg, reg); } else { orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg); } #endif } else { #ifndef MMX orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,0,1,0), reg, reg); #endif } } } else if (src->vartype == ORC_VAR_TYPE_CONST) { orc_sse_load_constant (compiler, dest->alloc, size, src->value.i); } else { ORC_ASSERT(0); } } static void sse_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; offset = compiler->offset * src->size; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg, compiler->gp_tmpreg); orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, dest->alloc); break; case 2: orc_sse_emit_pxor (compiler, dest->alloc, dest->alloc); orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); break; case 4: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, src->is_aligned); break; default: orc_compiler_error (compiler, "bad load size %d", src->size << compiler->loop_shift); break; } src->update_type = 2; } static void sse_rule_loadoffX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; if (compiler->vars[insn->src_args[1]].vartype != ORC_VAR_TYPE_CONST) { orc_compiler_error (compiler, "code generation rule for %s only works with constant offset", insn->opcode->name); return; } offset = (compiler->offset + compiler->vars[insn->src_args[1]].value.i) * src->size; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg, compiler->gp_tmpreg); orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, dest->alloc); break; case 2: orc_sse_emit_pxor (compiler, dest->alloc, dest->alloc); orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); break; case 4: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, src->is_aligned); break; default: orc_compiler_error (compiler,"bad load size %d", src->size << compiler->loop_shift); break; } src->update_type = 2; } static void sse_rule_loadupib (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; int tmp = orc_compiler_get_temp_reg (compiler); offset = (compiler->offset * src->size) >> 1; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: case 2: orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); orc_sse_emit_movdqa (compiler, dest->alloc, tmp); orc_sse_emit_psrlw_imm (compiler, 8, tmp); break; case 4: orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); orc_sse_emit_pinsrw_memoffset (compiler, 0, offset + 1, ptr_reg, tmp); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, FALSE); orc_x86_emit_mov_memoffset_sse (compiler, 4, offset + 1, ptr_reg, tmp, FALSE); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, FALSE); orc_x86_emit_mov_memoffset_sse (compiler, 8, offset + 1, ptr_reg, tmp, FALSE); break; case 32: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, FALSE); orc_x86_emit_mov_memoffset_sse (compiler, 16, offset + 1, ptr_reg, tmp, FALSE); break; default: orc_compiler_error(compiler,"bad load size %d", src->size << compiler->loop_shift); break; } orc_sse_emit_pavgb (compiler, dest->alloc, tmp); orc_sse_emit_punpcklbw (compiler, tmp, dest->alloc); src->update_type = 1; } static void sse_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; offset = (compiler->offset * src->size) >> 1; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: case 2: orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg, compiler->gp_tmpreg); orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, dest->alloc); break; case 4: orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 32: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, src->is_aligned); break; default: orc_compiler_error(compiler,"bad load size %d", src->size << compiler->loop_shift); break; } switch (src->size) { case 1: orc_sse_emit_punpcklbw (compiler, dest->alloc, dest->alloc); break; case 2: orc_sse_emit_punpcklwd (compiler, dest->alloc, dest->alloc); break; case 4: orc_sse_emit_punpckldq (compiler, dest->alloc, dest->alloc); break; } src->update_type = 1; } static void sse_rule_storeX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int offset; int ptr_reg; offset = compiler->offset * dest->size; if (dest->ptr_register == 0) { orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, dest->ptr_offset, compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = dest->ptr_register; } switch (dest->size << compiler->loop_shift) { case 1: /* FIXME we might be using ecx twice here */ if (ptr_reg == compiler->gp_tmpreg) { orc_compiler_error (compiler, "unimplemented corner case in %s", insn->opcode->name); } orc_sse_emit_movd_store_register (compiler, src->alloc, compiler->gp_tmpreg); orc_x86_emit_mov_reg_memoffset (compiler, 1, compiler->gp_tmpreg, offset, ptr_reg); break; case 2: if (compiler->target_flags & ORC_TARGET_SSE_SSE4_1) { orc_sse_emit_pextrw_memoffset (compiler, 0, offset, src->alloc, ptr_reg); } else { /* FIXME we might be using ecx twice here */ if (ptr_reg == compiler->gp_tmpreg) { orc_compiler_error(compiler, "unimplemented corner case in %s", insn->opcode->name); } orc_sse_emit_movd_store_register (compiler, src->alloc, compiler->gp_tmpreg); orc_x86_emit_mov_reg_memoffset (compiler, 2, compiler->gp_tmpreg, offset, ptr_reg); } break; case 4: orc_x86_emit_mov_sse_memoffset (compiler, 4, src->alloc, offset, ptr_reg, dest->is_aligned, dest->is_uncached); break; case 8: orc_x86_emit_mov_sse_memoffset (compiler, 8, src->alloc, offset, ptr_reg, dest->is_aligned, dest->is_uncached); break; case 16: orc_x86_emit_mov_sse_memoffset (compiler, 16, src->alloc, offset, ptr_reg, dest->is_aligned, dest->is_uncached); break; default: orc_compiler_error (compiler, "bad size"); break; } dest->update_type = 2; } #if try1 static void sse_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int tmp = orc_compiler_get_temp_reg (compiler); int tmp2 = orc_compiler_get_temp_reg (compiler); int tmpc; orc_sse_emit_movd_store_register (compiler, X86_XMM6, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_sse_emit_movdqu_load_memindex (compiler, 0, src->ptr_register, compiler->gp_tmpreg, 4, dest->alloc); #if 0 orc_sse_emit_movdqa (compiler, X86_XMM6, tmp); orc_sse_emit_pslld_imm (compiler, 10, tmp); orc_sse_emit_psrld_imm (compiler, 26, tmp); orc_sse_emit_pslld_imm (compiler, 2, tmp); orc_sse_emit_movdqa (compiler, tmp, tmp2); orc_sse_emit_pslld_imm (compiler, 8, tmp2); orc_sse_emit_por (compiler, tmp2, tmp); orc_sse_emit_movdqa (compiler, tmp, tmp2); orc_sse_emit_pslld_imm (compiler, 16, tmp2); orc_sse_emit_por (compiler, tmp2, tmp); #else orc_sse_emit_movdqa (compiler, X86_XMM6, tmp); tmpc = orc_compiler_get_constant_long (compiler, 0x02020202, 0x06060606, 0x0a0a0a0a, 0x0e0e0e0e); orc_sse_emit_pshufb (compiler, tmpc, tmp); orc_sse_emit_paddb (compiler, tmp, tmp); orc_sse_emit_paddb (compiler, tmp, tmp); #endif orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(0,0,0,0), tmp, tmp2); orc_sse_emit_psubd (compiler, tmp2, tmp); tmpc = orc_compiler_get_constant (compiler, 4, 0x03020100); orc_sse_emit_paddd (compiler, tmpc, tmp); orc_sse_emit_pshufb (compiler, tmp, dest->alloc); orc_sse_emit_movdqa (compiler, X86_XMM7, tmp); orc_sse_emit_pslld_imm (compiler, compiler->loop_shift, tmp); orc_sse_emit_paddd (compiler, tmp, X86_XMM6); src->update_type = 0; } #endif static void sse_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; int increment_var = insn->src_args[2]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int tmp = orc_compiler_get_temp_reg (compiler); int i; for(i=0;i<(1<loop_shift);i++){ if (i == 0) { orc_x86_emit_mov_memoffset_sse (compiler, 4, 0, src->ptr_register, dest->alloc, FALSE); } else { orc_x86_emit_mov_memindex_sse (compiler, 4, 0, src->ptr_register, compiler->gp_tmpreg, 2, tmp, FALSE); #ifdef MMX //orc_mmx_emit_punpckldq (compiler, tmp, dest->alloc); orc_sse_emit_psllq_imm (compiler, 8*4*i, tmp); orc_sse_emit_por (compiler, tmp, dest->alloc); #else orc_sse_emit_pslldq_imm (compiler, 4*i, tmp); orc_sse_emit_por (compiler, tmp, dest->alloc); #endif } if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, 4, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); } orc_x86_emit_add_reg_reg_shift (compiler, compiler->is_64bit ? 8 : 4, compiler->gp_tmpreg, src->ptr_register, 2); orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); src->update_type = 0; } #ifndef MMX static void sse_rule_ldreslinl (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; int increment_var = insn->src_args[2]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int tmp = orc_compiler_get_temp_reg (compiler); int tmp2 = orc_compiler_get_temp_reg (compiler); int regsize = compiler->is_64bit ? 8 : 4; int i; if (compiler->loop_shift == 0) { orc_x86_emit_mov_memoffset_sse (compiler, 8, 0, src->ptr_register, tmp, FALSE); orc_sse_emit_pxor (compiler, tmp2, tmp2); orc_sse_emit_punpcklbw (compiler, tmp2, tmp); orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(3,2,3,2), tmp, tmp2); orc_sse_emit_psubw (compiler, tmp, tmp2); orc_sse_emit_movd_load_register (compiler, src->ptr_offset, tmp); orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(0,0,0,0), tmp, tmp); orc_sse_emit_psrlw_imm (compiler, 8, tmp); orc_sse_emit_pmullw (compiler, tmp2, tmp); orc_sse_emit_psraw_imm (compiler, 8, tmp); orc_sse_emit_pxor (compiler, tmp2, tmp2); orc_sse_emit_packsswb (compiler, tmp2, tmp); orc_x86_emit_mov_memoffset_sse (compiler, 4, 0, src->ptr_register, dest->alloc, FALSE); orc_sse_emit_paddb (compiler, tmp, dest->alloc); if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, regsize, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_x86_emit_add_reg_reg_shift (compiler, regsize, compiler->gp_tmpreg, src->ptr_register, 2); orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); } else { int tmp3 = orc_compiler_get_temp_reg (compiler); int tmp4 = orc_compiler_get_temp_reg (compiler); for(i=0;i<(1<loop_shift);i+=2){ orc_x86_emit_mov_memoffset_sse (compiler, 8, 0, src->ptr_register, tmp, FALSE); orc_sse_emit_movd_load_register (compiler, src->ptr_offset, tmp4); if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, 4, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_x86_emit_mov_memindex_sse (compiler, 8, 0, src->ptr_register, compiler->gp_tmpreg, 2, tmp2, FALSE); orc_sse_emit_punpckldq (compiler, tmp2, tmp); orc_sse_emit_movdqa (compiler, tmp, tmp2); if (i == 0) { orc_sse_emit_movdqa (compiler, tmp, dest->alloc); } else { orc_sse_emit_punpcklqdq (compiler, tmp, dest->alloc); } orc_sse_emit_pxor (compiler, tmp3, tmp3); orc_sse_emit_punpcklbw (compiler, tmp3, tmp); orc_sse_emit_punpckhbw (compiler, tmp3, tmp2); orc_sse_emit_psubw (compiler, tmp, tmp2); orc_sse_emit_pinsrw_register (compiler, 1, src->ptr_offset, tmp4); #if 0 orc_sse_emit_punpcklwd (compiler, tmp4, tmp4); orc_sse_emit_punpckldq (compiler, tmp4, tmp4); #else orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(1,1,0,0), tmp4, tmp4); orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,1,0,0), tmp4, tmp4); #endif orc_sse_emit_psrlw_imm (compiler, 8, tmp4); orc_sse_emit_pmullw (compiler, tmp4, tmp2); orc_sse_emit_psraw_imm (compiler, 8, tmp2); orc_sse_emit_pxor (compiler, tmp, tmp); orc_sse_emit_packsswb (compiler, tmp, tmp2); if (i != 0) { orc_sse_emit_pslldq_imm (compiler, 8, tmp2); } orc_sse_emit_paddb (compiler, tmp2, dest->alloc); if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, 4, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_x86_emit_add_reg_reg_shift (compiler, 8, compiler->gp_tmpreg, src->ptr_register, 2); orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); } } src->update_type = 0; } #else static void mmx_rule_ldreslinl (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; int increment_var = insn->src_args[2]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int tmp = orc_compiler_get_temp_reg (compiler); int tmp2 = orc_compiler_get_temp_reg (compiler); int zero; int regsize = compiler->is_64bit ? 8 : 4; int i; zero = orc_compiler_get_constant (compiler, 1, 0); for(i=0;i<(1<loop_shift);i++){ orc_x86_emit_mov_memoffset_mmx (compiler, 4, 0, src->ptr_register, tmp, FALSE); orc_x86_emit_mov_memoffset_mmx (compiler, 4, 4, src->ptr_register, tmp2, FALSE); orc_mmx_emit_punpcklbw (compiler, zero, tmp); orc_mmx_emit_punpcklbw (compiler, zero, tmp2); orc_mmx_emit_psubw (compiler, tmp, tmp2); orc_sse_emit_movd_load_register (compiler, src->ptr_offset, tmp); orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(0,0,0,0), tmp, tmp); orc_mmx_emit_psrlw_imm (compiler, 8, tmp); orc_mmx_emit_pmullw (compiler, tmp2, tmp); orc_mmx_emit_psraw_imm (compiler, 8, tmp); orc_mmx_emit_pxor (compiler, tmp2, tmp2); orc_mmx_emit_packsswb (compiler, tmp2, tmp); if (i == 0) { orc_x86_emit_mov_memoffset_mmx (compiler, 4, 0, src->ptr_register, dest->alloc, FALSE); orc_mmx_emit_paddb (compiler, tmp, dest->alloc); } else { orc_x86_emit_mov_memoffset_mmx (compiler, 4, 0, src->ptr_register, tmp2, FALSE); orc_mmx_emit_paddb (compiler, tmp, tmp2); orc_mmx_emit_psllq_imm (compiler, 32, tmp2); orc_mmx_emit_por (compiler, tmp2, dest->alloc); } if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, regsize, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_x86_emit_add_reg_reg_shift (compiler, regsize, compiler->gp_tmpreg, src->ptr_register, 2); orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); } src->update_type = 0; } #endif static void sse_rule_copyx (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->vars[insn->src_args[0]].alloc == p->vars[insn->dest_args[0]].alloc) { return; } orc_sse_emit_movdqa (p, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } #define UNARY(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } #define BINARY(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[1]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } UNARY(absb,pabsb,0x381c) BINARY(addb,paddb,0xfc) BINARY(addssb,paddsb,0xec) BINARY(addusb,paddusb,0xdc) BINARY(andb,pand,0xdb) BINARY(andnb,pandn,0xdf) BINARY(avgub,pavgb,0xe0) BINARY(cmpeqb,pcmpeqb,0x74) BINARY(cmpgtsb,pcmpgtb,0x64) BINARY(maxsb,pmaxsb,0x383c) BINARY(maxub,pmaxub,0xde) BINARY(minsb,pminsb,0x3838) BINARY(minub,pminub,0xda) //BINARY(mullb,pmullb,0xd5) //BINARY(mulhsb,pmulhb,0xe5) //BINARY(mulhub,pmulhub,0xe4) BINARY(orb,por,0xeb) //UNARY(signb,psignb,0x3808) BINARY(subb,psubb,0xf8) BINARY(subssb,psubsb,0xe8) BINARY(subusb,psubusb,0xd8) BINARY(xorb,pxor,0xef) UNARY(absw,pabsw,0x381d) BINARY(addw,paddw,0xfd) BINARY(addssw,paddsw,0xed) BINARY(addusw,paddusw,0xdd) BINARY(andw,pand,0xdb) BINARY(andnw,pandn,0xdf) BINARY(avguw,pavgw,0xe3) BINARY(cmpeqw,pcmpeqw,0x75) BINARY(cmpgtsw,pcmpgtw,0x65) BINARY(maxsw,pmaxsw,0xee) BINARY(maxuw,pmaxuw,0x383e) BINARY(minsw,pminsw,0xea) BINARY(minuw,pminuw,0x383a) BINARY(mullw,pmullw,0xd5) BINARY(mulhsw,pmulhw,0xe5) BINARY(mulhuw,pmulhuw,0xe4) BINARY(orw,por,0xeb) //UNARY(signw,psignw,0x3809) BINARY(subw,psubw,0xf9) BINARY(subssw,psubsw,0xe9) BINARY(subusw,psubusw,0xd9) BINARY(xorw,pxor,0xef) UNARY(absl,pabsd,0x381e) BINARY(addl,paddd,0xfe) //BINARY(addssl,paddsd,0xed) //BINARY(addusl,paddusd,0xdd) BINARY(andl,pand,0xdb) BINARY(andnl,pandn,0xdf) //BINARY(avgul,pavgd,0xe3) BINARY(cmpeql,pcmpeqd,0x76) BINARY(cmpgtsl,pcmpgtd,0x66) BINARY(maxsl,pmaxsd,0x383d) BINARY(maxul,pmaxud,0x383f) BINARY(minsl,pminsd,0x3839) BINARY(minul,pminud,0x383b) BINARY(mulll,pmulld,0x3840) //BINARY(mulhsl,pmulhd,0xe5) //BINARY(mulhul,pmulhud,0xe4) BINARY(orl,por,0xeb) //UNARY(signl,psignd,0x380a) BINARY(subl,psubd,0xfa) //BINARY(subssl,psubsd,0xe9) //BINARY(subusl,psubusd,0xd9) BINARY(xorl,pxor,0xef) BINARY(andq,pand,0xdb) BINARY(andnq,pandn,0xdf) BINARY(orq,por,0xeb) BINARY(xorq,pxor,0xef) BINARY(cmpeqq,pcmpeqq,0x3829) BINARY(cmpgtsq,pcmpgtq,0x3837) #ifndef MMX BINARY(addq,paddq,0xd4) BINARY(subq,psubq,0xfb) #endif static void sse_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_paddw (p, src, dest); } static void sse_rule_accl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; #ifndef MMX if (p->loop_shift == 0) { orc_sse_emit_pslldq_imm (p, 12, src); } #endif orc_sse_emit_paddd (p, src, dest); } static void sse_rule_accsadubl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src1 = p->vars[insn->src_args[0]].alloc; int src2 = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); #ifndef MMX if (p->loop_shift <= 2) { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_pslldq_imm (p, 16 - (1<loop_shift), tmp); orc_sse_emit_movdqa (p, src2, tmp2); orc_sse_emit_pslldq_imm (p, 16 - (1<loop_shift), tmp2); orc_sse_emit_psadbw (p, tmp2, tmp); } else if (p->loop_shift == 3) { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psadbw (p, src2, tmp); orc_sse_emit_pslldq_imm (p, 8, tmp); } else { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psadbw (p, src2, tmp); } #else if (p->loop_shift <= 2) { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psllq_imm (p, 8*(8 - (1<loop_shift)), tmp); orc_sse_emit_movdqa (p, src2, tmp2); orc_sse_emit_psllq_imm (p, 8*(8 - (1<loop_shift)), tmp2); orc_sse_emit_psadbw (p, tmp2, tmp); } else { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psadbw (p, src2, tmp); } #endif orc_sse_emit_paddd (p, tmp, dest); } #ifndef MMX static void sse_rule_signX_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int opcodes[] = { ORC_X86_psignb, ORC_X86_psignw, ORC_X86_psignd }; int type = ORC_PTR_TO_INT(user); int tmpc; tmpc = orc_compiler_get_temp_constant (p, 1<vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 2, 0x0001); orc_sse_emit_pminsw (p, tmp, dest); tmp = orc_compiler_get_constant (p, 2, 0xffff); orc_sse_emit_pmaxsw (p, tmp, dest); } static void sse_rule_absb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_pxor (p, tmp, tmp); orc_sse_emit_pcmpgtb (p, src, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_psubb (p, tmp, dest); } static void sse_rule_absw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (src == dest) { orc_sse_emit_movdqa (p, src, tmp); } else { orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, tmp, dest); } orc_sse_emit_psraw_imm (p, 15, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_psubw (p, tmp, dest); } static void sse_rule_absl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (src == dest) { orc_sse_emit_movdqa (p, src, tmp); } else { orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, tmp, dest); } orc_sse_emit_psrad_imm (p, 31, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_psubd (p, tmp, dest); } static void sse_rule_shift (OrcCompiler *p, void *user, OrcInstruction *insn) { int type = ORC_PTR_TO_INT(user); //int imm_code1[] = { 0x71, 0x71, 0x71, 0x72, 0x72, 0x72, 0x73, 0x73 }; //int imm_code2[] = { 6, 2, 4, 6, 2, 4, 6, 2 }; //int reg_code[] = { 0xf1, 0xd1, 0xe1, 0xf2, 0xd2, 0xe2, 0xf3, 0xd3 }; //const char *code[] = { "psllw", "psrlw", "psraw", "pslld", "psrld", "psrad", "psllq", "psrlq" }; const int opcodes[] = { ORC_X86_psllw, ORC_X86_psrlw, ORC_X86_psraw, ORC_X86_pslld, ORC_X86_psrld, ORC_X86_psrad, ORC_X86_psllq, ORC_X86_psrlq }; const int opcodes_imm[] = { ORC_X86_psllw_imm, ORC_X86_psrlw_imm, ORC_X86_psraw_imm, ORC_X86_pslld_imm, ORC_X86_psrld_imm, ORC_X86_psrad_imm, ORC_X86_psllq_imm, ORC_X86_psrlq_imm }; if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_x86_emit_cpuinsn_imm (p, opcodes_imm[type], p->vars[insn->src_args[1]].value.i, 16, p->vars[insn->dest_args[0]].alloc); } else if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) { int tmp = orc_compiler_get_temp_reg (p); /* FIXME this is a gross hack to reload the register with a * 64-bit version of the parameter. */ orc_x86_emit_mov_memoffset_sse (p, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]), p->exec_reg, tmp, FALSE); orc_x86_emit_cpuinsn_size (p, opcodes[type], 16, tmp, p->vars[insn->dest_args[0]].alloc); } else { orc_compiler_error (p, "code generation rule for %s only works with " "constant or parameter shifts", insn->opcode->name); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shlb (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_sse_emit_psllw_imm (p, p->vars[insn->src_args[1]].value.i, dest); tmp = orc_compiler_get_constant (p, 1, 0xff&(0xff<vars[insn->src_args[1]].value.i)); orc_sse_emit_pand (p, tmp, dest); } else { orc_compiler_error (p, "code generation rule for %s only works with " "constant shifts", insn->opcode->name); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shrsb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psllw_imm (p, 8, tmp); orc_sse_emit_psraw_imm (p, p->vars[insn->src_args[1]].value.i, tmp); orc_sse_emit_psrlw_imm (p, 8, tmp); orc_sse_emit_psraw_imm (p, 8 + p->vars[insn->src_args[1]].value.i, dest); orc_sse_emit_psllw_imm (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } else { orc_compiler_error (p, "code generation rule for %s only works with " "constant shifts", insn->opcode->name); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shrub (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_sse_emit_psrlw_imm (p, p->vars[insn->src_args[1]].value.i, dest); tmp = orc_compiler_get_constant (p, 1, (0xff>>p->vars[insn->src_args[1]].value.i)); orc_sse_emit_pand (p, tmp, dest); } else { orc_compiler_error (p, "code generation rule for %s only works with " "constant shifts", insn->opcode->name); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shrsq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,3,1,1), src, tmp); #else orc_mmx_emit_pshufw (p, ORC_MMX_SHUF(3,2,3,2), src, tmp); #endif orc_sse_emit_psrad_imm (p, 31, tmp); orc_sse_emit_psllq_imm (p, 64-p->vars[insn->src_args[1]].value.i, tmp); orc_sse_emit_psrlq_imm (p, p->vars[insn->src_args[1]].value.i, dest); orc_sse_emit_por (p, tmp, dest); } else { orc_compiler_error (p, "code generation rule for %s only works with " "constant shifts", insn->opcode->name); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_convsbw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, src, dest); orc_sse_emit_psraw_imm (p, 8, dest); } static void sse_rule_convubw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* FIXME need a zero register */ if (0) { orc_sse_emit_punpcklbw (p, src, dest); orc_sse_emit_psrlw_imm (p, 8, dest); } else { orc_sse_emit_pxor(p, tmp, tmp); orc_sse_emit_punpcklbw (p, tmp, dest); } } static void sse_rule_convssswb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packsswb (p, src, dest); } static void sse_rule_convsuswb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packuswb (p, src, dest); } static void sse_rule_convuuswb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, src, dest); orc_sse_emit_psrlw_imm (p, 15, tmp); orc_sse_emit_psllw_imm (p, 14, tmp); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_psllw_imm (p, 1, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_packuswb (p, dest, dest); } static void sse_rule_convwb (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_psllw_imm (p, 8, dest); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_packuswb (p, dest, dest); } static void sse_rule_convhwb (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_packuswb (p, dest, dest); } static void sse_rule_convswl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklwd (p, src, dest); orc_sse_emit_psrad_imm (p, 16, dest); } static void sse_rule_convuwl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* FIXME need a zero register */ if (0) { orc_sse_emit_punpcklwd (p, src, dest); orc_sse_emit_psrld_imm (p, 16, dest); } else { orc_sse_emit_pxor(p, tmp, tmp); orc_sse_emit_punpcklwd (p, tmp, dest); } } static void sse_rule_convlw (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_pslld_imm (p, 16, dest); orc_sse_emit_psrad_imm (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_convhlw (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_psrad_imm (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_convssslw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packssdw (p, src, dest); } static void sse_rule_convsuslw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packusdw (p, src, dest); } static void sse_rule_convslq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psrad_imm (p, 31, tmp); orc_sse_emit_punpckldq (p, tmp, dest); } static void sse_rule_convulq (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 4, 0); orc_sse_emit_punpckldq (p, tmp, dest); } static void sse_rule_convql (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest); #else orc_sse_emit_movdqa (p, src, dest); #endif } static void sse_rule_splatw3q (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; #ifndef MMX orc_sse_emit_pshuflw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest); orc_sse_emit_pshufhw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest); #else orc_mmx_emit_pshufw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest); #endif } static void sse_rule_splatbw (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, dest, dest); } static void sse_rule_splatbl (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, dest, dest); orc_sse_emit_punpcklwd (p, dest, dest); } static void sse_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmpc; tmpc = orc_compiler_get_constant (p, 2, 0x0080); orc_sse_emit_paddw (p, tmpc, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psrlw_imm (p, 8, tmp); orc_sse_emit_paddw (p, tmp, dest); orc_sse_emit_psrlw_imm (p, 8, dest); } #if 1 static void sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) { /* About 5.2 cycles per array member on ginger */ int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int a = orc_compiler_get_temp_reg (p); int j = orc_compiler_get_temp_reg (p); int j2 = orc_compiler_get_temp_reg (p); int l = orc_compiler_get_temp_reg (p); int divisor = orc_compiler_get_temp_reg (p); int tmp; int i; orc_sse_emit_movdqa (p, src, divisor); orc_sse_emit_psllw_imm (p, 8, divisor); orc_sse_emit_psrlw_imm (p, 1, divisor); orc_sse_load_constant (p, a, 2, 0x00ff); tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_movdqa (p, tmp, j); orc_sse_emit_psrlw_imm (p, 8, j); orc_sse_emit_pxor (p, tmp, dest); for(i=0;i<7;i++){ orc_sse_emit_movdqa (p, divisor, l); orc_sse_emit_pxor (p, tmp, l); orc_sse_emit_pcmpgtw (p, dest, l); orc_sse_emit_movdqa (p, l, j2); orc_sse_emit_pandn (p, divisor, l); orc_sse_emit_psubw (p, l, dest); orc_sse_emit_psrlw_imm (p, 1, divisor); orc_sse_emit_pand (p, j, j2); orc_sse_emit_pxor (p, j2, a); orc_sse_emit_psrlw_imm (p, 1, j); } orc_sse_emit_movdqa (p, divisor, l); orc_sse_emit_pxor (p, tmp, l); orc_sse_emit_pcmpgtw (p, dest, l); orc_sse_emit_pand (p, j, l); orc_sse_emit_pxor (p, l, a); orc_sse_emit_movdqa (p, a, dest); } #else static void sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) { /* About 8.4 cycles per array member on ginger */ int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int b = orc_compiler_get_temp_reg (p); int a = orc_compiler_get_temp_reg (p); int k = orc_compiler_get_temp_reg (p); int j = orc_compiler_get_temp_reg (p); int tmp; int i; orc_sse_emit_movdqa (p, dest, b); tmp = orc_compiler_get_constant (p, 2, 0x00ff); orc_sse_emit_pand (p, tmp, src); tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor (p, tmp, b); orc_sse_emit_pxor (p, a, a); orc_sse_emit_movdqa (p, tmp, j); orc_sse_emit_psrlw_imm (p, 8, j); for(i=0;i<8;i++){ orc_sse_emit_por (p, j, a); orc_sse_emit_movdqa (p, a, k); orc_sse_emit_pmullw (p, src, k); orc_sse_emit_pxor (p, tmp, k); orc_sse_emit_pcmpgtw (p, b, k); orc_sse_emit_pand (p, j, k); orc_sse_emit_pxor (p, k, a); orc_sse_emit_psrlw_imm (p, 1, j); } orc_sse_emit_movdqa (p, a, dest); } #endif static void sse_rule_mulsbw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_punpcklbw (p, src, tmp); orc_sse_emit_psraw_imm (p, 8, tmp); orc_sse_emit_punpcklbw (p, dest, dest); orc_sse_emit_psraw_imm (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); } static void sse_rule_mulubw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_punpcklbw (p, src, tmp); orc_sse_emit_psrlw_imm (p, 8, tmp); orc_sse_emit_punpcklbw (p, dest, dest); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); } static void sse_rule_mullb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pmullw (p, src, dest); orc_sse_emit_psllw_imm (p, 8, dest); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_psraw_imm (p, 8, tmp2); orc_sse_emit_psraw_imm (p, 8, tmp); orc_sse_emit_pmullw (p, tmp2, tmp); orc_sse_emit_psllw_imm (p, 8, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_mulhsb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, dest, tmp2); orc_sse_emit_psllw_imm (p, 8, tmp); orc_sse_emit_psraw_imm (p, 8, tmp); orc_sse_emit_psllw_imm (p, 8, dest); orc_sse_emit_psraw_imm (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psraw_imm (p, 8, tmp); orc_sse_emit_psraw_imm (p, 8, tmp2); orc_sse_emit_pmullw (p, tmp, tmp2); orc_sse_emit_psrlw_imm (p, 8, tmp2); orc_sse_emit_psllw_imm (p, 8, tmp2); orc_sse_emit_por (p, tmp2, dest); } static void sse_rule_mulhub (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, dest, tmp2); orc_sse_emit_psllw_imm (p, 8, tmp); orc_sse_emit_psrlw_imm (p, 8, tmp); orc_sse_emit_psllw_imm (p, 8, dest); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psrlw_imm (p, 8, tmp); orc_sse_emit_psrlw_imm (p, 8, tmp2); orc_sse_emit_pmullw (p, tmp, tmp2); orc_sse_emit_psrlw_imm (p, 8, tmp2); orc_sse_emit_psllw_imm (p, 8, tmp2); orc_sse_emit_por (p, tmp2, dest); } static void sse_rule_mulswl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pmulhw (p, src, tmp); orc_sse_emit_pmullw (p, src, dest); orc_sse_emit_punpcklwd (p, tmp, dest); } static void sse_rule_muluwl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pmulhuw (p, src, tmp); orc_sse_emit_pmullw (p, src, dest); orc_sse_emit_punpcklwd (p, tmp, dest); } static void sse_rule_mulll_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int i; int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, offset, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, offset + 16, p->exec_reg, FALSE, FALSE); for(i=0;i<(1<insn_shift);i++) { orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, p->gp_tmpreg); orc_x86_emit_imul_memoffset_reg (p, 4, offset + 16+4*i, p->exec_reg, p->gp_tmpreg); orc_x86_emit_mov_reg_memoffset (p, 4, p->gp_tmpreg, offset + 4*i, p->exec_reg); } orc_x86_emit_mov_memoffset_sse (p, 16, offset, p->exec_reg, p->vars[insn->dest_args[0]].alloc, FALSE); } #ifndef MMX static void sse_rule_mulhsl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, tmp); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), src, tmp2); orc_sse_emit_pmuldq (p, src, dest); orc_sse_emit_pmuldq (p, tmp, tmp2); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), dest, dest); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), tmp2, tmp2); orc_sse_emit_punpckldq (p, tmp2, dest); } #endif #ifndef MMX static void sse_rule_mulhsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int i; int regsize = p->is_64bit ? 8 : 4; int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, offset, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, offset + 16, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EAX, offset + 32, p->exec_reg); orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EDX, offset + 40, p->exec_reg); for(i=0;i<(1<insn_shift);i++) { orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, X86_EAX); orc_x86_emit_cpuinsn_memoffset (p, ORC_X86_imul_rm, 4, offset + 16 + 4*i, p->exec_reg); orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, offset + 4*i, p->exec_reg); } orc_x86_emit_mov_memoffset_sse (p, 16, offset, p->exec_reg, p->vars[insn->dest_args[0]].alloc, FALSE); orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 32, p->exec_reg, X86_EAX); orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 40, p->exec_reg, X86_EDX); } #endif #ifndef MMX static void sse_rule_mulhul (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, tmp); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), src, tmp2); orc_sse_emit_pmuludq (p, src, dest); orc_sse_emit_pmuludq (p, tmp, tmp2); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), dest, dest); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), tmp2, tmp2); orc_sse_emit_punpckldq (p, tmp2, dest); } #endif static void sse_rule_mulslq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_punpckldq (p, dest, dest); orc_sse_emit_punpckldq (p, tmp, tmp); orc_sse_emit_pmuldq (p, tmp, dest); } #ifndef MMX static void sse_rule_mulslq_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int i; int regsize = p->is_64bit ? 8 : 4; int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); orc_x86_emit_mov_sse_memoffset (p, 8, p->vars[insn->src_args[0]].alloc, offset, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_sse_memoffset (p, 8, p->vars[insn->src_args[1]].alloc, offset + 8, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EAX, offset + 32, p->exec_reg); orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EDX, offset + 40, p->exec_reg); for(i=0;i<(1<insn_shift);i++) { orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, X86_EAX); orc_x86_emit_cpuinsn_memoffset (p, ORC_X86_imul_rm, 4, offset + 8 + 4*i, p->exec_reg); orc_x86_emit_mov_reg_memoffset (p, 4, X86_EAX, offset + 16 + 8*i, p->exec_reg); orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, offset + 16 + 8*i + 4, p->exec_reg); } orc_x86_emit_mov_memoffset_sse (p, 16, offset + 16, p->exec_reg, p->vars[insn->dest_args[0]].alloc, FALSE); orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 32, p->exec_reg, X86_EAX); orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 40, p->exec_reg, X86_EDX); } #endif #ifndef MMX static void sse_rule_mululq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_punpckldq (p, dest, dest); orc_sse_emit_punpckldq (p, tmp, tmp); orc_sse_emit_pmuludq (p, tmp, dest); } #endif static void sse_rule_select0lw (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ /* same as convlw */ orc_sse_emit_pslld_imm (p, 16, dest); orc_sse_emit_psrad_imm (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_select1lw (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ orc_sse_emit_psrad_imm (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_select0ql (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* same as convql */ #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest); #else orc_sse_emit_movdqa (p, src, dest); #endif } static void sse_rule_select1ql (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_psrlq_imm (p, 32, dest); #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest); #else orc_sse_emit_movdqa (p, src, dest); #endif } static void sse_rule_select0wb (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ /* same as convwb */ orc_sse_emit_psllw_imm (p, 8, dest); orc_sse_emit_psraw_imm (p, 8, dest); orc_sse_emit_packsswb (p, dest, dest); } static void sse_rule_select1wb (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ orc_sse_emit_psraw_imm (p, 8, dest); orc_sse_emit_packsswb (p, dest, dest); } static void sse_rule_splitql (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest1 = p->vars[insn->dest_args[0]].alloc; int dest2 = p->vars[insn->dest_args[1]].alloc; #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest2); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,1,3,1), src, dest1); #else orc_sse_emit_movdqa (p, src, dest2); orc_sse_emit_pshufw (p, ORC_SSE_SHUF(3,2,3,2), src, dest1); #endif } static void sse_rule_splitlw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest1 = p->vars[insn->dest_args[0]].alloc; int dest2 = p->vars[insn->dest_args[1]].alloc; /* FIXME slow */ orc_sse_emit_psrad_imm (p, 16, dest1); orc_sse_emit_packssdw (p, dest1, dest1); if (dest2 != src) { orc_sse_emit_movdqa (p, src, dest2); } orc_sse_emit_pslld_imm (p, 16, dest2); orc_sse_emit_psrad_imm (p, 16, dest2); orc_sse_emit_packssdw (p, dest2, dest2); } static void sse_rule_splitwb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest1 = p->vars[insn->dest_args[0]].alloc; int dest2 = p->vars[insn->dest_args[1]].alloc; int tmp = orc_compiler_get_constant (p, 2, 0xff); /* FIXME slow */ orc_sse_emit_psraw_imm (p, 8, dest1); orc_sse_emit_packsswb (p, dest1, dest1); if (dest2 != src) { orc_sse_emit_movdqa (p, src, dest2); } #if 0 orc_sse_emit_psllw_imm (p, 8, dest2); orc_sse_emit_psraw_imm (p, 8, dest2); orc_sse_emit_packsswb (p, dest2, dest2); #else orc_sse_emit_pand (p, tmp, dest2); orc_sse_emit_packuswb (p, dest2, dest2); #endif } static void sse_rule_mergebw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, src, dest); } static void sse_rule_mergewl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklwd (p, src, dest); } static void sse_rule_mergelq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpckldq (p, src, dest); } static void sse_rule_swapw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psllw_imm (p, 8, tmp); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swapl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pslld_imm (p, 16, tmp); orc_sse_emit_psrld_imm (p, 16, dest); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psllw_imm (p, 8, tmp); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swapwl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pslld_imm (p, 16, tmp); orc_sse_emit_psrld_imm (p, 16, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swapq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psllq_imm (p, 32, tmp); orc_sse_emit_psrlq_imm (p, 32, dest); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pslld_imm (p, 16, tmp); orc_sse_emit_psrld_imm (p, 16, dest); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psllw_imm (p, 8, tmp); orc_sse_emit_psrlw_imm (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swaplq (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, dest); #else orc_mmx_emit_pshufw (p, ORC_MMX_SHUF(1,0,3,2), dest, dest); #endif } #ifndef MMX static void sse_rule_swapw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x02030001, 0x06070405, 0x0a0b0809, 0x0e0f0c0d); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapw (p, user, insn); } } static void sse_rule_swapl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapl (p, user, insn); } } static void sse_rule_swapwl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapl (p, user, insn); } } static void sse_rule_swapq_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapq (p, user, insn); } } static void sse_rule_select0lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x05040100, 0x0d0c0908, 0x05040100, 0x0d0c0908); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select0lw (p, user, insn); } } static void sse_rule_select1lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x07060302, 0x0f0e0b0a, 0x07060302, 0x0f0e0b0a); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select1lw (p, user, insn); } } static void sse_rule_select0wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x06040200, 0x0e0c0a08, 0x06040200, 0x0e0c0a08); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select0wb (p, user, insn); } } static void sse_rule_select1wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x07050301, 0x0f0d0b09, 0x07050301, 0x0f0d0b09); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select1wb (p, user, insn); } } #endif /* slow rules */ static void sse_rule_maxuw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pmaxsw (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_minuw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pminsw (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_avgsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 1, 0x80); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pavgb (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_avgsw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pavgw (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_maxsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pcmpgtb (p, src, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_minsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pcmpgtb (p, dest, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_maxsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pcmpgtd (p, src, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_minsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pcmpgtd (p, dest, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_maxul_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmpc; tmpc = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pcmpgtd (p, src, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); } static void sse_rule_minul_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmpc; tmpc = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pcmpgtd (p, dest, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); } static void sse_rule_avgsl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* (a+b+1) >> 1 = (a|b) - ((a^b)>>1) */ orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pxor(p, src, tmp); orc_sse_emit_psrad_imm(p, 1, tmp); orc_sse_emit_por(p, src, dest); orc_sse_emit_psubd(p, tmp, dest); } static void sse_rule_avgul (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* (a+b+1) >> 1 = (a|b) - ((a^b)>>1) */ orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pxor(p, src, tmp); orc_sse_emit_psrld_imm(p, 1, tmp); orc_sse_emit_por(p, src, dest); orc_sse_emit_psubd(p, tmp, dest); } static void sse_rule_addssl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); #if 0 int tmp2 = orc_compiler_get_temp_reg (p); int tmp3 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pand (p, dest, tmp); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_pxor (p, dest, tmp2); orc_sse_emit_psrad_imm (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); orc_sse_emit_psrad (p, 30, tmp); orc_sse_emit_pslld (p, 30, tmp); orc_sse_emit_movdqa (p, tmp, tmp2); orc_sse_emit_pslld_imm (p, 1, tmp2); orc_sse_emit_movdqa (p, tmp, tmp3); orc_sse_emit_pxor (p, tmp2, tmp3); orc_sse_emit_psrad_imm (p, 31, tmp3); orc_sse_emit_psrad_imm (p, 31, tmp2); tmp = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor (p, tmp, tmp2); // clamped value orc_sse_emit_pand (p, tmp3, tmp2); orc_sse_emit_paddd (p, src, dest); orc_sse_emit_pandn (p, dest, tmp3); // tmp is mask: ~0 is for clamping orc_sse_emit_movdqa (p, tmp3, dest); orc_sse_emit_por (p, tmp2, dest); #endif int s = orc_compiler_get_temp_reg (p); int t = orc_compiler_get_temp_reg (p); /* From Tim Terriberry: (slightly faster than above) m=0xFFFFFFFF; s=_a; t=_a; s^=_b; _a+=_b; t^=_a; t^=m; m>>=1; s|=t; t=_b; s>>=31; t>>=31; _a&=s; t^=m; s=~s&t; _a|=s; */ orc_sse_emit_movdqa (p, dest, s); orc_sse_emit_movdqa (p, dest, t); orc_sse_emit_pxor (p, src, s); orc_sse_emit_paddd (p, src, dest); orc_sse_emit_pxor (p, dest, t); tmp = orc_compiler_get_constant (p, 4, 0xffffffff); orc_sse_emit_pxor (p, tmp, t); orc_sse_emit_por (p, t, s); orc_sse_emit_movdqa (p, src, t); orc_sse_emit_psrad_imm (p, 31, s); orc_sse_emit_psrad_imm (p, 31, t); orc_sse_emit_pand (p, s, dest); tmp = orc_compiler_get_constant (p, 4, 0x7fffffff); orc_sse_emit_pxor (p, tmp, t); orc_sse_emit_pandn (p, t, s); orc_sse_emit_por (p, s, dest); } static void sse_rule_subssl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); int tmp3 = orc_compiler_get_temp_reg (p); tmp = orc_compiler_get_temp_constant (p, 4, 0xffffffff); orc_sse_emit_pxor (p, src, tmp); orc_sse_emit_movdqa (p, tmp, tmp2); orc_sse_emit_por (p, dest, tmp); orc_sse_emit_pxor (p, dest, tmp2); orc_sse_emit_psrad_imm (p, 1, tmp2); orc_sse_emit_psubd (p, tmp2, tmp); orc_sse_emit_psrad_imm (p, 30, tmp); orc_sse_emit_pslld_imm (p, 30, tmp); orc_sse_emit_movdqa (p, tmp, tmp2); orc_sse_emit_pslld_imm (p, 1, tmp2); orc_sse_emit_movdqa (p, tmp, tmp3); orc_sse_emit_pxor (p, tmp2, tmp3); orc_sse_emit_psrad_imm (p, 31, tmp3); // tmp3 is mask: ~0 is for clamping orc_sse_emit_psrad_imm (p, 31, tmp2); tmp = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor (p, tmp, tmp2); // clamped value orc_sse_emit_pand (p, tmp3, tmp2); orc_sse_emit_psubd (p, src, dest); orc_sse_emit_pandn (p, dest, tmp3); orc_sse_emit_movdqa (p, tmp3, dest); orc_sse_emit_por (p, tmp2, dest); } static void sse_rule_addusl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); #if 0 /* an alternate version. slower. */ /* Compute the bit that gets carried from bit 0 to bit 1 */ orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pand (p, dest, tmp); orc_sse_emit_pslld_imm (p, 31, tmp); orc_sse_emit_psrld_imm (p, 31, tmp); /* Add in (src>>1) */ orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_psrld_imm (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); /* Add in (dest>>1) */ orc_sse_emit_movdqa (p, dest, tmp2); orc_sse_emit_psrld_imm (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); /* turn overflow bit into mask */ orc_sse_emit_psrad_imm (p, 31, tmp); /* compute the sum, then or over the mask */ orc_sse_emit_paddd (p, src, dest); orc_sse_emit_por (p, tmp, dest); #endif orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pand (p, dest, tmp); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_pxor (p, dest, tmp2); orc_sse_emit_psrld_imm (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); orc_sse_emit_psrad_imm (p, 31, tmp); orc_sse_emit_paddd (p, src, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_subusl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_psrld_imm (p, 1, tmp2); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psrld_imm (p, 1, tmp); orc_sse_emit_psubd (p, tmp, tmp2); /* turn overflow bit into mask */ orc_sse_emit_psrad_imm (p, 31, tmp2); /* compute the difference, then and over the mask */ orc_sse_emit_psubd (p, src, dest); orc_sse_emit_pand (p, tmp2, dest); } #ifndef MMX /* float ops */ #define UNARY_F(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } #define BINARY_F(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[1]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } BINARY_F(addf, addps, 0x58) BINARY_F(subf, subps, 0x5c) BINARY_F(mulf, mulps, 0x59) BINARY_F(divf, divps, 0x5e) UNARY_F(sqrtf, sqrtps, 0x51) #define UNARY_D(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } #define BINARY_D(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[1]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } BINARY_D(addd, addpd, 0x58) BINARY_D(subd, subpd, 0x5c) BINARY_D(muld, mulpd, 0x59) BINARY_D(divd, divpd, 0x5e) UNARY_D(sqrtd, sqrtpd, 0x51) static void sse_rule_minf (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_minps (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_minps (p, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_minps (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_mind (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_minpd (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_minpd (p, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_minpd (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_maxf (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_maxps (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_maxps (p, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_maxps (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_maxd (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_maxpd (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_maxpd (p, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_maxpd (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_cmpeqf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cmpeqps (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_cmpeqd (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cmpeqpd (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_cmpltf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cmpltps (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_cmpltd (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cmpltpd (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_cmplef (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cmpleps (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_cmpled (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cmplepd (p, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_convfl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmpc; int tmp = orc_compiler_get_temp_reg (p); tmpc = orc_compiler_get_temp_constant (p, 4, 0x80000000); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_cvttps2dq (p, src, dest); orc_sse_emit_psrad_imm (p, 31, tmp); orc_sse_emit_pcmpeqd (p, dest, tmpc); orc_sse_emit_pandn (p, tmpc, tmp); orc_sse_emit_paddd (p, tmp, dest); } static void sse_rule_convdl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmpc; int tmp = orc_compiler_get_temp_reg (p); tmpc = orc_compiler_get_temp_constant (p, 4, 0x80000000); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,1,3,1), src, tmp); orc_sse_emit_cvttpd2dq (p, src, dest); orc_sse_emit_psrad_imm (p, 31, tmp); orc_sse_emit_pcmpeqd (p, dest, tmpc); orc_sse_emit_pandn (p, tmpc, tmp); orc_sse_emit_paddd (p, tmp, dest); } static void sse_rule_convlf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cvtdq2ps (p, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_convld (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cvtdq2pd (p, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_convfd (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cvtps2pd (p, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_convdf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_cvtpd2ps (p, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } #endif #define UNARY_SSE41(opcode,insn_name) \ static void \ sse_rule_ ## opcode ## _sse41 (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } UNARY_SSE41(convsbw,pmovsxbw); UNARY_SSE41(convswl,pmovsxwd); UNARY_SSE41(convslq,pmovsxdq); UNARY_SSE41(convubw,pmovzxbw); UNARY_SSE41(convuwl,pmovzxwd); UNARY_SSE41(convulq,pmovzxdq); void orc_compiler_sse_register_rules (OrcTarget *target) { OrcRuleSet *rule_set; #define REG(x) \ orc_rule_register (rule_set, #x , sse_rule_ ## x, NULL) /* SSE 2 */ #ifndef MMX rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSE2); #else rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_MMX_MMX); #endif orc_rule_register (rule_set, "loadb", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadw", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadl", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadq", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadoffb", sse_rule_loadoffX, NULL); orc_rule_register (rule_set, "loadoffw", sse_rule_loadoffX, NULL); orc_rule_register (rule_set, "loadoffl", sse_rule_loadoffX, NULL); orc_rule_register (rule_set, "loadupdb", sse_rule_loadupdb, NULL); orc_rule_register (rule_set, "loadupib", sse_rule_loadupib, NULL); orc_rule_register (rule_set, "loadpb", sse_rule_loadpX, (void *)1); orc_rule_register (rule_set, "loadpw", sse_rule_loadpX, (void *)2); orc_rule_register (rule_set, "loadpl", sse_rule_loadpX, (void *)4); orc_rule_register (rule_set, "loadpq", sse_rule_loadpX, (void *)8); orc_rule_register (rule_set, "ldresnearl", sse_rule_ldresnearl, NULL); orc_rule_register (rule_set, "ldreslinl", sse_rule_ldreslinl, NULL); orc_rule_register (rule_set, "storeb", sse_rule_storeX, NULL); orc_rule_register (rule_set, "storew", sse_rule_storeX, NULL); orc_rule_register (rule_set, "storel", sse_rule_storeX, NULL); orc_rule_register (rule_set, "storeq", sse_rule_storeX, NULL); REG(addb); REG(addssb); REG(addusb); REG(andb); REG(andnb); REG(avgub); REG(cmpeqb); REG(cmpgtsb); REG(maxub); REG(minub); REG(orb); REG(subb); REG(subssb); REG(subusb); REG(xorb); REG(addw); REG(addssw); REG(addusw); REG(andw); REG(andnw); REG(avguw); REG(cmpeqw); REG(cmpgtsw); REG(maxsw); REG(minsw); REG(mullw); REG(mulhsw); REG(mulhuw); REG(orw); REG(subw); REG(subssw); REG(subusw); REG(xorw); REG(addl); REG(andl); REG(andnl); REG(cmpeql); REG(cmpgtsl); REG(orl); REG(subl); REG(xorl); REG(andq); REG(andnq); REG(orq); REG(xorq); REG(select0ql); REG(select1ql); REG(select0lw); REG(select1lw); REG(select0wb); REG(select1wb); REG(mergebw); REG(mergewl); REG(mergelq); orc_rule_register (rule_set, "copyb", sse_rule_copyx, NULL); orc_rule_register (rule_set, "copyw", sse_rule_copyx, NULL); orc_rule_register (rule_set, "copyl", sse_rule_copyx, NULL); orc_rule_register (rule_set, "copyq", sse_rule_copyx, NULL); orc_rule_register (rule_set, "shlw", sse_rule_shift, (void *)0); orc_rule_register (rule_set, "shruw", sse_rule_shift, (void *)1); orc_rule_register (rule_set, "shrsw", sse_rule_shift, (void *)2); orc_rule_register (rule_set, "shll", sse_rule_shift, (void *)3); orc_rule_register (rule_set, "shrul", sse_rule_shift, (void *)4); orc_rule_register (rule_set, "shrsl", sse_rule_shift, (void *)5); orc_rule_register (rule_set, "shlq", sse_rule_shift, (void *)6); orc_rule_register (rule_set, "shruq", sse_rule_shift, (void *)7); orc_rule_register (rule_set, "shrsq", sse_rule_shrsq, NULL); orc_rule_register (rule_set, "convsbw", sse_rule_convsbw, NULL); orc_rule_register (rule_set, "convubw", sse_rule_convubw, NULL); orc_rule_register (rule_set, "convssswb", sse_rule_convssswb, NULL); orc_rule_register (rule_set, "convsuswb", sse_rule_convsuswb, NULL); orc_rule_register (rule_set, "convuuswb", sse_rule_convuuswb, NULL); orc_rule_register (rule_set, "convwb", sse_rule_convwb, NULL); orc_rule_register (rule_set, "convswl", sse_rule_convswl, NULL); orc_rule_register (rule_set, "convuwl", sse_rule_convuwl, NULL); orc_rule_register (rule_set, "convssslw", sse_rule_convssslw, NULL); orc_rule_register (rule_set, "convql", sse_rule_convql, NULL); orc_rule_register (rule_set, "convslq", sse_rule_convslq, NULL); orc_rule_register (rule_set, "convulq", sse_rule_convulq, NULL); //orc_rule_register (rule_set, "convsssql", sse_rule_convsssql, NULL); orc_rule_register (rule_set, "mulsbw", sse_rule_mulsbw, NULL); orc_rule_register (rule_set, "mulubw", sse_rule_mulubw, NULL); orc_rule_register (rule_set, "mulswl", sse_rule_mulswl, NULL); orc_rule_register (rule_set, "muluwl", sse_rule_muluwl, NULL); orc_rule_register (rule_set, "accw", sse_rule_accw, NULL); orc_rule_register (rule_set, "accl", sse_rule_accl, NULL); orc_rule_register (rule_set, "accsadubl", sse_rule_accsadubl, NULL); #ifndef MMX /* These require the SSE2 flag, although could be used with MMX. That flag is not yet handled. */ orc_rule_register (rule_set, "mululq", sse_rule_mululq, NULL); REG(addq); REG(subq); orc_rule_register (rule_set, "addf", sse_rule_addf, NULL); orc_rule_register (rule_set, "subf", sse_rule_subf, NULL); orc_rule_register (rule_set, "mulf", sse_rule_mulf, NULL); orc_rule_register (rule_set, "divf", sse_rule_divf, NULL); orc_rule_register (rule_set, "minf", sse_rule_minf, NULL); orc_rule_register (rule_set, "maxf", sse_rule_maxf, NULL); orc_rule_register (rule_set, "sqrtf", sse_rule_sqrtf, NULL); orc_rule_register (rule_set, "cmpeqf", sse_rule_cmpeqf, NULL); orc_rule_register (rule_set, "cmpltf", sse_rule_cmpltf, NULL); orc_rule_register (rule_set, "cmplef", sse_rule_cmplef, NULL); orc_rule_register (rule_set, "convfl", sse_rule_convfl, NULL); orc_rule_register (rule_set, "convlf", sse_rule_convlf, NULL); orc_rule_register (rule_set, "addd", sse_rule_addd, NULL); orc_rule_register (rule_set, "subd", sse_rule_subd, NULL); orc_rule_register (rule_set, "muld", sse_rule_muld, NULL); orc_rule_register (rule_set, "divd", sse_rule_divd, NULL); orc_rule_register (rule_set, "mind", sse_rule_mind, NULL); orc_rule_register (rule_set, "maxd", sse_rule_maxd, NULL); orc_rule_register (rule_set, "sqrtd", sse_rule_sqrtd, NULL); orc_rule_register (rule_set, "cmpeqd", sse_rule_cmpeqd, NULL); orc_rule_register (rule_set, "cmpltd", sse_rule_cmpltd, NULL); orc_rule_register (rule_set, "cmpled", sse_rule_cmpled, NULL); orc_rule_register (rule_set, "convdl", sse_rule_convdl, NULL); orc_rule_register (rule_set, "convld", sse_rule_convld, NULL); orc_rule_register (rule_set, "convfd", sse_rule_convfd, NULL); orc_rule_register (rule_set, "convdf", sse_rule_convdf, NULL); #endif /* slow rules */ orc_rule_register (rule_set, "maxuw", sse_rule_maxuw_slow, NULL); orc_rule_register (rule_set, "minuw", sse_rule_minuw_slow, NULL); orc_rule_register (rule_set, "avgsb", sse_rule_avgsb_slow, NULL); orc_rule_register (rule_set, "avgsw", sse_rule_avgsw_slow, NULL); orc_rule_register (rule_set, "maxsb", sse_rule_maxsb_slow, NULL); orc_rule_register (rule_set, "minsb", sse_rule_minsb_slow, NULL); orc_rule_register (rule_set, "maxsl", sse_rule_maxsl_slow, NULL); orc_rule_register (rule_set, "minsl", sse_rule_minsl_slow, NULL); orc_rule_register (rule_set, "maxul", sse_rule_maxul_slow, NULL); orc_rule_register (rule_set, "minul", sse_rule_minul_slow, NULL); orc_rule_register (rule_set, "convlw", sse_rule_convlw, NULL); orc_rule_register (rule_set, "signw", sse_rule_signw_slow, NULL); orc_rule_register (rule_set, "absb", sse_rule_absb_slow, NULL); orc_rule_register (rule_set, "absw", sse_rule_absw_slow, NULL); orc_rule_register (rule_set, "absl", sse_rule_absl_slow, NULL); orc_rule_register (rule_set, "swapw", sse_rule_swapw, NULL); orc_rule_register (rule_set, "swapl", sse_rule_swapl, NULL); orc_rule_register (rule_set, "swapwl", sse_rule_swapwl, NULL); orc_rule_register (rule_set, "swapq", sse_rule_swapq, NULL); orc_rule_register (rule_set, "swaplq", sse_rule_swaplq, NULL); orc_rule_register (rule_set, "splitql", sse_rule_splitql, NULL); orc_rule_register (rule_set, "splitlw", sse_rule_splitlw, NULL); orc_rule_register (rule_set, "splitwb", sse_rule_splitwb, NULL); orc_rule_register (rule_set, "avgsl", sse_rule_avgsl, NULL); orc_rule_register (rule_set, "avgul", sse_rule_avgul, NULL); orc_rule_register (rule_set, "shlb", sse_rule_shlb, NULL); orc_rule_register (rule_set, "shrsb", sse_rule_shrsb, NULL); orc_rule_register (rule_set, "shrub", sse_rule_shrub, NULL); orc_rule_register (rule_set, "mulll", sse_rule_mulll_slow, NULL); #ifndef MMX orc_rule_register (rule_set, "mulhsl", sse_rule_mulhsl_slow, NULL); orc_rule_register (rule_set, "mulhul", sse_rule_mulhul, NULL); orc_rule_register (rule_set, "mulslq", sse_rule_mulslq_slow, NULL); #endif orc_rule_register (rule_set, "mullb", sse_rule_mullb, NULL); orc_rule_register (rule_set, "mulhsb", sse_rule_mulhsb, NULL); orc_rule_register (rule_set, "mulhub", sse_rule_mulhub, NULL); orc_rule_register (rule_set, "addssl", sse_rule_addssl_slow, NULL); orc_rule_register (rule_set, "subssl", sse_rule_subssl_slow, NULL); orc_rule_register (rule_set, "addusl", sse_rule_addusl_slow, NULL); orc_rule_register (rule_set, "subusl", sse_rule_subusl_slow, NULL); orc_rule_register (rule_set, "convhwb", sse_rule_convhwb, NULL); orc_rule_register (rule_set, "convhlw", sse_rule_convhlw, NULL); orc_rule_register (rule_set, "splatw3q", sse_rule_splatw3q, NULL); orc_rule_register (rule_set, "splatbw", sse_rule_splatbw, NULL); orc_rule_register (rule_set, "splatbl", sse_rule_splatbl, NULL); orc_rule_register (rule_set, "div255w", sse_rule_div255w, NULL); orc_rule_register (rule_set, "divluw", sse_rule_divluw, NULL); /* SSE 3 -- no rules */ /* SSSE 3 */ rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSSE3); #ifndef MMX orc_rule_register (rule_set, "signb", sse_rule_signX_ssse3, (void *)0); orc_rule_register (rule_set, "signw", sse_rule_signX_ssse3, (void *)1); orc_rule_register (rule_set, "signl", sse_rule_signX_ssse3, (void *)2); #endif REG(absb); REG(absw); REG(absl); #ifndef MMX orc_rule_register (rule_set, "swapw", sse_rule_swapw_ssse3, NULL); orc_rule_register (rule_set, "swapl", sse_rule_swapl_ssse3, NULL); orc_rule_register (rule_set, "swapwl", sse_rule_swapwl_ssse3, NULL); orc_rule_register (rule_set, "swapq", sse_rule_swapq_ssse3, NULL); orc_rule_register (rule_set, "select0lw", sse_rule_select0lw_ssse3, NULL); orc_rule_register (rule_set, "select1lw", sse_rule_select1lw_ssse3, NULL); orc_rule_register (rule_set, "select0wb", sse_rule_select0wb_ssse3, NULL); orc_rule_register (rule_set, "select1wb", sse_rule_select1wb_ssse3, NULL); #endif /* SSE 4.1 */ rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSE4_1); REG(maxsb); REG(minsb); REG(maxuw); REG(minuw); REG(maxsl); REG(maxul); REG(minsl); REG(minul); REG(mulll); orc_rule_register (rule_set, "convsbw", sse_rule_convsbw_sse41, NULL); orc_rule_register (rule_set, "convswl", sse_rule_convswl_sse41, NULL); orc_rule_register (rule_set, "convslq", sse_rule_convslq_sse41, NULL); orc_rule_register (rule_set, "convubw", sse_rule_convubw_sse41, NULL); orc_rule_register (rule_set, "convuwl", sse_rule_convuwl_sse41, NULL); orc_rule_register (rule_set, "convulq", sse_rule_convulq_sse41, NULL); orc_rule_register (rule_set, "convsuslw", sse_rule_convsuslw, NULL); orc_rule_register (rule_set, "mulslq", sse_rule_mulslq, NULL); #ifndef MMX orc_rule_register (rule_set, "mulhsl", sse_rule_mulhsl, NULL); #endif REG(cmpeqq); /* SSE 4.2 -- no rules */ rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSE4_2); REG(cmpgtsq); /* SSE 4a -- no rules */ }