#include #include #include #include #include #include #include #include "codex86.h" #include #define TRUE 1 #define FALSE 0 typedef void (* CompFunc) ( uint32_t width, uint32_t height, void * src_start, void * mask_start, void * dest_start, int32_t src_stride, int32_t mask_stride, int32_t dest_stride); typedef enum { MMX = (1 << 0), MMX_EXTENSIONS = (1 << 1), /* The MMX part of SSE */ SSE = (1 << 2), SSE2 = (1 << 3), CMOV = (1 << 4) } CPUFeatures; static CPUFeatures detect_cpu (Asm *a) { CPUFeatures result = 0; char vendor[13] = { 0 }; uint32_t features = 0; uint8_t *code; /* Stack layout * * uint32_t * vendor2 ebp + 20 * uint32_t * vendor1 ebp + 16 * uint32_t * vendor0 ebp + 12 * uint32_t * features ebp + 8 * uint32_t * ret_addr ebp + 4 * pointer old_ebp <- ebp * */ /* see p. 118 of amd64 instruction set manual Vol3 */ asm_function_preamble (a); x86_pushf (a); x86_pop (a, x86_eax()); x86_mov (a, x86_eax(), x86_ecx()); x86_xor (a, x86_imm (0x00200000), x86_eax()); x86_push (a, x86_eax()); x86_popf (a); x86_pushf (a); x86_pop (a, x86_eax()); x86_mov (a, x86_imm (0x00000000), x86_edx()); x86_xor (a, x86_ecx(), x86_eax()); x86_jz (a, "no_cpuid"); x86_mov (a, x86_imm (0x00000000), x86_eax()); x86_push (a, x86_ebx()); x86_cpuid (a); x86_mov (a, x86_ebx(), x86_eax()); x86_pop (a, x86_ebx()); printf ("%p\n", &(vendor[0])); x86_mov (a, x86_eax(), x86_address (&(vendor[0]))); x86_mov (a, x86_edx(), x86_address (&(vendor[4]))); x86_mov (a, x86_ecx(), x86_address (&(vendor[8]))); x86_mov (a, x86_imm (0x00000001), x86_eax()); x86_push (a, x86_ebx()); x86_cpuid (a); x86_pop (a, x86_ebx()); asm_label (a, "no_cpuid"); x86_mov (a, x86_edx(), x86_address (&features)); asm_function_postamble (a); if ((code = asm_emit (a))) { typedef void (* DetectFeatures) (void); ((DetectFeatures)code) (); asm_free_code (code); } if (features & (1 << 15)) result |= CMOV; if (features & (1 << 23)) result |= MMX; if (features & (1 << 25)) result |= SSE; if (features & (1 << 26)) result |= SSE2; if (features & MMX && ((strcmp (vendor, "AuthenticAMD") == 0) || (strcmp (vendor, "Geode by NSC") == 0))) { asm_function_preamble (a); x86_mov (a, x86_imm (0x80000000), x86_eax()); x86_cpuid (a); x86_xor (a, x86_edx(), x86_edx()); x86_cmp (a, x86_imm (0x1), x86_eax()); x86_jge (a, "no_amd"); x86_mov (a, x86_imm (0x80000001), x86_eax()); x86_cpuid (a); /* label notamd */ asm_label (a, "no_amd"); x86_pop (a, x86_ebx()); x86_mov (a, x86_edx(), x86_eax()); asm_function_postamble (a); if ((code = asm_emit (a))) { typedef int (* DetectMMXExtensions) (void); if (((DetectMMXExtensions)code)()) result |= MMX_EXTENSIONS; asm_free_code (code); } } printf ("Vendor string: %s\n", vendor); if (result & SSE) result |= MMX_EXTENSIONS; printf ("CPU Features:\n"); printf (" MMX: %s\n", (result & MMX)? "yes" : " no"); printf (" MMX Extensions: %s\n", (result & MMX_EXTENSIONS)? "yes" : " no"); printf (" SSE: %s\n", (result & SSE)? "yes" : " no"); printf (" SSE2: %s\n", (result & SSE2)? "yes" : " no"); printf (" CMOV: %s\n", (result & CMOV)? "yes" : " no"); return result; } static int get_shift (bytes) { switch (bytes) { case 1: return 0; case 2: return 1; case 4: return 2; case 8: return 3; default: assert (0); break; } return 0; } static void do_print (const char *s) { printf ("%p\n", s); } static int n_pixel (pixman_format_code_t src, pixman_format_code_t mask, pixman_format_code_t dest) { return 100; } static void div_255 (Asm *a, op_t target, op_t x0080, op_t tmp) { /* This is a clever way to divide by 255 */ x86_paddusw (a, x0080, target); x86_movdqa (a, target, tmp); x86_psrlw (a, tmp, x86_imm (8)); x86_paddusw (a, tmp, target); x86_psrlw (a, target, x86_imm (8)); } /* * The render equation is * * (src IN mask) OP dst * * With component alpha, alpha must be considered per component, * for both IN and OP. Ie., the output of (src IN mask) should * treated as if it had different alpha values for each component. * * So the algorithm is * * if (component_alpha && need_srca (op)) * { * srca = expand_alpha(src); * srca = srca * mask; * * src = src in mask; * dest = op (src, srca, dest); * } * else if (component_alpha && !need_srca (op)) * { * src = src in mask; * dest = op (src, dest); * } * else if (!component_alpha && need_srca (op)) * { * srca = expand_alpha (src); * * src = src in mask; * dest = op (src, srca, dest); * } * else if (!component_alpha && !need_srca (op)) * { * src = src in mask; * dest = op (src, dest); * } * else { assert (0); } * */ static pixman_bool_t generate_func (Asm *a, pixman_op_t op, pixman_format_code_t src_format, pixman_format_code_t mask_format, pixman_format_code_t dest_format) { /* Stack layout * * int32_t dest_stride ebp + 36 in bytes * int32_t mask_stride ebp + 32 in bytes * int32_t src_stride ebp + 28 in bytes * pointer dest_start ebp + 24 * pointer mask_start ebp + 20 * pointer src_start ebp + 16 * uint32_t height ebp + 12 * uint32_t width ebp + 8 * pointer ret_addr ebp + 4 * pointer old_ebp <- ebp * * Everything else is known statically at compile time * */ op_t width_arg = x86_membase (x86_ebp(), 8); op_t height_arg = x86_membase (x86_ebp(), 12); op_t src_start_arg = x86_membase (x86_ebp(), 16); op_t mask_start_arg = x86_membase (x86_ebp(), 20); op_t dest_start_arg = x86_membase (x86_ebp(), 24); op_t src_stride_arg = x86_membase (x86_ebp(), 28); op_t mask_stride_arg = x86_membase (x86_ebp(), 32); op_t dest_stride_arg = x86_membase (x86_ebp(), 36); op_t src_line; op_t mask_line; op_t dest_line; op_t w, h; op_t s, m, d, tmp, vs, vd, vz, x0080, x00ff; op_t s1, s2, m1, d1; int s_bytes = PIXMAN_FORMAT_BPP (src_format) / 8; int m_bytes = PIXMAN_FORMAT_BPP (mask_format) / 8; int d_bytes = PIXMAN_FORMAT_BPP (dest_format) / 8; /* For now we only support Over_8888x8x8888, and only aligned */ if (src_format != PIXMAN_a8r8g8b8) return FALSE; if (mask_format != PIXMAN_a8) return FALSE; if (dest_format != PIXMAN_a8r8g8b8) return FALSE; if (op != PIXMAN_OP_OVER) return FALSE; /* Preamble */ asm_function_preamble (a); /* Allocate registers */ src_line = x86_ebx(); mask_line = x86_ecx(); dest_line = x86_edx(); w = x86_esi(); h = x86_edi(); tmp = x86_eax(); x86_mov (a, height_arg, h); /* Set up the lines */ x86_mov (a, src_start_arg, src_line); x86_mov (a, mask_start_arg, mask_line); x86_mov (a, dest_start_arg, dest_line); /* Add width to the lines and negate width so that we can iterate forward */ x86_mov (a, width_arg, tmp); x86_lea (a, x86_memindex (src_line, 0, tmp, get_shift (s_bytes)), src_line); x86_lea (a, x86_memindex (mask_line, 0, tmp, get_shift (m_bytes)), mask_line); x86_lea (a, x86_memindex (dest_line, 0, tmp, get_shift (d_bytes)), dest_line); x86_neg (a, width_arg); /* Allocate some constants in registers */ vz = x86_xmm0(); /* zero */ x86_pxor (a, vz, vz); x0080 = x86_xmm1(); /* 0080 0080 0080 0080 ... */ x86_pcmpeqw (a, x0080, x0080); x86_psrlw (a, x0080, x86_imm (15)); x86_psllw (a, x0080, x86_imm (7)); x00ff = x86_xmm2(); /* 00ff 00ff 00ff 00ff ... */ x86_pcmpeqw (a, x00ff, x00ff); x86_psrlw (a, x00ff, x86_imm (8)); /* Outer loop header */ asm_label (a, "row_loop"); x86_mov (a, width_arg, w); #if 0 /* Call function to deal with unaligned leading pixels */ x86_call_label (a, "unaligned_pixels"); #endif /* Jump to column test as w may be zero now */ x86_jmp (a, "col_test"); /* Inner loop header */ asm_label (a, "col_loop"); /* Setup pixel variables */ s = x86_memindex (src_line, 0, w, get_shift (s_bytes)); m = x86_memindex (mask_line, 0, w, get_shift (m_bytes)); d = x86_memindex (dest_line, 0, w, get_shift (d_bytes)); /* Read pixels into SSE registers */ vs = x86_xmm3(); /* Read source */ #if 0 x86_pusha (a); x86_lea (a, s, tmp); x86_push (a, tmp); x86_call (a, x86_imm (do_print)); x86_pop (a, tmp); x86_mov (a, src_stride_arg, tmp); x86_push (a, tmp); x86_call (a, x86_imm (do_print)); x86_pop (a, tmp); x86_popa (a); #endif x86_movdqa (a, s, vs); s1 = x86_xmm4(); s2 = x86_xmm5(); m1 = x86_xmm6(); /* Read mask */ x86_mov (a, m, tmp); /* Process (src in mask) for the two first pixels */ x86_movd (a, tmp, m1); x86_punpcklbw (a, vz, m1); x86_pshuflw (a, m1, m1, x86_imm (0x50)); x86_pshufd (a, m1, m1, x86_imm (0x50)); x86_movdqa (a, vs, s1); x86_punpcklbw (a, vz, s1); x86_pmullw (a, m1, s1); div_255 (a, s1, x0080, m1); /* Process (src in mask) for the two next pixels */ x86_shr (a, tmp, x86_imm (16)); x86_movd (a, tmp, m1); x86_punpcklbw (a, vz, m1); x86_pshuflw (a, m1, m1, x86_imm (0x50)); x86_pshufd (a, m1, m1, x86_imm (0x50)); x86_movdqa (a, vs, s2); x86_punpckhbw (a, vz, s2); x86_pmullw (a, m1, s2); div_255 (a, s2, x0080, m1); /* Pack s1 and s2 into s */ x86_movdqa (a, s1, vs); x86_packuswb (a, s2, vs); /* Turn s1 and s2 into src_alphas */ x86_pshuflw (a, s1, s1, x86_imm (0xff)); x86_pshufhw (a, s1, s1, x86_imm (0xff)); x86_pshuflw (a, s2, s2, x86_imm (0xff)); x86_pshufhw (a, s2, s2, x86_imm (0xff)); /* Negate them */ x86_pxor (a, x00ff, s1); x86_pxor (a, x00ff, s2); /* Read destination */ vd = m1; x86_movdqa (a, d, vd); d1 = x86_xmm7(); /* Multiply first two destination pixels onto s1 */ x86_movdqa (a, vd, d1); x86_punpcklbw (a, vz, d1); x86_pmullw (a, d1, s1); div_255 (a, s1, x0080, d1); /* Multiply second two destination pixels onto s2 */ x86_movdqa (a, vd, d1); x86_punpckhbw (a, vz, d1); x86_pmullw (a, d1, s2); div_255 (a, s2, x0080, d1); /* Pack s1 and s2 into s1 */ x86_packuswb (a, s2, s1); /* Add source onto destination */ x86_paddusb (a, vs, s1); /* Finally store it */ x86_movdqa (a, s1, d); /* Inner test */ x86_add (a, x86_imm (4), w); asm_label (a, "col_test"); x86_cmp (a, x86_imm (-4), w); x86_jle_s (a, "col_loop"); #if 0 /* Deal with unaligned trailing pixels */ x86_call_label (a, "unaligned_pixels"); #endif /* Update lines */ x86_add (a, src_stride_arg, src_line); x86_add (a, mask_stride_arg, mask_line); x86_add (a, dest_stride_arg, dest_line); /* Outer test */ x86_sub (a, x86_imm (1), h); x86_jnz (a, "row_loop"); /* Emms */ x86_emms (a); /* Postamble */ asm_function_postamble (a); asm_label (a, "do_unaligned_pixel"); x86_add (a, x86_imm (1), w); asm_label (a, "unaligned_pixels"); x86_cmp (a, x86_imm (0), w); x86_je (a, "unaligned_done"); x86_cmp (a, x86_imm (-4), w); x86_jg_s (a, "do_unaligned_pixel"); x86_lea (a, s, tmp); x86_test (a, x86_imm (s_bytes * 4 - 1), tmp); x86_jne (a, "do_unaligned_pixel"); x86_lea (a, m, tmp); x86_test (a, x86_imm (m_bytes * 4 - 1), tmp); x86_jne (a, "do_unaligned_pixel"); x86_lea (a, d, tmp); x86_test (a, x86_imm (d_bytes * 4 - 1), tmp); x86_jne (a, "do_unaligned_pixel"); asm_label (a, "unaligned_done"); x86_ret (a); return TRUE; } static void do_test (Asm *a) { uint8_t *src, *mask, *dest; CompFunc code; printf ("asdf\n"); detect_cpu (a); printf ("asdf\n"); if (generate_func (a, PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8)) { if ((code = (CompFunc)asm_emit (a))) { src = NULL; mask = NULL; dest = NULL; int i; int w = 6400; int h = 1400; int s = 6400; struct timeval before; struct timeval after; int n_usec; uint64_t n_pixels; #define N_TIMES 200 #define UNALIGN 0 if (posix_memalign ((void **)&src, 4096, s * h * 4) < 0 || posix_memalign ((void **)&mask, 4096, s * h * 1) < 0 || posix_memalign ((void **)&dest, 4096, s * h * 4)) { printf ("a memalign failed\n"); } if (mlock (src, s * h * 4) != 0 || mlock (mask, s * h * 1) != 0 || mlock (dest, s * h * 4) != 0) { printf ("Can't mlock(): run as root to get more reliable timings\n"); } src += UNALIGN; mask += UNALIGN; dest += UNALIGN; printf ("source is at %p - %p\n", src, (char *)src + s * h * 4); printf ("mask is at %p - %p\n", mask, (char *)mask + s * h * 1); printf ("dest is at %p - %p\n", dest, (char *)dest + s * h * 4); for (i = 0; i < s * h; ++i) { ((uint32_t *)src) [i] = 0xff0000ff; ((uint8_t *)mask)[i] = 0x7f; ((uint32_t *)dest)[i] = 0xffff0000; } gettimeofday (&before, NULL); for (i = 0; i < N_TIMES; ++i) { code (w, h, src, mask, dest, s * 4, s * 1, s * 4); } gettimeofday (&after, NULL); n_pixels = N_TIMES * (w * (uint64_t)h); n_usec = (after.tv_sec * 1000000 + after.tv_usec) - (before.tv_sec * 1000000 + before.tv_usec); printf ("Time: %.3f seconds, Pixels: %llu. MPixels per sec: %f\n", n_usec / 1000000.0, n_pixels, (n_pixels / (1024 * 1024.0)) / ((double)n_usec / 1000000.0)); src -= UNALIGN; mask -= UNALIGN; dest -= UNALIGN; free (src); free (mask); free (dest); } } else { printf ("Failed to generate code\n"); } } int main () { Asm *a = asm_new (); do_test (a); printf ("O hai wurld. The size of an op is %d\n", sizeof (op_t)); return 0; }