diff options
author | Søren Sandmann Pedersen <ssp@redhat.com> | 2013-09-05 18:14:11 -0400 |
---|---|---|
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2013-09-05 18:15:11 -0400 |
commit | 0c75782895aa8c8454f864a24c57549b36d7e6ac (patch) | |
tree | 0071eade44cb8c0fb962f71f0b3793c366f43f55 | |
parent | ddf2aa26a7fbe94f0c9403af7546c756db1a80e4 (diff) |
more iterjit
-rw-r--r-- | iterjit.c | 293 |
1 files changed, 285 insertions, 8 deletions
@@ -1,4 +1,21 @@ /* + - We need an "outer" driver from which constants can be requested. + API: + outer->register_constant_4x32 (outer, jit, 0x01010101) + outer->get_constant_4x32 (outer, jit, 0x01010101) + + The outer driver will: + - at the end of the kernel, it will generate a sequence of constants that + can be addressed in a RIP relative way (What about x32?) + - at register time, it will allocate a register and load the constant + - get_constant() will then return either a memory location or a register + + - Register allocator should support "get location" of a register. If + the register is spilled, the returned op will be a memory location. + If not, it will be a register. + + + Flow: - outer loop: - generates outer loop @@ -100,9 +117,11 @@ */ +#define _GNU_SOURCE #include <stdio.h> #include <stddef.h> #include <pixman.h> +#include <string.h> #include "simplex86.h" #include "simple-reg.h" #include "stack-man.h" @@ -316,26 +335,31 @@ struct jit_src_iter_t struct jit_combiner_t { /* combine hook: with this change it returns nothing and additionally receives the number of pixels to handle per call */ - reg_t (* combine) (jit_combiner_t *combiner, jit_t *jit, - jit_src_iter_t *src, - jit_src_iter_t *mask, - jit_dest_iter_t *dest); + void (* combine) (jit_combiner_t *combiner, jit_t *jit, + jit_src_iter_t *src, + jit_src_iter_t *mask, + jit_dest_iter_t *dest, + int n_pixels); }; struct jit_dest_iter_t { void (* begin) (jit_dest_iter_t *dest, jit_t *jit, reg_t info); - op_t (* load_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); - void (* advance_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); void (* process_line) (jit_dest_iter_t *dest_iter, jit_t *jit, jit_src_iter_t *src_iter, jit_src_iter_t *mask_iter, jit_combiner_t *combiner); + reg_t (* load_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); + void (* write_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels, reg_t reg); + void (* advance_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); void (* end) (jit_dest_iter_t *dest, jit_t *jit); reg_t line; reg_t stride; + reg_t d; /* per-line cursor: copied from line at the start of process_line and stepped by advance_pixels */ + reg_t 
w; /* per-line counter: copied from width in process_line and reduced by n_pixels in each emitted loop body */ + reg_t width; /* filled in begin() from the width member of pixman_composite_info_t */ }; #define MEMBER(variable, type, member) \ @@ -512,6 +536,11 @@ dest_a8r8g8b8_begin (jit_dest_iter_t * dest, jit_free_gp (jit, info); jit_free_gp (jit, image); jit_free_gp (jit, tmp); + + dest->width = jit_alloc_gp (jit); + BEGIN_ASM (jit->fragment) + I_mov, dest->width, MEMBER (info, pixman_composite_info_t, width), + END_ASM (); } static void @@ -521,12 +550,149 @@ dest_a8r8g8b8_process_line (jit_dest_iter_t *dest, jit_src_iter_t *mask, jit_combiner_t *combiner) { + int n_pixels[] = { 1, 2, 4, 2, 1 }; /* pixels handled per iteration by each of the five loops emitted below, in order */ + int i; + src->begin_line (src, jit); mask->begin_line (mask, jit); + jit_switch_group (jit, "dest"); + + jit_reload_gp (jit, dest->line); + jit_reload_gp (jit, dest->stride); + /* d starts at the current line and w at the line width; line itself is then stepped by stride for the next call */ + dest->d = jit_alloc_gp (jit); + dest->w = jit_alloc_gp (jit); + BEGIN_ASM (jit->fragment) + I_mov, dest->d, dest->line, + I_mov, dest->w, dest->width, + I_add, dest->line, dest->stride, + END_ASM (); + + for (i = 0; i < sizeof (n_pixels) / sizeof (n_pixels[0]); ++i) + { + char *loop, *test, *done; + + asprintf (&loop, "horz_%d_loop", i); /* FIXME OOM */ + asprintf (&test, "horz_%d_test", i); /* FIXME OOM */ + asprintf (&done, "horz_%d_done", i); /* FIXME OOM */ + + jit_switch_group (jit, "dest"); + jit_reload_gp (jit, dest->w); + + BEGIN_ASM (jit->fragment) + I_jmp, LABEL (test), /* NOTE(review): LABEL (loop) is the target of the I_jge below, but only test and done get a DEFINE_LABEL in this hunk; presumably DEFINE_LABEL (loop) belongs here, before the I_sub - confirm */ + I_sub, dest->w, IMM (n_pixels[i]), + END_ASM (); - + combiner->combine (combiner, jit, src, mask, dest, n_pixels[i]); + + BEGIN_ASM (jit->fragment) + DEFINE_LABEL (test), + END_ASM (); + + jit_switch_group (jit, "dest"); + if (i < 2) /* only the first two stages (1 and 2 pixels) emit the address test on d */ + { + jit_reload_gp (jit, dest->d); + BEGIN_ASM (jit->fragment) + /* If not misaligned, then skip to next block */ + I_test, dest->d, IMM (n_pixels[i] * 4 * 2 - 1), + I_jz, LABEL (done), + END_ASM (); + } + + jit_reload_gp (jit, dest->w); + BEGIN_ASM (jit->fragment) + I_cmp, dest->w, IMM (n_pixels[i]), + I_jge, LABEL (loop), + DEFINE_LABEL (done), + END_ASM (); + + free (loop); + free (test); + free (done); + } + + jit_free_gp (jit, dest->d); + jit_free_gp (jit, 
dest->w); +} + /* Emits a move from the address held in d into a freshly allocated xmm register, picking movd, movq or movdqa for 1, 2 or 4 pixels, and returns that register. NOTE(review): r is passed to jit_free_xmm() before it is returned, and move is left unset for any other n_pixels - confirm both are intended. */ +static reg_t +dest_a8r8g8b8_load_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels) +{ + instruction_t move; + reg_t r; + + jit_switch_group (jit, "dest"); + jit_reload_gp (jit, dest->d); + r = jit_alloc_xmm (jit); + + switch (n_pixels) + { + case 1: + move = I_movd; + break; + + case 2: + move = I_movq; + break; + + case 4: + move = I_movdqa; + break; + } + + BEGIN_ASM (jit->fragment) + move, r, PTR (dest->d), + END_ASM (); + + jit_free_xmm (jit, r); + + return r; +} + /* Emits a movd, movq or movdqa (for 1, 2 or 4 pixels) between reg and the address held in d. NOTE(review): the operands are emitted as (reg, PTR (dest->d)), the same order load_pixels uses; if the first operand is the destination, as the I_mov uses elsewhere in this file suggest, this emits a load rather than a store - confirm. move is also left unset for any other n_pixels. */ +static void +dest_a8r8g8b8_write_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels, reg_t reg) +{ + instruction_t move; + + switch (n_pixels) + { + case 1: + move = I_movd; + break; + + case 2: + move = I_movq; + break; + + case 4: + move = I_movdqa; + break; + } + + jit_switch_group (jit, "dest"); + jit_reload_gp (jit, dest->d); + + BEGIN_ASM (jit->fragment) + move, reg, PTR (dest->d), + END_ASM(); +} + /* Emits an add of n_pixels * 4 bytes to d. */ +static void +dest_a8r8g8b8_advance_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels) +{ + int n_bytes; + + jit_switch_group (jit, "dest"); + jit_reload_gp (jit, dest->d); + + n_bytes = n_pixels * 4; + + BEGIN_ASM (jit->fragment) + I_add, dest->d, IMM (n_bytes), END_ASM (); } @@ -537,17 +703,128 @@ dest_a8r8g8b8_end (jit_dest_iter_t *dest, jit_t *jit) } jit_dest_iter_t * -dest_iter_create_a8r8g8b8 (jit_dest_iter_t *dest) +dest_iter_create_a8r8g8b8 (void) { jit_dest_iter_t *iter = malloc (sizeof *iter); /* FIXME OOM */ iter->begin = dest_a8r8g8b8_begin; iter->process_line = dest_a8r8g8b8_process_line; + iter->load_pixels = dest_a8r8g8b8_load_pixels; + iter->write_pixels = dest_a8r8g8b8_write_pixels; + iter->advance_pixels = dest_a8r8g8b8_advance_pixels; iter->end = dest_a8r8g8b8_end; return iter; } +/* combiner */ /* Emits the code for one combine step over n_pixels pixels: loads and advances the source, derives 16-bit per-channel factors from the source pixels (unpack, word shuffle, XOR with 0x00ff), scales the unpacked destination by them, repacks it, adds the source with unsigned byte saturation, then writes back and advances the destination. The combiner and mask arguments are not used. NOTE(review): s and d are used as xmm registers here but are passed to jit_preserve_gp() - confirm that is the intended call. */ +static void +combine_over (jit_combiner_t *combiner, jit_t *jit, + jit_src_iter_t *src, jit_src_iter_t *mask, + jit_dest_iter_t *dest, int n_pixels) +{ + reg_t s, d, m_hi, m_lo, d_hi, d_lo, zero; + reg_t m00ff, m0101, m0080; + + s = src->load_pixels 
(src, jit, n_pixels); + + src->advance_pixels (src, jit, n_pixels); + + jit_switch_group (jit, "combiner"); + jit_preserve_gp (jit, s); + + m00ff = zero = jit_alloc_xmm (jit); /* one register serves as zero first and as the 0x00ff constant afterwards */ + m_hi = jit_alloc_xmm (jit); + m_lo = jit_alloc_xmm (jit); + + BEGIN_ASM (jit->fragment) + /* Generate zero */ + I_pxor, zero, zero, + + /* Expand source */ + I_movdqa, m_hi, s, + I_movdqa, m_lo, s, + I_punpckhbw, m_hi, zero, + I_punpcklbw, m_lo, zero, /* NOTE(review): the four shuffles below pass a bare 0xff where other immediates in this file use IMM (...), and name only one register - confirm against the assembler macros */ + I_pshuflw, m_hi, 0xff, + I_pshuflw, m_lo, 0xff, + I_pshufhw, m_hi, 0xff, + I_pshufhw, m_lo, 0xff, + + /* Negate mask */ + I_pcmpeqw, m00ff, m00ff, + I_psrlw, m00ff, IMM (8), + I_pxor, m_lo, m00ff, + I_pxor, m_hi, m00ff, + END_ASM (); + + jit_free_xmm (jit, zero); + + d = dest->load_pixels (dest, jit, n_pixels); + + jit_switch_group (jit, "combiner"); + jit_preserve_gp (jit, d); + + m0080 = zero = jit_alloc_xmm (jit); /* again one register: zero for the unpack, then overwritten with the 0x0080 constant */ + m0101 = jit_alloc_xmm (jit); + d_hi = jit_alloc_xmm (jit); + d_lo = d; + + BEGIN_ASM (jit->fragment) + /* Unpack dest */ + I_pxor, zero, zero, + I_movdqa, d_hi, d, + I_punpckhbw, d_hi, zero, + I_punpcklbw, d_lo, zero, + + /* Generate 0101 */ + I_pcmpeqw, m0101, m0101, + I_psrlw, m0101, IMM (15), + I_packuswb, m0101, m0101, + + /* Generate 0080 */ + I_pcmpeqw, m0080, m0080, + I_psrlw, m0080, IMM (15), + I_psllw, m0080, IMM (7), + + /* Multiply */ + I_pmullw, d_hi, m_hi, + I_paddusw, d_hi, m0080, + I_pmulhw, d_hi, m0101, + + I_pmullw, d_lo, m_lo, + I_paddusw, d_lo, m0080, + I_pmulhw, d_lo, m0101, + + /* Pack */ + I_packuswb, d_lo, d_hi, + + /* Add */ + I_paddusb, d, s, + END_ASM(); + + jit_free_xmm (jit, m0080); + jit_free_xmm (jit, m0101); + jit_free_xmm (jit, m_hi); + jit_free_xmm (jit, m_lo); + jit_free_xmm (jit, d_lo); /* NOTE(review): d_lo aliases d, which is still handed to write_pixels below - confirm the allocator permits use after this free */ + jit_free_xmm (jit, d_hi); + jit_free_xmm (jit, s); + + dest->write_pixels (dest, jit, n_pixels, d); + dest->advance_pixels (dest, jit, n_pixels); +} + /* Allocates a jit_combiner_t and points its combine hook at combine_over; the malloc result is used unchecked (see the FIXME). */ +jit_combiner_t * +combiner_create_over (void) +{ + jit_combiner_t *combiner = malloc (sizeof *combiner); /* FIXME OOM */ + + combiner->combine = 
combine_over; + + return combiner; +} + void generate_kernel (jit_t *jit, jit_src_iter_t *src_iter, |