summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSøren Sandmann Pedersen <ssp@redhat.com>2013-09-05 18:14:11 -0400
committerSøren Sandmann Pedersen <ssp@redhat.com>2013-09-05 18:15:11 -0400
commit0c75782895aa8c8454f864a24c57549b36d7e6ac (patch)
tree0071eade44cb8c0fb962f71f0b3793c366f43f55
parentddf2aa26a7fbe94f0c9403af7546c756db1a80e4 (diff)
more iterjit
-rw-r--r--iterjit.c293
1 files changed, 285 insertions, 8 deletions
diff --git a/iterjit.c b/iterjit.c
index b188f92..77d2300 100644
--- a/iterjit.c
+++ b/iterjit.c
@@ -1,4 +1,21 @@
/*
+ - We need an "outer" driver from which constants can be requested.
+ API:
+ outer->register_constant_4x32 (outer, jit, 0x01010101)
+ outer->get_constant_4x32 (outer, jit, 0x01010101)
+
+ The outer driver will:
+ - at the end of the kernel, generate a sequence of constants that
+ can be addressed in a RIP-relative way (what about x32?)
+ - at register time, allocate a register and load the constant
+ - have get_constant() return either a memory location or a register
+
+ - Register allocator should support "get location" of a register. If
+ the register is spilled, the returned op will be a memory location.
+ If not, it will be a register.
+
+
+
Flow:
- outer loop:
- generates outer loop
@@ -100,9 +117,11 @@
*/
+#define _GNU_SOURCE
#include <stdio.h>
#include <stddef.h>
#include <pixman.h>
+#include <string.h>
#include "simplex86.h"
#include "simple-reg.h"
#include "stack-man.h"
@@ -316,26 +335,31 @@ struct jit_src_iter_t
/* Combiner interface: a single code-generation hook. combine() is called by
 * the destination iterator's process_line once per horizontal phase and emits
 * the code that combines n_pixels pixels from src (and mask) into dest. After
 * this change it returns nothing; the result is written through dest. */
struct jit_combiner_t
{
- reg_t (* combine) (jit_combiner_t *combiner, jit_t *jit,
- jit_src_iter_t *src,
- jit_src_iter_t *mask,
- jit_dest_iter_t *dest);
+ void (* combine) (jit_combiner_t *combiner, jit_t *jit,
+ jit_src_iter_t *src,
+ jit_src_iter_t *mask,
+ jit_dest_iter_t *dest,
+ int n_pixels);
};
/* Destination iterator: code-generation hooks plus the registers that the
 * generated code uses to walk the destination image. */
struct jit_dest_iter_t
{
void (* begin) (jit_dest_iter_t *dest, jit_t *jit, reg_t info);
- op_t (* load_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels);
- void (* advance_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels);
void (* process_line) (jit_dest_iter_t *dest_iter,
jit_t *jit,
jit_src_iter_t *src_iter,
jit_src_iter_t *mask_iter,
jit_combiner_t *combiner);
+ reg_t (* load_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); /* emit a load of n_pixels at the current position; returns the register holding them */
+ void (* write_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels, reg_t reg); /* emit the write-back of n_pixels from reg at the current position */
+ void (* advance_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); /* emit code moving the current position n_pixels forward */
void (* end) (jit_dest_iter_t *dest, jit_t *jit);
reg_t line; /* address of the current scanline; process_line adds stride to it */
reg_t stride; /* amount added to line for each processed scanline */
+ reg_t d; /* running destination pointer within the scanline (copied from line) */
+ reg_t w; /* pixels left in the scanline (copied from width, decremented per step) */
+ reg_t width; /* loaded from pixman_composite_info_t.width in begin */
};
#define MEMBER(variable, type, member) \
@@ -512,6 +536,11 @@ dest_a8r8g8b8_begin (jit_dest_iter_t * dest,
jit_free_gp (jit, info);
jit_free_gp (jit, image);
jit_free_gp (jit, tmp);
+
+ dest->width = jit_alloc_gp (jit);
+ BEGIN_ASM (jit->fragment)
+ I_mov, dest->width, MEMBER (info, pixman_composite_info_t, width),
+ END_ASM ();
}
static void
@@ -521,12 +550,149 @@ dest_a8r8g8b8_process_line (jit_dest_iter_t *dest,
jit_src_iter_t *mask,
jit_combiner_t *combiner)
{
+    /*
+     * Emit the code that processes one destination scanline.
+     *
+     * n_pixels[] lists how many pixels one iteration handles in each
+     * horizontal phase: narrow steps first (only while the destination
+     * pointer is misaligned for the next wider step), then the 4-pixel
+     * main loop, then narrow steps again for what is left of the line.
+     */
+    int n_pixels[] = { 1, 2, 4, 2, 1 };
+    int n_phases = (int) (sizeof (n_pixels) / sizeof (n_pixels[0]));
+    int i;
+
    src->begin_line (src, jit);
    mask->begin_line (mask, jit);
+    jit_switch_group (jit, "dest");
+
+    jit_reload_gp (jit, dest->line);
+    jit_reload_gp (jit, dest->stride);
+    /* width is read below exactly like line and stride, so reload it too */
+    jit_reload_gp (jit, dest->width);
+
+    dest->d = jit_alloc_gp (jit);
+    dest->w = jit_alloc_gp (jit);
+
    BEGIN_ASM (jit->fragment)
+        /* d = running pointer, w = pixels left, line moves to the next row */
+        I_mov, dest->d, dest->line,
+        I_mov, dest->w, dest->width,
+        I_add, dest->line, dest->stride,
+    END_ASM ();
+
+    for (i = 0; i < n_phases; ++i)
+    {
+        /*
+         * Label names live on the stack: the longest name is far below
+         * 32 bytes, nothing can fail, and the lifetime matches the old
+         * heap strings that were freed at the end of each iteration.
+         */
+        char loop[32], test[32], done[32];
+
+        snprintf (loop, sizeof loop, "horz_%d_loop", i);
+        snprintf (test, sizeof test, "horz_%d_test", i);
+        snprintf (done, sizeof done, "horz_%d_done", i);
+
+        jit_switch_group (jit, "dest");
+        jit_reload_gp (jit, dest->w);
+
+        /*
+         * Layout of each phase:
+         *
+         *         jmp  test
+         *   loop: sub  w, n
+         *         <combine n pixels>
+         *   test: [phases 0 and 1: if d is aligned, goto done]
+         *         if (w >= n) goto loop
+         *   done:
+         *
+         * The loop label must be defined here; without it the backward
+         * branch at the bottom has no target and the body is unreachable.
+         */
+        BEGIN_ASM (jit->fragment)
+            I_jmp, LABEL (test),
+            DEFINE_LABEL (loop),
+            I_sub, dest->w, IMM (n_pixels[i]),
+        END_ASM ();
-
+        combiner->combine (combiner, jit, src, mask, dest, n_pixels[i]);
+
+        BEGIN_ASM (jit->fragment)
+            DEFINE_LABEL (test),
+        END_ASM ();
+
+        jit_switch_group (jit, "dest");
+        if (i < 2)
+        {
+            jit_reload_gp (jit, dest->d);
+            BEGIN_ASM (jit->fragment)
+                /* If not misaligned, then skip to next block */
+                I_test, dest->d, IMM (n_pixels[i] * 4 * 2 - 1),
+                I_jz, LABEL (done),
+            END_ASM ();
+        }
+
+        jit_reload_gp (jit, dest->w);
+        BEGIN_ASM (jit->fragment)
+            I_cmp, dest->w, IMM (n_pixels[i]),
+            I_jge, LABEL (loop),
+            DEFINE_LABEL (done),
+        END_ASM ();
+    }
+
+    jit_free_gp (jit, dest->d);
+    jit_free_gp (jit, dest->w);
+}
+
+/*
+ * Emit a load of n_pixels destination pixels from the address in dest->d
+ * into a freshly allocated xmm register and return that register.
+ *
+ * Supported widths are 1 (movd), 2 (movq) and 4 (movdqa). For any other
+ * width no load is emitted, so the generator never uses an uninitialised
+ * opcode; the returned register then holds no defined pixel data.
+ *
+ * NOTE(review): the register is freed before it is returned, and the
+ * caller in this file calls jit_preserve_gp() on the result. This looks
+ * like a deliberate hand-over protocol - confirm against the register
+ * allocator before changing it.
+ * NOTE(review): movdqa needs a 16-byte aligned address; process_line's
+ * 1- and 2-pixel phases appear to establish that - confirm.
+ */
+static reg_t
+dest_a8r8g8b8_load_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels)
+{
+    instruction_t move;
+    reg_t r;
+
+    jit_switch_group (jit, "dest");
+    jit_reload_gp (jit, dest->d);
+    r = jit_alloc_xmm (jit);
+
+    switch (n_pixels)
+    {
+    case 1:
+        move = I_movd;
+        break;
+
+    case 2:
+        move = I_movq;
+        break;
+
+    case 4:
+        move = I_movdqa;
+        break;
+
+    default:
+        /* Unsupported width: hand the register over without a load */
+        jit_free_xmm (jit, r);
+        return r;
+    }
+
+    BEGIN_ASM (jit->fragment)
+        move, r, PTR (dest->d),
+    END_ASM ();
+
+    jit_free_xmm (jit, r);
+
+    return r;
+}
+
+/*
+ * Emit a store of n_pixels pixels from xmm register reg to the address
+ * in dest->d.
+ *
+ * Supported widths are 1 (movd), 2 (movq) and 4 (movdqa); any other
+ * width emits nothing.
+ *
+ * Operands in this file are destination-first (e.g. "I_mov, dest->d,
+ * dest->line"), so the memory operand has to come first for a store.
+ * The previous order "move, reg, PTR (dest->d)" reloaded the destination
+ * into reg and never wrote the combined result back.
+ */
+static void
+dest_a8r8g8b8_write_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels, reg_t reg)
+{
+    instruction_t move;
+
+    switch (n_pixels)
+    {
+    case 1:
+        move = I_movd;
+        break;
+
+    case 2:
+        move = I_movq;
+        break;
+
+    case 4:
+        move = I_movdqa;
+        break;
+
+    default:
+        /* Unsupported width: never emit an uninitialised opcode */
+        return;
+    }
+
+    jit_switch_group (jit, "dest");
+    jit_reload_gp (jit, dest->d);
+
+    BEGIN_ASM (jit->fragment)
+        move, PTR (dest->d), reg,
+    END_ASM ();
+}
+
+/*
+ * Emit code that moves the running destination pointer (dest->d)
+ * forward by n_pixels pixels, at 4 bytes per pixel.
+ */
+static void
+dest_a8r8g8b8_advance_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels)
+{
+    const int step = 4 * n_pixels;
+
+    jit_switch_group (jit, "dest");
+    jit_reload_gp (jit, dest->d);
+
+    BEGIN_ASM (jit->fragment)
+        I_add, dest->d, IMM (step),
    END_ASM ();
}
@@ -537,17 +703,128 @@ dest_a8r8g8b8_end (jit_dest_iter_t *dest, jit_t *jit)
}
jit_dest_iter_t *
-dest_iter_create_a8r8g8b8 (jit_dest_iter_t *dest)
+dest_iter_create_a8r8g8b8 (void)
{
jit_dest_iter_t *iter = malloc (sizeof *iter); /* FIXME OOM */
iter->begin = dest_a8r8g8b8_begin;
iter->process_line = dest_a8r8g8b8_process_line;
+ iter->load_pixels = dest_a8r8g8b8_load_pixels;
+ iter->write_pixels = dest_a8r8g8b8_write_pixels;
+ iter->advance_pixels = dest_a8r8g8b8_advance_pixels;
iter->end = dest_a8r8g8b8_end;
return iter;
}
+/* combiner
+ *
+ * combine_over emits the code that combines n_pixels source pixels into
+ * the destination, per 8-bit channel roughly as
+ *
+ *     dest = src + dest * (255 - alpha) / 255      (saturating add)
+ *
+ * where alpha is the highest 16-bit word of each unpacked source pixel
+ * (presumably the alpha channel of an a8r8g8b8 source - the source layout
+ * is not visible here, confirm).
+ *
+ * Sequence: load and advance the source, build the inverted, broadcast
+ * alpha in m_hi/m_lo, load the destination, multiply it by the inverted
+ * alpha, pack, add the source, then write back and advance the
+ * destination.
+ *
+ * The combiner and mask parameters are not used by this function.
+ */
+static void
+combine_over (jit_combiner_t *combiner, jit_t *jit,
+ jit_src_iter_t *src, jit_src_iter_t *mask,
+ jit_dest_iter_t *dest, int n_pixels)
+{
+ reg_t s, d, m_hi, m_lo, d_hi, d_lo, zero;
+ reg_t m00ff, m0101, m0080;
+
+ s = src->load_pixels (src, jit, n_pixels);
+
+ src->advance_pixels (src, jit, n_pixels);
+
+ jit_switch_group (jit, "combiner");
+ jit_preserve_gp (jit, s); /* NOTE(review): s is used as an xmm register
+ * below but is preserved with the _gp call -
+ * confirm this is the intended allocator API */
+
+ /* m00ff and zero share one register: it is zero while the source is
+ * unpacked and is turned into the 0x00ff constant afterwards */
+ m00ff = zero = jit_alloc_xmm (jit);
+ m_hi = jit_alloc_xmm (jit);
+ m_lo = jit_alloc_xmm (jit);
+
+ BEGIN_ASM (jit->fragment)
+ /* Generate zero */
+ I_pxor, zero, zero,
+
+ /* Expand source: widen the bytes to 16-bit words (high half in m_hi,
+ * low half in m_lo), then broadcast word 3 of every group of four
+ * words with shuffle selector 0xff.
+ * NOTE(review): the shuffles are given two operands and a bare 0xff,
+ * while other immediates in this file are wrapped in IMM() - confirm
+ * the assembler accepts this form. */
+ I_movdqa, m_hi, s,
+ I_movdqa, m_lo, s,
+ I_punpckhbw, m_hi, zero,
+ I_punpcklbw, m_lo, zero,
+ I_pshuflw, m_hi, 0xff,
+ I_pshuflw, m_lo, 0xff,
+ I_pshufhw, m_hi, 0xff,
+ I_pshufhw, m_lo, 0xff,
+
+ /* Negate mask: all-ones >> 8 gives 0x00ff in every word; XOR with it
+ * turns each broadcast value a into 255 - a */
+ I_pcmpeqw, m00ff, m00ff,
+ I_psrlw, m00ff, IMM (8),
+ I_pxor, m_lo, m00ff,
+ I_pxor, m_hi, m00ff,
+ END_ASM ();
+
+ jit_free_xmm (jit, zero);
+
+ d = dest->load_pixels (dest, jit, n_pixels);
+
+ jit_switch_group (jit, "combiner");
+ jit_preserve_gp (jit, d); /* NOTE(review): same _gp-on-xmm question as for s */
+
+ /* m0080 and zero share one register: zero for the unpack, then 0x0080 */
+ m0080 = zero = jit_alloc_xmm (jit);
+ m0101 = jit_alloc_xmm (jit);
+ d_hi = jit_alloc_xmm (jit);
+ d_lo = d; /* d_lo is an alias of d, not a separate register */
+
+ BEGIN_ASM (jit->fragment)
+ /* Unpack dest */
+ I_pxor, zero, zero,
+ I_movdqa, d_hi, d,
+ I_punpckhbw, d_hi, zero,
+ I_punpcklbw, d_lo, zero,
+
+ /* Generate 0101: 0x0001 words packed to 0x01 bytes, i.e. 0x0101 words */
+ I_pcmpeqw, m0101, m0101,
+ I_psrlw, m0101, IMM (15),
+ I_packuswb, m0101, m0101,
+
+ /* Generate 0080: (0xffff >> 15) << 7 in every word */
+ I_pcmpeqw, m0080, m0080,
+ I_psrlw, m0080, IMM (15),
+ I_psllw, m0080, IMM (7),
+
+ /* Multiply: (d * m + 0x80) * 0x0101 >> 16, a rounded divide by 255.
+ * NOTE(review): pmulhw is the signed high multiply; the intermediate
+ * can exceed 0x7fff (up to 255 * 255 + 128), where the unsigned
+ * pmulhuw gives a different result. pixman's pix_multiply uses the
+ * unsigned form - confirm and switch if the assembler provides it. */
+ I_pmullw, d_hi, m_hi,
+ I_paddusw, d_hi, m0080,
+ I_pmulhw, d_hi, m0101,
+
+ I_pmullw, d_lo, m_lo,
+ I_paddusw, d_lo, m0080,
+ I_pmulhw, d_lo, m0101,
+
+ /* Pack: the result lands in d_lo, which is d */
+ I_packuswb, d_lo, d_hi,
+
+ /* Add */
+ I_paddusb, d, s,
+ END_ASM();
+
+ jit_free_xmm (jit, m0080);
+ jit_free_xmm (jit, m0101);
+ jit_free_xmm (jit, m_hi);
+ jit_free_xmm (jit, m_lo);
+ jit_free_xmm (jit, d_lo); /* NOTE(review): this frees d (d_lo == d), yet d
+ * is passed to write_pixels below - confirm a
+ * freed register stays valid until reallocated */
+ jit_free_xmm (jit, d_hi);
+ jit_free_xmm (jit, s);
+
+ dest->write_pixels (dest, jit, n_pixels, d);
+ dest->advance_pixels (dest, jit, n_pixels);
+}
+
+/*
+ * Allocate a combiner whose combine hook is combine_over.
+ * Returns NULL if the allocation fails.
+ */
+jit_combiner_t *
+combiner_create_over (void)
+{
+    jit_combiner_t *combiner = malloc (sizeof *combiner);
+
+    if (!combiner)
+        return NULL;
+
+    combiner->combine = combine_over;
+
+    return combiner;
+}
+
void
generate_kernel (jit_t *jit,
jit_src_iter_t *src_iter,