diff options
author | Søren Sandmann <ssp@redhat.com> | 2013-12-22 11:02:45 -0500 |
---|---|---|
committer | Søren Sandmann <ssp@redhat.com> | 2013-12-22 11:02:45 -0500 |
commit | 181eef863b38bba050225b15e057f18845edccdf (patch) | |
tree | 868fc827374febbf50fb975fb3a0a6776a178210 /iterjit.c | |
parent | 01b4b9f26232b1f95a339fae4ce6e681f7236272 (diff) |
iter jit
Diffstat (limited to 'iterjit.c')
-rw-r--r-- | iterjit.c | 381 |
1 files changed, 30 insertions, 351 deletions
@@ -1,353 +1,11 @@ -/* - - We need an "outer" driver from which constants can be requested. - API: - outer->register_constant_4x32 (outer, jit, 0x01010101) - outer->get_constant_4x32 (outer, jit, 0x01010101) - - The outer driver will: - - at the end of the kernel, it will generate a sequence of constants that - can be addressed in a RIP relative way (What about x32?) - - at register time, it will allocate a register and load the constant - - get_constant() will then return either a memory location or a register - - - Register allocator should support "get location" of a register. If - the register is spilled, the returned op will be a memory location. - If not, it will be a register. - - - Biting the bullet on a real register allocator: - - - Introduce new OP_VAR that represent a variable to be - register allocated. - - - Introduce new OP_VAR_MEM{8,16,32,64} that correspond to - memory references where the involved registers are - all unallocated variables. - - - register allocator hands out variables when given a register - pool - - - Ghetto liveness analysis: - - 1. Variable is live between first assignment - and last read - - 2. For each jump target, for all variables live there - make them live at the jump source - - 3. Repeat 2 until it converges - - - Linear scan allocation for each register class - - - Run through code and generate allocated machine code - - - Spilled variables will sometimes need to be loaded into - registers; when that happens, we need to be able to - - - Information required from the assembler: - - which arguments are read and written - - in which cases is a register *required*? - - this second one could just be done by - adding an api to "check" an instruction: - - check (I_mov, PTR (eax), PTR (ebx)) - - for example would not be allowed since there - are two memory references. - - The technique for dealing with spilled variables - is to: - 1. first check the instruction with memory - references - 2. if that fails, then load one of the operands - into a register - 3. if that fails, then load the other into - a register - 4. if that fails, then load both into registers - (this should be rare/impossible) - - - When a variable absolutely *must* be in a register, do it like this: - - mov t1, [t2 + 8 * t3] - - t1, t2 and t3 here *must* be in registers. Replace instruction with - - MOV tt1, t1 - MOV tt2, t2 - MOV tt3, t3 - MOV tt1, [tt2 + 8 * tt3] - MOV t1, tt1 - - and mark tt1, tt2, and tt3 as "unspillable" and (since they - are not live at the same time as their counterparts, "hint" - that they should be stored in the same register. - - Implications: - - live ranges must be more exact (ie., must be able to - contain holes) - - - variables can now be unspillable; as long as such - variables can only occur as the result of the - above construction, we should be fine. - - - when the linear scan allocator encounters a - situation where there are unspillable variables, it - processes the unspillable variables first, assigning - them to their hints if they have one and their hint - was not itself spilled, and if that register is - available. - - - mov elision: a mov between the same register should - be discarded. - -Concepts: - - - 'code': array of uint64, can be register allocated or not. If not, - it may contain OP_VAR and OP_VAR_MEM{8,16,32,64} operands. - - - assembler: knows how to convert register allocated code to - machine code - - - fragment: Can take 'code' and turn it into annotated machine - code. Should probably go away and become just 'code' since - the details of annotation are irrelevant to the outside - - - register allocator: will hand out OP_VARs, knows how to - convert non-register-allocated into register-allocated. Is - specific to a register class. - - - stack manager: will hand out stack offsets based on size etc. - Will also tell how much stack space is needed in total - - - code manager: will hand out blocks of writable/executable - memory that can be written into. Will (eventually) deal with - handling ELF files. This will need to be OS specific. - - - BEGIN_ASM/END_ASM should become BEGIN_CODE/END_CODE and just - return a pointer; maybe even - - BEGIN_CODE (&code) - - END_CODE (); - - which will copy the code into malloced memory and store a - pointer to it in code. (It will be appended, so the memory - management needs to be a little more complex here). - - - jit: Contains all the above, and will turn non-register - allocated code into a pointer to executable code. - -The 'code' data structure: - - - needs ability to be appended to from an array - - - needs ability to be 'rewritten', ie., a linear scan through it - where instructions are inserted and removed - - - needs to be concatenatable - -It's tempting to say that the various operations will free and allocate: - - - code = register_allocate (code) - -will free the original code and return a new one, and - - - executable = assembler_assemble (code, .... ); - -will free all the code arrays. There should also be a regular -code_free. So, - - typedef struct - { - int n_elements; - uint64_t code[1]; - } code_t; - -and - - register_allocator_t allocator; - - register_alloc_init (&allocator, &stack_man); - - op1 = register_alloc_new (&allocator, gp_pool); - op2 = register_alloc_new (&allocator, xmm_pool); - op3 = register_alloc_new (&allocator, xmm_pool); - - code_t *code = code_new(); - - BEGIN_CODE (&code) - ..., - END_CODE(); - - code = register_alloc_allocate (&gp_alloc, code); - - - -Note: An interesting fix to x86 crazyness is to just turn - - mov d, [b + x * k + i] - - into - - mov t1, x - shl t1, k - add t1, i - add t1, b - mov d, [t1] - - and then have a peephole pass that recognizes the above and - turns it back into x86 addressing when possible. - - Ie., recognize - - live t1 - mov r0, r1, - [shl r0, {1,2,3}], - [add r0, imm], - add r0, r2, - mov d, [t1] - dead t1 - - - Flow: - - outer loop: - - generates outer loop - - src/mask/dest have pre-scanline setups called - - body is generated by destination iter - - src/mask/dest have post-scanline finalizers called - - - destination iter: - - for each alignment loop, it generates an inner loop - - body of inner loop is generated by combiner - - combiner returns pixels or an indication that - no change is required - - destination iter writes back if necessary - - advances pointer - - - combiner drives generation of pixels - - if there is a mask, then - - tell mask iter to load n pixels - - generate code to check if all pixels are 0 - - if it is, then - - depending on operator - - return 0 - - return "no change needed" - else: - - tell source iter to load n pixels - - add code to multiply them together - - depending on operator - - add code to check if a result is 0xff - - if it is, then - - return source - else: - - tell destination to read a pixel - - generate code to combine with source - - return result - tell mask iter to advance - else: - - tell source iter to load n pixels - - depending on operator - - add code to check if source is 0xff - - if it is, then - - return source - else: - - tell destination to read pixel - - generate code to combine with source - - return result - tell source iter to advance - - -=- - - Issues: - - The stack based register allocator is not a good fit for this - scheme. How about the concept of register "groups"? - - At all times, one group is "active". Allocations are done in this - group; registers from other groups can't be used. If such registers - are needed, they must be reallocated in the new group. - - When there is no longer enough free registers in a group, a - register from another group is kicked out to the stack. When that - other group becomes active, all kicked-out registers are moved - back in (lazily?). Note, you can never allocate more than the - number of real registers within one group. - - API: - reg_alloc_init (&ra, stack_manager_t *sman, reg_set_t *regs); - reg_alloc_switch (&ra, "name"); - op_t = reg_alloc_alloc (&ra); - - // The register becomes allocated in the current group - // with its current value - reg_alloc_preserve (&ra, op_t); - - // After a switch to another group, this make sure the - // given registers are reloaded from the stack. - reg_alloc_reload (&ra, ..., -1); - - - See reggroups.[ch] for a start - -=- - - Iterator based JIT compiler - - There is a new structure that can be either source or - destination. It contains function pointers that know how to generate - pixels. The function pointers: - - Source iterators: - - - begin(): generates code that runs before the kernel - - begin_line(): generates code that runs before each scanline - - get_one_pixel(): generates code that reads one pixel - - get_two_pixels(): --- two pixels - - get_four_pixels(): --- four pixels - - get_eight_pixels(): --- eight pixels - - get_sixteen_pixels(): --- - - end_line(); - - end(); - - An iterator is associated with a format: - - format_4_a8r8g8b8 - format_16_a8 - format_8_a8r8g8b8 - and possibly others - - The iterator is supposed to return pixels in that format. It will - never be asked to fetch more pixels than the format supports. Ie., - get_sixteen_pixels() will never be called if the format is - format_4_a8r8g8b8(). - - All the get_n_pixels() return a register where they left the pixel. - - There are also combiners. These are also associated with a format - and know how to combine pixels in that format with each other. - - Destination iterators are responsible for driving a lot of the access? - - They have a method: process_scanline (src_iter, mask_iter, combiner) - which is responsible for: - - - generating the inner loop, - - calling src iter, - - callling mask iter, - - calling combiner - - writing back - - They are specific to a format, and an intermediate format. - - There may also be setup()/setup_line() methods required. - -*/ - #define _GNU_SOURCE #include <stdio.h> #include <stddef.h> #include <pixman.h> #include <string.h> #include "simplex86.h" -#include "simple-reg.h" #include "stack-man.h" +#include "reggroups.h" #include "crc32.h" typedef struct @@ -539,7 +197,8 @@ struct jit_t assembler_t *assembler; fragment_t *fragment; stack_man_t stack_man; - reg_alloc_t reg_alloc; + reg_alloc_t gp_allocator; + reg_alloc_t xmm_allocator; }; struct jit_src_iter_t @@ -588,12 +247,25 @@ struct jit_dest_iter_t #define MEMBER(variable, type, member) \ BASE((variable), offsetof (type, member)) -void jit_switch_group (jit_t *jit, const char *group); +jit_t * +jit_new (void) +{ + jit_t *jit = malloc (sizeof (jit_t)); + + return jit; +} + +void +jit_switch_group (jit_t *jit, const char *group) +{ +} + reg_t jit_alloc_gp (jit_t *jit); reg_t jit_alloc_xmm (jit_t *jit); void jit_preserve_gp (jit_t *jit, reg_t reg); void jit_free_gp (jit_t *jit, reg_t reg); void jit_reload_gp (jit_t *jit, reg_t reg); +void jit_reload_xmm (jit_t *jit, reg_t reg); void jit_free_gp (jit_t *jit, reg_t reg); void jit_free_xmm (jit_t *jit, reg_t reg); @@ -710,7 +382,7 @@ src_a8r8g8b8_end (jit_src_iter_t *src, jit_t *jit) } jit_src_iter_t * -src_iter_create_a8r8g8b8 (jit_dest_iter_t *dest) +src_iter_create_a8r8g8b8 (void) { jit_src_iter_t *iter = malloc (sizeof *iter); /* FIXME OOM */ @@ -1126,11 +798,11 @@ generate_kernel (jit_t *jit, jit_switch_group (jit, "outer"); /* Restore callee-save registers */ - jit_reload (jit, rbx); - jit_reload (jit, r12); - jit_reload (jit, r13); - jit_reload (jit, r14); - jit_reload (jit, r15); + jit_reload_gp (jit, rbx); + jit_reload_gp (jit, r12); + jit_reload_gp (jit, r13); + jit_reload_gp (jit, r14); + jit_reload_gp (jit, r15); BEGIN_ASM (jit->fragment) DEFINE_LABEL ("done"), I_pop, rbp, @@ -1140,8 +812,15 @@ generate_kernel (jit_t *jit, int main () { + jit_dest_iter_t *dest = dest_iter_create_a8r8g8b8 (); + jit_combiner_t *combiner = combiner_create_over (); + jit_src_iter_t *src = src_iter_create_a8r8g8b8 (); + jit_t *jit = jit_new (); + /* n_8_8888() */ printf ("iter jit\n"); + generate_kernel (jit, src, NULL, dest, combiner); + return 0; } |