#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <stdint.h>
#include <sys/mman.h>
#include <glib.h>
#include "x86-codegen.h"
#include "pixman.h"

typedef struct bits_image bits_image_t;

struct bits_image
{
    int         common[24];    /* stand-in for the common image header */
    int         format;
    void       *indexed;
    int         width;
    int         height;
    uint32_t   *bits;
    uint32_t   *free_me;
    int         rowstride;     /* in number of uint32_t's */
};

#define OFFSET(name)                                                    \
    (((unsigned char *)(&((bits_image_t *)0)->name)) - (unsigned char *)0)

static unsigned char *
print_reg (unsigned char *code, X86_Reg_No reg, const char *title)
{
    x86_pushad (code);
    x86_push_reg (code, reg);
    x86_push_imm (code, title);
    x86_call_code (code, printf);
    x86_pop_reg (code, X86_EAX);
    x86_pop_reg (code, X86_EAX);
    x86_popad (code);

    return code;
}

static gboolean EAX;
static gboolean EBX;
static gboolean ECX;
static gboolean EDX;
static gboolean ESI;
static gboolean EDI;

static gboolean need_pop_eax;

static X86_Reg_No
reg_alloc (gboolean need_16_bit)
{
#define CHECK_REG(r)                                                    \
    do                                                                  \
    {                                                                   \
        if (!(r))                                                       \
        {                                                               \
            r = TRUE;                                                   \
            return X86_##r;                                             \
        }                                                               \
    } while (0)

    if (!need_16_bit)
    {
        CHECK_REG (ESI);
        CHECK_REG (EDI);
    }

    CHECK_REG (EBX);
    CHECK_REG (ECX);
    CHECK_REG (EDX);

    if (!need_pop_eax)
    {
        /* Allocate EAX last since it is required to be available in
         * some cases
         */
        CHECK_REG (EAX);
    }

    assert (0);
    return X86_NREG;
}

static void
reg_unalloc (X86_Reg_No r)
{
#define CASE_REG(r) case X86_##r: (r) = FALSE; break

    switch (r)
    {
        CASE_REG (EAX);
        CASE_REG (EBX);
        CASE_REG (ECX);
        CASE_REG (EDX);
        CASE_REG (ESI);
        CASE_REG (EDI);

    default:
        assert (0);
        break;
    }
}

static unsigned char *
reg_alloc_eax (unsigned char *code)
{
    if (!EAX)
    {
        EAX = TRUE;
    }
    else
    {
        if (need_pop_eax)
            assert ("Trying to allocate EAX twice" == NULL);

        need_pop_eax = TRUE;
        x86_push_reg (code, X86_EAX);
    }

    return code;
}

static unsigned char *
reg_unalloc_eax (unsigned char *code)
{
    if (need_pop_eax)
    {
        x86_pop_reg (code, X86_EAX);
        need_pop_eax = FALSE;
    }
    else
    {
        EAX = FALSE;
    }

    return code;
}

static unsigned char *
compute_start (unsigned char *code, int bpp,
               X86_Reg_No stride, int x_offset, int y_offset,
               X86_Reg_No dest)
{
    int shift;
    X86_Reg_No tmp;

    switch (bpp)
    {
    case 8:
        shift = 0;
        break;

    case 16:
        shift = 1;
        break;

    case 32:
        shift = 2;
        break;

    default:
        return NULL;
    }

    /* @#$! x86 */
    code = reg_alloc_eax (code);

    x86_movzwl_reg_membase (code, X86_EAX, X86_EBP, y_offset);
    x86_mul_reg (code, stride, FALSE);

    tmp = reg_alloc (FALSE);
    x86_movzwl_reg_membase (code, tmp, X86_EBP, x_offset);

    /* dest = stride * y + x * (bpp / 8) */
    x86_lea_memindex (code, dest, X86_EAX, 0, tmp, shift);

    reg_unalloc (tmp);
    code = reg_unalloc_eax (code);

    return code;
}
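/* Reference model for compute_start above, kept only as documentation:
 * a C sketch of the address arithmetic the emitted instructions perform,
 * assuming stride is already in bytes and x/y are the 16-bit offsets
 * pulled from the stack frame. compute_start_reference is a hypothetical
 * helper, not part of the generated code; the caller still has to add
 * the image's bits pointer to the returned offset.
 */
static uint32_t
compute_start_reference (uint32_t stride_in_bytes,
                         uint16_t x, uint16_t y, int bpp)
{
    /* Byte offset of pixel (x, y) from the start of the bits:
     * stride * y + x * (bpp / 8), with the multiply by bpp / 8 done
     * as a shift, just like the emitted lea does.
     */
    int shift = (bpp == 8) ? 0 : (bpp == 16) ? 1 : 2;

    return stride_in_bytes * y + ((uint32_t)x << shift);
}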
/* Clobbers stride */
static unsigned char *
compute_skip (unsigned char *code, int bpp,
              X86_Reg_No stride, X86_Reg_No neg_width,
              int dest_offset)
{
    int shift;
    X86_Reg_No tmp;

    /* This function computes the number of bytes to skip after the
     * inner loop. The result is stored in dest_offset(%ebp).
     *
     * The value computed is
     *
     *     stride * 4 - width * (bpp / 8)
     */
    switch (bpp)
    {
    case 8:
        shift = 0;
        break;

    case 16:
        shift = 1;
        break;

    case 32:
        shift = 2;
        break;

    default:
        return NULL;
    }

    /* stride is expected to be in bytes here */
    tmp = reg_alloc (FALSE);

    /* tmp = stride + (-width) << shift */
    x86_lea_memindex (code, tmp, stride, 0, neg_width, shift);
    x86_mov_membase_reg (code, X86_EBP, dest_offset, tmp, 4);

    code = print_reg (code, tmp, "Skip: %d\n");

    reg_unalloc (tmp);

    return code;
}
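/* Reference model for compute_skip above, again for documentation only
 * (compute_skip_reference is a hypothetical helper, not used by the
 * code generator). For example, with rowstride = 100 uint32_t's,
 * width = 2 and bpp = 32, the skip is 100 * 4 - 2 * 4 = 392 bytes.
 */
static int32_t
compute_skip_reference (int rowstride_in_uint32s, uint16_t width, int bpp)
{
    /* Bytes from the end of one row's pixels to the start of the
     * next row: stride * 4 - width * (bpp / 8)
     */
    return rowstride_in_uint32s * 4 - (int32_t)width * (bpp / 8);
}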
static gboolean
generate_compose_func (unsigned char *code,
                       pixman_format_code_t src_format,
                       pixman_format_code_t mask_format,
                       pixman_format_code_t dest_format)
{
    /* Stack layout
     *
     *    [ uint16 height   ]      ebp+52       higher addresses
     *    [ uint16 width    ]      ebp+48
     *    [ int16 yDst      ]      ebp+44
     *    [ int16 xDst      ]      ebp+40
     *    [ int16 yMask     ]      ebp+36
     *    [ int16 xMask     ]      ebp+32
     *    [ int16 ySrc      ]      ebp+28
     *    [ int16 xSrc      ]      ebp+24
     *    [ pointer pDst    ]      ebp+20
     *    [ pointer pMask   ]      ebp+16
     *    [ pointer pSrc    ]      ebp+12
     *    [ uint8 op        ]      ebp+8
     *    [ pointer retaddr ]      ebp+4
     *    [ pointer old_ebp ]      <- ebp
     *    [ uint32_t src_skip  ]   ebp-4
     *    [ uint32_t mask_skip ]   ebp-8
     *    [ uint32_t dest_skip ]   ebp-12
     */
#define HEIGHT     52
#define WIDTH      48
#define SRC        12
#define MASK       16
#define DEST       20
#define X_SRC      24
#define Y_SRC      28
#define X_MASK     32
#define Y_MASK     36
#define X_DEST     40
#define Y_DEST     44

#define N_LOCALS    3
#define SRC_SKIP   -4
#define MASK_SKIP  -8
#define DEST_SKIP -12

    unsigned char *row_loop, *row_test, *row_jump;
    unsigned char *column_loop, *column_test, *column_jump;
    int bpp, bpp_shift;
    X86_Reg_No loop_reg;
    X86_Reg_No neg_width;
    X86_Reg_No src_bits, mask_bits, dest_bits;
    X86_Reg_No stride;
    X86_Reg_No tmp;

    /* Preamble */
    x86_push_reg (code, X86_EBP);
    x86_mov_reg_reg (code, X86_EBP, X86_ESP, 4);

#if 0
    x86_mov_memindex_reg (code,
    x86_mov_reg_imm (code, X86_EAX, 0x1345134);
    x86_mov_reg_mem (code, X86_EAX, 0x1234567, 4);
#endif

    x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4 * N_LOCALS, 4); /* 3 local variables */

    /* Compute the number of bytes to skip after every inner loop */
    neg_width = reg_alloc (FALSE);

    /* neg_width = -width */
    x86_movzwl_reg_membase (code, neg_width, X86_EBP, WIDTH);
    x86_neg_reg (code, neg_width);

    /* Compute source skip and start offset */
    stride = reg_alloc (FALSE);
    x86_mov_reg_membase (code, stride, X86_EBP, SRC, 4);
    x86_mov_reg_membase (code, stride, stride, OFFSET (rowstride), 4);
    /* rowstride is in uint32_t units; convert it to bytes */
    x86_shift_reg_imm (code, X86_SHL, stride, 2);

    src_bits = reg_alloc (FALSE);

    code = compute_skip (code, PIXMAN_FORMAT_BPP (src_format),
                         stride, neg_width, SRC_SKIP);
    code = compute_start (code, PIXMAN_FORMAT_BPP (src_format),
                          stride, X_SRC, Y_SRC, src_bits);

    tmp = reg_alloc (FALSE);
    x86_mov_reg_membase (code, tmp, X86_EBP, SRC, 4);
    x86_alu_reg_membase (code, X86_ADD, src_bits, tmp, OFFSET (bits));
    reg_unalloc (tmp);

    if (!code)
        return FALSE;

    /* Compute mask skip and start offset */
    mask_bits = reg_alloc (FALSE);
    x86_mov_reg_membase (code, stride, X86_EBP, MASK, 4);
    x86_mov_reg_membase (code, stride, stride, OFFSET (rowstride), 4);
    x86_shift_reg_imm (code, X86_SHL, stride, 2);

    code = compute_skip (code, PIXMAN_FORMAT_BPP (mask_format),
                         stride, neg_width, MASK_SKIP);
    code = compute_start (code, PIXMAN_FORMAT_BPP (mask_format),
                          stride, X_MASK, Y_MASK, mask_bits);

    tmp = reg_alloc (FALSE);
    x86_mov_reg_membase (code, tmp, X86_EBP, MASK, 4);
    x86_alu_reg_membase (code, X86_ADD, mask_bits, tmp, OFFSET (bits));
    reg_unalloc (tmp);

    if (!code)
        return FALSE;

    /* Compute dest skip and start offset */
    dest_bits = reg_alloc (FALSE);
    x86_mov_reg_membase (code, stride, X86_EBP, DEST, 4);
    x86_mov_reg_membase (code, stride, stride, OFFSET (rowstride), 4);
    x86_shift_reg_imm (code, X86_SHL, stride, 2);

    code = compute_skip (code, PIXMAN_FORMAT_BPP (dest_format),
                         stride, neg_width, DEST_SKIP);
    reg_unalloc (neg_width);

    code = compute_start (code, PIXMAN_FORMAT_BPP (dest_format),
                          stride, X_DEST, Y_DEST, dest_bits);
    reg_unalloc (stride);

    tmp = reg_alloc (FALSE);
    x86_mov_reg_membase (code, tmp, X86_EBP, DEST, 4);
    x86_alu_reg_membase (code, X86_ADD, dest_bits, tmp, OFFSET (bits));
    reg_unalloc (tmp);

    if (!code)
        return FALSE;

    {
        /* Row loop */

        /* We use a trick to use %ecx for both the width and height
         * counters:
         *
         * Height is stored in the high order 16 bits of %ecx, and
         * width is stored in the low order 16 bits. This allows us to
         * refer to width as just %cl. Once we get to the point where
         * we need to decrement and test the height, we know that cl
         * is 0, so we can just subtract (1 << 16) and test %ecx
         * against 0.
         */
        loop_reg = reg_alloc (TRUE);

        x86_movzwl_reg_membase (code, loop_reg, X86_EBP, HEIGHT);
        x86_shift_reg_imm (code, X86_SHL, loop_reg, 16);

        row_jump = code;
#if 0
        x86_jump32 (code, 0);           /* Jump to test, will patch later */
#endif

        /* Start of the row loop */
        row_loop = code;

        {
            /* Column loop */
            x86_mov_reg_membase (code, loop_reg, X86_EBP, WIDTH, 2);

            column_jump = code;
#if 0
            x86_jump32 (code, 0);       /* Jump to column test, patch later */
#endif

            column_loop = code;

#if 0
            x86_push_reg (code, loop_reg);      /* ecx is caller save */
            x86_push_imm (code, "birnan\n");
            x86_call_code (code, printf);
            x86_pop_reg (code, X86_EAX);
            x86_pop_reg (code, loop_reg);
#endif

            /* Placeholder body: the actual compositing code has not
             * been generated yet
             */
            x86_mov_reg_memindex (code, X86_EBX, X86_ECX, 0, loop_reg, 4, 2);

            x86_alu_reg_imm (code, X86_SUB, loop_reg, 1, 2);

            /* Column test */
            column_test = code;
            x86_branch (code, X86_CC_NZ, column_loop, FALSE);
        }

        /* Add skip to the pointers */

        /* Decrement row counter */
        x86_alu_reg_imm (code, X86_SUB, loop_reg, 1 << 16, 4);

        /* Row test */
        row_test = code;
        x86_branch (code, X86_CC_NZ, row_loop, FALSE);
    }

    /* Postamble */
    x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4 * N_LOCALS, 4);
    x86_pop_reg (code, X86_EBP);
    x86_ret_void (code);

#if 0
    /* Patch up jumps */
    x86_patch (row_jump, row_test);
    x86_patch (column_jump, column_test);
#endif

    return TRUE;
}

typedef void (* ComposeFunc) (uint8_t   op,
                              void     *src,
                              void     *mask,
                              void     *dst,
                              int16_t   src_x,
                              int16_t   src_y,
                              int16_t   mask_x,
                              int16_t   mask_y,
                              int16_t   dst_x,
                              int16_t   dst_y,
                              uint16_t  width,
                              uint16_t  height);

int
main ()
{
    unsigned char *the_code = g_malloc (65536);
    bits_image_t src;
    bits_image_t mask;
    bits_image_t dest;

    if (!generate_compose_func (the_code,
                                PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, PIXMAN_a8r8g8b8))
    {
        g_print ("code generation failed\n");
    }

    src.rowstride = 100;
    mask.rowstride = 50;
    dest.rowstride = 20;

    /* Round down to the start of the page containing the code. This
     * assumes 4096 byte pages, and may change the protection of other
     * heap memory sharing those pages.
     */
    if (mprotect ((void *)(((gulong)the_code) & ~((1 << 12) - 1)), 65536,
                  PROT_READ | PROT_EXEC) < 0)
    {
        g_print ("mprotect failed: %s\n", strerror (errno));
    }
    else
    {
        g_print ("mprotect succeeded\n");
    }

    ((ComposeFunc)the_code) (100, &src, &mask, &dest,
                             1, 1, 2, 2, 3, 3, 2, 2);

    return 0;
}
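#if 0
/* Sketch of a more robust way to obtain the code buffer, assuming POSIX
 * mmap is available; alloc_code_buffer is a hypothetical helper and is
 * not used above. Unlike the g_malloc + mprotect combination in main,
 * an anonymous mapping is page-aligned from the start, so changing its
 * protection cannot affect unrelated heap memory.
 */
static unsigned char *
alloc_code_buffer (size_t size)
{
    void *p = mmap (NULL, size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    return (p == MAP_FAILED) ? NULL : (unsigned char *)p;
}
#endif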