author | Veli-Matti Valtonen <veli-matti.valtonen@movial.com> | 2011-02-22 11:05:57 +0200
---|---|---
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2011-02-24 06:36:25 -0500
commit | 0eb98a3a6a4320619ce8d95eba9f884d8a7c9097 (patch) |
tree | 6d4ad59ac9d4a538ed4be6fef8821011849e22b9 |
parent | c45f7d46eabaadb8cad0ac6f229ed46c216e847b (diff) |
DSPASE More cleanup, out reverse op. (mips2)
MIPS: DSPASE Implemented DSPASE1_UN8x4_MUL_UN8 macro.
MIPS: DSPASE Implemented scanline out reverse
MIPS: DSPASE over_n_8_8888 modified to use the macro bindings
-rw-r--r-- | pixman/pixman-mips-dspase1-asm.S | 226
-rw-r--r-- | pixman/pixman-mips-dspase1.c | 50
2 files changed, 155 insertions, 121 deletions
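
The DSPASE1_UN8x4_MUL_UN8 macro pair introduced below implements pixman's standard UN8x4_MUL_UN8 primitive: each of the four bytes of a 32-bit pixel is multiplied by a single 8-bit value with exact rounding by 255. A minimal scalar C sketch of the same arithmetic (the helper name is illustrative, not part of the patch):

```c
#include <stdint.h>

/* Multiply each byte of x by a, rounding exactly by 255:
 * t = byte * a + 0x80; result byte = (t + (t >> 8)) >> 8,
 * which equals round (byte * a / 255.0). */
static uint32_t
un8x4_mul_un8 (uint32_t x, uint8_t a)
{
    uint32_t result = 0;
    int i;

    for (i = 0; i < 4; i++)
    {
        uint32_t t = ((x >> (8 * i)) & 0xff) * a + 0x80;
        result |= (((t + (t >> 8)) >> 8) & 0xff) << (8 * i);
    }
    return result;
}
```

The assembly performs this two halfwords at a time: `muleu_s.ph.qbl`/`muleu_s.ph.qbr` produce the four 16-bit products, adding `rb_half` contributes the 0x80 rounding term to each halfword, `preceu.ph.qbla` extracts the per-halfword `t >> 8` terms, and `precrq.qb.ph` packs the final high bytes. That is why the `rb_half` parameter must hold 0x00800080.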
```diff
diff --git a/pixman/pixman-mips-dspase1-asm.S b/pixman/pixman-mips-dspase1-asm.S
index 596b38a4..0cb2293f 100644
--- a/pixman/pixman-mips-dspase1-asm.S
+++ b/pixman/pixman-mips-dspase1-asm.S
@@ -18,6 +18,26 @@
 .size \fname, .-\fname
 .endm
 
+# result register can be the same as any of the params
+# rb_half should contain 0x00800080
+.macro DSPASE1_UN8x4_MUL_UN8_head a, b, x, y
+    muleu_s.ph.qbl  \x, \a, \b
+    muleu_s.ph.qbr  \y, \a, \b
+.endm
+
+.macro DSPASE1_UN8x4_MUL_UN8_tail x, y, result, rb_half, tmp3, tmp4
+    addu            \x, \x, \rb_half
+    addu            \y, \y, \rb_half
+
+    preceu.ph.qbla  \tmp3, \x
+    preceu.ph.qbla  \tmp4, \y
+
+    addu            \x, \x, \tmp3
+    addu            \y, \y, \tmp4
+
+    precrq.qb.ph    \result, \x, \y
+.endm
+
 .set noreorder
 .set nomacro
 
@@ -40,20 +60,13 @@ pixman_asm_func pixman_composite_scanline_over_asm_dspase1
     srl     $t2, $t2, 24            // ALPHA_8(~src)
     ins     $t2, $t2, 16, 8         // 0:a:0:a; equivalent to replv.ph
 
-    muleu_s.ph.qbl  $t3, $t0, $t2
-    muleu_s.ph.qbr  $t4, $t0, $t2
+    DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
 
     lw      $t0, 4($a1)             // dest[1] for next loop iteration
     addiu   $a1, $a1, 4             // dest++
 
-    addu    $t3, $t3, $t9           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t9           // can't overflow; rev2: addu_s.ph
-    preceu.ph.qbla  $t5, $t3        // rev2: shrl.ph
-    preceu.ph.qbla  $t6, $t4        // rev2: shrl.ph
-    addu    $t3, $t3, $t5           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t6           // can't overflow; rev2: addu_s.ph
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
 
-    precrq.qb.ph    $t3, $t3, $t4
     addu_s.qb       $t3, $t3, $t1
 
     lwx     $t1, $v0($a1)           // src (dest + diff) for next loop iteration
@@ -88,35 +101,22 @@ pixman_asm_func pixman_composite_scanline_over_mask_asm_dspase1
     srl     $t8, $t8, 24            // mask >>= A_SHIFT
     ins     $t8, $t8, 16, 8         // 0:m:0:m; equivalent to replv.ph
 
-    muleu_s.ph.qbl  $t3, $t1, $t8
-    muleu_s.ph.qbr  $t4, $t1, $t8
+    DSPASE1_UN8x4_MUL_UN8_head $t1, $t8, $t3, $t4
 
     lw      $t0, 0($a1)             // dest
 
-    addu    $t3, $t3, $t9           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t9           // can't overflow; rev2: addu_s.ph
-    preceu.ph.qbla  $t5, $t3        // rev2: shrl.ph
-    preceu.ph.qbla  $t6, $t4        // rev2: shrl.ph
-    addu    $t3, $t3, $t5           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t6           // can't overflow; rev2: addu_s.ph
-    precrq.qb.ph    $t1, $t3, $t4
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t9, $t5, $t6
 
     not     $t2, $t1                // ~src
     srl     $t2, $t2, 24            // ALPHA_8(~src)
     ins     $t2, $t2, 16, 8         // 0:a:0:a; equivalent to replv.ph
 
-    muleu_s.ph.qbl  $t3, $t0, $t2
-    muleu_s.ph.qbr  $t4, $t0, $t2
+    DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
 
     addiu   $a1, $a1, 4             // dest++
+
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
 
-    addu    $t3, $t3, $t9           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t9           // can't overflow; rev2: addu_s.ph
-    preceu.ph.qbla  $t5, $t3        // rev2: shrl.ph
-    preceu.ph.qbla  $t6, $t4        // rev2: shrl.ph
-    addu    $t3, $t3, $t5           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t6           // can't overflow; rev2: addu_s.ph
-    precrq.qb.ph    $t3, $t3, $t4
     addu_s.qb       $t3, $t3, $t1
 
     bne     $a1, $a0, 0b
@@ -197,28 +197,18 @@ pixman_asm_func pixman_composite_scanline_add_mask_asm_dspase1
 $scanline_add_mask_loop:
     lwx     $t2, $a3($a1)
     lwx     $t1, $a2($a1)
-    lw      $t0, 0($a1)
-
-    addiu   $a1, $a1, 4
 
     # based on pixman_composite_scanline_over_mask_asm_dspase1
-    # converting these to macroes might make sense
     srl     $t2, $t2, 24
     ins     $t2, $t2, 16, 8         // 0:m:0:m; equivalent to replv.ph
-
-    muleu_s.ph.qbl  $t3, $t1, $t2
-    muleu_s.ph.qbr  $t4, $t1, $t2
-
-    addu    $t3, $t3, $t8           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t8           // can't overflow; rev2: addu_s.ph
-    preceu.ph.qbla  $t5, $t3        // rev2: shrl.ph
-    preceu.ph.qbla  $t6, $t4        // rev2: shrl.ph
-    addu    $t3, $t3, $t5           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t6           // can't overflow; rev2: addu_s.ph
+    DSPASE1_UN8x4_MUL_UN8_head $t1, $t2, $t3, $t4
 
-    precrq.qb.ph    $t1, $t3, $t4
+    lw      $t0, 0($a1)
+    addiu   $a1, $a1, 4
+
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t8, $t5, $t6
+
     addu_s.qb       $t0, $t0, $t1
 
     bne     $a1, $t9, $scanline_add_mask_loop
@@ -229,60 +219,144 @@ $scanline_add_mask_exit:
 pixman_end_func pixman_composite_scanline_add_mask_asm_dspase1
 
+# Scanline out reverse, no mask
+pixman_asm_func pixman_composite_scanline_out_reverse_asm_dspase1
+    beqz    $a0, $scanline_out_reverse_exit
+    sll     $a0, $a0, 2             # Number of 8bit blocks (For addressing)
+
+    li      $t8, 0x00800080
+
+    subu    $a2, $a2, $a1           // sdiff = src - dest (for LWX)
+
+    addu    $t9, $a1, $a0
+$scanline_out_reverse_loop:
+    lwx     $t1, $a2($a1)
+    lw      $t0, 0($a1)
+
+    not     $t1, $t1
+    srl     $t1, $t1, 24            # src
+    ins     $t1, $t1, 16, 8         // 0:m:0:m; equivalent to replv.ph
+
+    DSPASE1_UN8x4_MUL_UN8_head $t0, $t1, $t3, $t4
+
+    addiu   $a1, $a1, 4
+
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t0, $t8, $t5, $t6
+
+    bne     $a1, $t9, $scanline_out_reverse_loop
+    sw      $t0, -4($a1)
+$scanline_out_reverse_exit:
+    jr      $ra
+    nop
+pixman_end_func pixman_composite_scanline_out_reverse_asm_dspase1
+
+# Scanline out reverse, mask
+pixman_asm_func pixman_composite_scanline_out_reverse_mask_asm_dspase1
+    beqz    $a0, $scanline_out_reverse_mask_exit
+    sll     $a0, $a0, 2             # Number of 8bit blocks (For addressing)
+
+    li      $t8, 0x00800080
+
+    subu    $a2, $a2, $a1           // sdiff = src - dest (for LWX)
+    subu    $a3, $a3, $a1           // mdiff = mask - dest (for LWX)
+
+    addu    $t9, $a1, $a0
+$scanline_out_reverse_mask_loop:
+    lwx     $t2, $a3($a1)
+    lwx     $t1, $a2($a1)
+
+    # combine mask
+    srl     $t2, $t2, 24            # mask
+    srl     $t1, $t1, 24            # src
+
+    mul     $t3, $t2, $t1
+
+    lw      $t0, 0($a1)
+
+    addiu   $t3, $t3, 0x80
+    srl     $t4, $t3, 8
+    addu    $t3, $t3, $t4
+    srl     $t3, $t3, 8
+    # mask combined
+
+    not     $t1, $t3
+    andi    $t1, $t1, 0xff
+    ins     $t1, $t1, 16, 8         // 0:m:0:m; equivalent to replv.ph
+
+    DSPASE1_UN8x4_MUL_UN8_head $t0, $t1, $t3, $t4
+
+    addiu   $a1, $a1, 4
+
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t0, $t8, $t5, $t6
+
+    bne     $a1, $t9, $scanline_out_reverse_mask_loop
+    sw      $t0, -4($a1)
+$scanline_out_reverse_mask_exit:
+    jr      $ra
+    nop
+pixman_end_func pixman_composite_scanline_out_reverse_mask_asm_dspase1
+
+
 ////////////////////////////////////////////////////////////////////////////////
 
-pixman_asm_func mips_dspase1_composite_over_n_8_8888_inner
-    beqz    $a3, 1f
-    sll     $a3, $a3, 2             // width <<= 2
+pixman_asm_func pixman_composite_over_n_8_8888_asm_dspase1
+    lw      $v0, 16($sp)            # src
+    # 20($sp) is unused
+    lw      $v1, 24($sp)            # mask
+    lw      $t7, 28($sp)            # mask_stride
+
+    beqz    $a0, $over_n_8_8888_end
+    addiu   $sp, $sp, -4
+
+    sw      $s0, 0($sp)
+
+    subu    $t7, $t7, $a0           # mask 8bit stride - width
 
-    addu    $a3, $a0, $a3           // dest_end = dest + width
+    sll     $a0, $a0, 2             // width <<= 2
+    sll     $a3, $a3, 2             # dst <<= 2
+
+    subu    $a3, $a3, $a0           # dst stride - width
 
     li      $t9, 0x00800080
 
-0:
-    lbu     $t8, 0($a2)             // mask
-    lw      $t0, 0($a0)             // dest
+$over_n_8_8888_height_loop:
+    addu    $s0, $a0, $a2           # dst end
+    addiu   $a1, $a1, -1
+
+$over_n_8_8888_width_loop:
+    lbu     $t8, 0($v1)             // mask
+    lw      $t0, 0($a2)             // dest
 
     ins     $t8, $t8, 16, 8         // 0:m:0:m; equivalent to replv.ph
 
-    muleu_s.ph.qbl  $t3, $a1, $t8
-    muleu_s.ph.qbr  $t4, $a1, $t8
+    DSPASE1_UN8x4_MUL_UN8_head $v0, $t8, $t3, $t4
 
-    addiu   $a0, $a0, 4             // dest++
-    addiu   $a2, $a2, 1             // mask++
+    addiu   $a2, $a2, 4             // dest++
+    addiu   $v1, $v1, 1             // mask++
 
-    addu    $t3, $t3, $t9           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t9           // can't overflow; rev2: addu_s.ph
-    preceu.ph.qbla  $t5, $t3        // rev2: shrl.ph
-    preceu.ph.qbla  $t6, $t4        // rev2: shrl.ph
-    addu    $t3, $t3, $t5           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t6           // can't overflow; rev2: addu_s.ph
-    precrq.qb.ph    $t1, $t3, $t4   // in(src,m)
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t9, $t5, $t6
 
     not     $t2, $t1                // ~in(src,m)
     srl     $t2, $t2, 24
     ins     $t2, $t2, 16, 8         // 0:a:0:a; equivalent to replv.ph
 
-    muleu_s.ph.qbl  $t3, $t0, $t2
-    muleu_s.ph.qbr  $t4, $t0, $t2
+    DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
+    DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
 
-    addu    $t3, $t3, $t9           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t9           // can't overflow; rev2: addu_s.ph
-    preceu.ph.qbla  $t5, $t3        // rev2: shrl.ph
-    preceu.ph.qbla  $t6, $t4        // rev2: shrl.ph
-    addu    $t3, $t3, $t5           // can't overflow; rev2: addu_s.ph
-    addu    $t4, $t4, $t6           // can't overflow; rev2: addu_s.ph
-    precrq.qb.ph    $t3, $t3, $t4
     addu_s.qb       $t3, $t3, $t1   // over(in(src,m),dest)
 
-    bne     $a0, $a3, 0b
-    sw      $t3, -4($a0)            // dest
+    bne     $a2, $s0, $over_n_8_8888_width_loop
+    sw      $t3, -4($a2)            // dest
 
-1:
+    addu    $a2, $a2, $a3
+    bnez    $a1, $over_n_8_8888_height_loop
+    addu    $v1, $v1, $t7
+
+    lw      $s0, 0($sp)
+$over_n_8_8888_end:
     jr      $ra
-    nop
-
-pixman_end_func mips_dspase1_composite_over_n_8_8888_inner
+    addiu   $sp, $sp, 4
+pixman_end_func pixman_composite_over_n_8_8888_asm_dspase1
 
 pixman_asm_func pixman_composite_add_8888_8888_asm_dspase1
     lw      $v0, 16($sp)            # src
@@ -403,7 +477,7 @@ $add_n_8888_no_main_loop:
 $add_n_8888_leftover_loop:
     lw      $t2, 0($a2)
-    addiu   $a2, $a2, 4             # Moving this anywhere else will cause a stall in store
+    addiu   $a2, $a2, 4
 
     addu_s.qb       $t2, $t2, $v0
     bne     $a2, $t8, $add_n_8888_leftover_loop
diff --git a/pixman/pixman-mips-dspase1.c b/pixman/pixman-mips-dspase1.c
index 0ab3f87d..b53c3df5 100644
--- a/pixman/pixman-mips-dspase1.c
+++ b/pixman/pixman-mips-dspase1.c
@@ -8,10 +8,8 @@
 
 // assembly-language functions
 
-void
-mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, uint32_t src,
-                                           const uint8_t *mask, int width);
-
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(SKIP_ZERO_SRC, dspase1, over_n_8_8888,
+                                     uint8_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(dspase1, add_8888_8888,
                                   uint32_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888,
@@ -19,46 +17,6 @@ PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888,
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static void
-mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
-                                          pixman_op_t op,
-                                          pixman_image_t * src_image,
-                                          pixman_image_t * mask_image,
-                                          pixman_image_t * dst_image,
-                                          int32_t src_x,
-                                          int32_t src_y,
-                                          int32_t mask_x,
-                                          int32_t mask_y,
-                                          int32_t dest_x,
-                                          int32_t dest_y,
-                                          int32_t width,
-                                          int32_t height)
-{
-    uint32_t src, srca;
-    uint32_t *dst_line, *dst;
-    uint8_t *mask_line, *mask;
-    int dst_stride, mask_stride;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-
-        mips_dspase1_composite_over_n_8_8888_inner(dst, src, mask, width);
-    }
-}
-
 #define BIND_COMBINE_U(name) \
 void \
 pixman_composite_scanline_##name##_mask_asm_dspase1 (int32_t w, \
@@ -88,13 +46,14 @@ dspase1_combine_##name##_u (pixman_implementation_t *imp, \
 
 BIND_COMBINE_U (over)
 BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
 
 ////////////////////////////////////////////////////////////////////////////////
 
 static const pixman_fast_path_t mips_dspase1_fast_paths[] =
 {
-    PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, mips_dspase1_fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, dspase1_composite_over_n_8_8888 ),
 
     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, dspase1_composite_add_n_8888 ),
     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, dspase1_composite_add_n_8888 ),
@@ -115,6 +74,7 @@ _pixman_implementation_create_mips_dspase1 (pixman_implementation_t *delegate)
 
     imp->combine_32[PIXMAN_OP_OVER] = dspase1_combine_over_u;
     imp->combine_32[PIXMAN_OP_ADD] = dspase1_combine_add_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = dspase1_combine_out_reverse_u;
 
     return imp;
 }
```
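
For the new out_reverse scanlines, a scalar sketch of the operation they compute: OUT_REVERSE keeps only the part of the destination not covered by the source, i.e. dest = dest * (255 - alpha(src)) / 255 per component, and the masked variant first scales the source alpha by the mask alpha (the `mul`/`addiu`/`srl` "combine mask" block in the asm). The assembly walks dest with a single incrementing pointer and reaches src and mask through the precomputed `lwx` offsets; this sketch simply indexes the arrays and reuses the illustrative `un8x4_mul_un8` helper from the earlier snippet:

```c
#include <stdint.h>
#include <stddef.h>

/* un8x4_mul_un8() is the scalar helper sketched above the diff. */
static void
scanline_out_reverse (uint32_t *dest, const uint32_t *src,
                      const uint32_t *mask, int w)
{
    int i;

    for (i = 0; i < w; i++)
    {
        uint32_t sa = src[i] >> 24;                 /* alpha(src) */

        if (mask != NULL)
        {
            /* combine mask: sa = MUL_UN8 (sa, alpha(mask)) */
            uint32_t t = sa * (mask[i] >> 24) + 0x80;
            sa = (t + (t >> 8)) >> 8;
        }

        /* the asm's not/srl: 255 - sa is the top byte of ~src */
        dest[i] = un8x4_mul_un8 (dest[i], (uint8_t) (255 - sa));
    }
}
```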