diff options
author | Veli-Matti Valtonen <veli-matti.valtonen@movial.com> | 2011-02-22 11:05:56 +0200 |
---|---|---|
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2011-02-24 06:36:22 -0500 |
commit | c45f7d46eabaadb8cad0ac6f229ed46c216e847b (patch) | |
tree | 234ffa3068cd9947d69d38bf0ee3d0da58e0c1d1 | |
parent | 029e32f3f2f5e22b969698427a26f69be6a178e6 (diff) |
DSPASE Cleanup and add operations
MIPS: DSPASE Modified the original dspase commit to use the arm-neon bind macros
MIPS: DSPASE Implemented add_8888_8888 and add_n_8888
MIPS: DSPASE Added some simple mips function begin/end macros.
MIPS: DSPASE Implemented scanline add.
-rw-r--r-- | pixman/pixman-mips-dspase1-asm.S | 331 | ||||
-rw-r--r-- | pixman/pixman-mips-dspase1.c | 75 |
2 files changed, 325 insertions, 81 deletions
diff --git a/pixman/pixman-mips-dspase1-asm.S b/pixman/pixman-mips-dspase1-asm.S index b96fe837..596b38a4 100644 --- a/pixman/pixman-mips-dspase1-asm.S +++ b/pixman/pixman-mips-dspase1-asm.S @@ -1,27 +1,37 @@ - .text + .set mips32r2 + .set nomips16 + .set dsp + +.macro pixman_asm_func fname + .global \fname + .ent \fname +#ifdef __ELF__ + .type \fname, @function + .hidden \fname +#endif +\fname: +.endm + +.macro pixman_end_func fname + .end \fname + .size \fname, .-\fname +.endm + .set noreorder .set nomacro - -// void -// mips_dspase1_combine_over_u_nomask(uint32_t *dest, const uint32_t *src, -// const uint32_t *mask, int width) - - .global mips_dspase1_combine_over_u_nomask - .ent mips_dspase1_combine_over_u_nomask - // note: this version to be used only when mask = NULL -mips_dspase1_combine_over_u_nomask: - beqz $a3, 1f - subu $v0, $a1, $a0 // diff = src - dest (for LWX) +pixman_asm_func pixman_composite_scanline_over_asm_dspase1 + beqz $a0, 1f + subu $v0, $a2, $a1 // diff = src - dest (for LWX) - sll $a3, $a3, 2 // width <<= 2 - addu $a3, $a0, $a3 // dest_end = dest + width + sll $a0, $a0, 2 // width <<= 2 + addu $a0, $a1, $a0 // dest_end = dest + width - lw $t0, 0($a0) // dest - lwx $t1, $v0($a0) // src (dest + diff) + lw $t0, 0($a1) // dest + lwx $t1, $v0($a1) // src (dest + diff) li $t9, 0x00800080 @@ -33,8 +43,8 @@ mips_dspase1_combine_over_u_nomask: muleu_s.ph.qbl $t3, $t0, $t2 muleu_s.ph.qbr $t4, $t0, $t2 - lw $t0, 4($a0) // dest[1] for next loop iteration - addiu $a0, $a0, 4 // dest++ + lw $t0, 4($a1) // dest[1] for next loop iteration + addiu $a1, $a1, 4 // dest++ addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph @@ -46,41 +56,34 @@ mips_dspase1_combine_over_u_nomask: precrq.qb.ph $t3, $t3, $t4 addu_s.qb $t3, $t3, $t1 - lwx $t1, $v0($a0) // src (dest + diff) for next loop iteration + lwx $t1, $v0($a1) // src (dest + diff) for next loop iteration - bne $a0, $a3, 0b - sw $t3, -4($a0) // dest + bne 
$a1, $a0, 0b + sw $t3, -4($a1) // dest 1: jr $ra nop - .end mips_dspase1_combine_over_u_nomask - +pixman_end_func pixman_composite_scanline_over_asm_dspase1 -// void -// mips_dspase1_combine_over_u_mask(uint32_t *dest, const uint32_t *src, -// const uint32_t *mask, int width) - - .global mips_dspase1_combine_over_u_mask - .ent mips_dspase1_combine_over_u_mask // note: this version to be used only when mask != NULL -mips_dspase1_combine_over_u_mask: - beqz $a3, 1f - subu $v0, $a1, $a0 // sdiff = src - dest (for LWX) +pixman_asm_func pixman_composite_scanline_over_mask_asm_dspase1 + beqz $a0, 1f + subu $v0, $a2, $a1 // sdiff = src - dest (for LWX) - subu $v1, $a2, $a0 // mdiff = mask - dest (for LWX) + subu $v1, $a3, $a1 // mdiff = mask - dest (for LWX) - sll $a3, $a3, 2 // width <<= 2 - addu $a3, $a0, $a3 // dest_end = dest + width + sll $a0, $a0, 2 // width <<= 2 + addu $a0, $a1, $a0 // dest_end = dest + width li $t9, 0x00800080 0: - lwx $t8, $v1($a0) // mask (dest + mdiff) - lwx $t1, $v0($a0) // src (dest + sdiff) + lwx $t8, $v1($a1) // mask (dest + mdiff) + lwx $t1, $v0($a1) // src (dest + sdiff) srl $t8, $t8, 24 // mask >>= A_SHIFT ins $t8, $t8, 16, 8 // 0:m:0:m; equivalent to replv.ph @@ -88,7 +91,7 @@ mips_dspase1_combine_over_u_mask: muleu_s.ph.qbl $t3, $t1, $t8 muleu_s.ph.qbr $t4, $t1, $t8 - lw $t0, 0($a0) // dest + lw $t0, 0($a1) // dest addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph @@ -105,7 +108,7 @@ mips_dspase1_combine_over_u_mask: muleu_s.ph.qbl $t3, $t0, $t2 muleu_s.ph.qbr $t4, $t0, $t2 - addiu $a0, $a0, 4 // dest++ + addiu $a1, $a1, 4 // dest++ addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph @@ -116,26 +119,120 @@ mips_dspase1_combine_over_u_mask: precrq.qb.ph $t3, $t3, $t4 addu_s.qb $t3, $t3, $t1 - bne $a0, $a3, 0b - sw $t3, -4($a0) // dest + bne $a1, $a0, 0b + sw $t3, -4($a1) // dest 1: jr $ra nop - .end 
mips_dspase1_combine_over_u_mask +pixman_end_func pixman_composite_scanline_over_mask_asm_dspase1 + +# Scanline add, no mask +pixman_asm_func pixman_composite_scanline_add_asm_dspase1 + beqz $a0, $scanline_add_exit + sll $a0, $a0, 2 # Number of 8bit blocks (For addressing) + + move $t9, $a0 + ins $t9, $zero, 0, 4 # Number of 8*4*4 blocks + + addu $t8, $a0, $a1 + + beqz $t9, $scanline_add_no_main_loop + addu $t4, $t9, $a1 # end ptr for dst + +$scanline_add_width_loop: + lw $t2, 0($a1) # dst + lw $t3, 0($a2) # src + lw $v0, 4($a1) + lw $t7, 4($a2) + lw $v1, 8($a1) + lw $t0, 8($a2) + lw $t6, 12($a1) + lw $t1, 12($a2) + + addiu $a1, $a1, 16 + addiu $a2, $a2, 16 + + addu_s.qb $t2, $t2, $t3 + sw $t2, -16($a1) + addu_s.qb $v0, $v0, $t7 + sw $v0, -12($a1) + addu_s.qb $v1, $v1, $t0 + sw $v1, -8($a1) + addu_s.qb $t6, $t6, $t1 + + bne $a1, $t4, $scanline_add_width_loop + sw $t6, -4($a1) +$scanline_add_no_main_loop: + beq $t8, $a1, $scanline_add_exit + nop +$scanline_add_leftover_loop: + lw $t2, 0($a1) + lw $t3, 0($a2) + addiu $a1, $a1, 4 + addiu $a2, $a2, 4 + + addu_s.qb $t2, $t2, $t3 -//////////////////////////////////////////////////////////////////////////////// + bne $a1, $t8, $scanline_add_leftover_loop + sw $t2, -4($a1) +$scanline_add_exit: + jr $ra + nop +pixman_end_func pixman_composite_scanline_add_asm_dspase1 + +# Scanline add, mask +pixman_asm_func pixman_composite_scanline_add_mask_asm_dspase1 + beqz $a0, $scanline_add_mask_exit + sll $a0, $a0, 2 # Number of 8bit blocks (For addressing) + + li $t8, 0x00800080 + + subu $a2, $a2, $a1 // sdiff = src - dest (for LWX) + subu $a3, $a3, $a1 // mdiff = mask - dest (for LWX) + + addu $t9, $a1, $a0 +$scanline_add_mask_loop: + lwx $t2, $a3($a1) + lwx $t1, $a2($a1) + lw $t0, 0($a1) -// void -// mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, const uint32_t src, -// const uint8_t *mask, int width) + addiu $a1, $a1, 4 + + # based on pixman_composite_scanline_over_mask_asm_dspase1 + # converting these to macroes might 
make sense + srl $t2, $t2, 24 + ins $t2, $t2, 16, 8 // 0:m:0:m; equivalent to replv.ph + + muleu_s.ph.qbl $t3, $t1, $t2 + muleu_s.ph.qbr $t4, $t1, $t2 + + addu $t3, $t3, $t8 // can't overflow; rev2: addu_s.ph + addu $t4, $t4, $t8 // can't overflow; rev2: addu_s.ph + + preceu.ph.qbla $t5, $t3 // rev2: shrl.ph + preceu.ph.qbla $t6, $t4 // rev2: shrl.ph + addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph + addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph + + precrq.qb.ph $t1, $t3, $t4 + + addu_s.qb $t0, $t0, $t1 + + bne $a1, $t9, $scanline_add_mask_loop + sw $t0, -4($a1) +$scanline_add_mask_exit: + jr $ra + nop - .global mips_dspase1_composite_over_n_8_8888_inner - .ent mips_dspase1_composite_over_n_8_8888_inner +pixman_end_func pixman_composite_scanline_add_mask_asm_dspase1 -mips_dspase1_composite_over_n_8_8888_inner: + +//////////////////////////////////////////////////////////////////////////////// + +pixman_asm_func mips_dspase1_composite_over_n_8_8888_inner beqz $a3, 1f sll $a3, $a3, 2 // width <<= 2 @@ -185,5 +282,139 @@ mips_dspase1_composite_over_n_8_8888_inner: jr $ra nop - .end mips_dspase1_composite_over_n_8_8888_inner +pixman_end_func mips_dspase1_composite_over_n_8_8888_inner + +pixman_asm_func pixman_composite_add_8888_8888_asm_dspase1 + lw $v0, 16($sp) # src + lw $v1, 20($sp) # src_stride + + beqz $a1, $add_8888_8888_exit + addiu $sp, $sp, -8 + + sw $s0, 0($sp) + sw $s1, 4($sp) + + sll $a3, $a3, 2 + sll $v1, $v1, 2 + sll $a0, $a0, 2 # Number of 8bit blocks (For addressing) + + subu $v1, $v1, $a0 # stride - width + subu $a3, $a3, $a0 # stride - width + + move $t9, $a0 + ins $t9, $zero, 0, 4 # Number of 8*4*4 blocks + +$add_8888_8888_height_loop: + addu $t8, $a0, $a2 + + beqz $t9, $add_8888_8888_no_main_loop + addu $t4, $t9, $a2 # end ptr for dst + +$add_8888_8888_width_loop: + lw $t2, 0($a2) # dst + lw $t3, 0($v0) # src + lw $s0, 4($a2) + lw $t7, 4($v0) + lw $s1, 8($a2) + lw $t0, 8($v0) + lw $t6, 12($a2) + lw $t1, 12($v0) + + addiu $a2, $a2, 16 
+ addiu $v0, $v0, 16 + + addu_s.qb $t2, $t2, $t3 + sw $t2, -16($a2) + addu_s.qb $s0, $s0, $t7 + sw $s0, -12($a2) + addu_s.qb $s1, $s1, $t0 + sw $s1, -8($a2) + addu_s.qb $t6, $t6, $t1 + + bne $a2, $t4, $add_8888_8888_width_loop + sw $t6, -4($a2) +$add_8888_8888_no_main_loop: + beq $t8, $a2, $add_8888_8888_no_leftover + addiu $a1, $a1, -1 # Decrement height +$add_8888_8888_leftover_loop: + lw $t2, 0($a2) + lw $t3, 0($v0) + + addiu $a2, $a2, 4 + addiu $v0, $v0, 4 + + addu_s.qb $t2, $t2, $t3 + + bne $a2, $t8, $add_8888_8888_leftover_loop + sw $t2, -4($a2) +$add_8888_8888_no_leftover: + addu $v0, $v0, $v1 # src += src_stride + + bnez $a1, $add_8888_8888_height_loop + addu $a2, $a2, $a3 # dst += dst_stride + + lw $s0, 0($sp) + lw $s1, 4($sp) + +$add_8888_8888_exit: + jr $ra + addiu $sp, $sp, 8 +pixman_end_func pixman_composite_add_8888_8888_asm_dspase1 + +pixman_asm_func pixman_composite_add_n_8888_asm_dspase1 + lw $v0, 16($sp) # Src + + beqz $a1, $add_n_8888_exit + sll $a3, $a3, 2 # Dst stride + sll $a0, $a0, 2 # Number of 8bit blocks (For addressing) + + subu $a3, $a3, $a0 + + move $t9, $a0 + ins $t9, $zero, 0, 4 # Number of 8*4*4 blocks + +$add_n_8888_height_loop: + addu $t8, $a0, $a2 + + beqz $t9, $add_n_8888_no_main_loop + addu $t4, $t9, $a2 # end ptr for dst + +$add_n_8888_width_loop: + lw $t2, 0($a2) # dst + lw $t0, 4($a2) + lw $t1, 8($a2) + lw $t7, 12($a2) + + addiu $a2, $a2, 16 + + addu_s.qb $t2, $t2, $v0 + sw $t2, -16($a2) + addu_s.qb $t0, $t0, $v0 + sw $t0, -12($a2) + addu_s.qb $t1, $t1, $v0 + sw $t1, -8($a2) + addu_s.qb $t7, $t7, $v0 + + bne $a2, $t4, $add_n_8888_width_loop + sw $t7, -4($a2) +$add_n_8888_no_main_loop: + beq $t8, $a2, $add_n_8888_no_leftover + addiu $a1, $a1, -1 # Decrement height +$add_n_8888_leftover_loop: + lw $t2, 0($a2) + + addiu $a2, $a2, 4 # Moving this anywhere else will cause a stall in store + addu_s.qb $t2, $t2, $v0 + + bne $a2, $t8, $add_n_8888_leftover_loop + sw $t2, -4($a2) +$add_n_8888_no_leftover: + + bnez $a1, 
$add_n_8888_height_loop + addu $a2, $a2, $a3 # dst += dst_stride + +$add_n_8888_exit: + jr $ra + nop +pixman_end_func pixman_composite_add_n_8888_asm_dspase1 diff --git a/pixman/pixman-mips-dspase1.c b/pixman/pixman-mips-dspase1.c index 59722d21..0ab3f87d 100644 --- a/pixman/pixman-mips-dspase1.c +++ b/pixman/pixman-mips-dspase1.c @@ -3,47 +3,22 @@ #endif #include "pixman-private.h" +#include "pixman-arm-common.h" // assembly-language functions void -mips_dspase1_combine_over_u_nomask(uint32_t *dest, const uint32_t *src, - const uint32_t *mask, int width); - -void -mips_dspase1_combine_over_u_mask(uint32_t *dest, const uint32_t *src, - const uint32_t *mask, int width); - -void mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, uint32_t src, const uint8_t *mask, int width); +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(dspase1, add_8888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888, + uint32_t, 1) //////////////////////////////////////////////////////////////////////////////// - -static void -mips_dspase1_combine_over_u(pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - if (mask) - { -// _pixman_implementation_combine_32(imp->delegate, op, dest, src, mask, width); - mips_dspase1_combine_over_u_mask(dest, src, mask, width); - } - else - { -// _pixman_implementation_combine_32(imp->delegate, op, dest, src, mask, width); - mips_dspase1_combine_over_u_nomask(dest, src, mask, width); - } -} - - static void mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp, pixman_op_t op, @@ -84,6 +59,35 @@ mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp, } } +#define BIND_COMBINE_U(name) \ +void \ +pixman_composite_scanline_##name##_mask_asm_dspase1 (int32_t w, \ + const uint32_t *dst, \ + const uint32_t *src, \ + const uint32_t *mask); \ + \ +void \ +pixman_composite_scanline_##name##_asm_dspase1 (int32_t w, 
\ + const uint32_t *dst, \ + const uint32_t *src); \ + \ +static void \ +dspase1_combine_##name##_u (pixman_implementation_t *imp, \ + pixman_op_t op, \ + uint32_t * dest, \ + const uint32_t * src, \ + const uint32_t * mask, \ + int width) \ +{ \ + if (mask) \ + pixman_composite_scanline_##name##_mask_asm_dspase1 (width, dest, \ + src, mask); \ + else \ + pixman_composite_scanline_##name##_asm_dspase1 (width, dest, src); \ +} + +BIND_COMBINE_U (over) +BIND_COMBINE_U (add) //////////////////////////////////////////////////////////////////////////////// @@ -91,6 +95,14 @@ mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp, static const pixman_fast_path_t mips_dspase1_fast_paths[] = { PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, mips_dspase1_fast_composite_over_n_8_8888), + + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, dspase1_composite_add_n_8888 ), + PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, dspase1_composite_add_n_8888 ), + + PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, null, x8r8g8b8, dspase1_composite_add_8888_8888 ), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, dspase1_composite_add_8888_8888 ), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, dspase1_composite_add_8888_8888 ), + { PIXMAN_OP_NONE } }; @@ -101,7 +113,8 @@ _pixman_implementation_create_mips_dspase1 (pixman_implementation_t *delegate) pixman_implementation_t *imp = _pixman_implementation_create (delegate, mips_dspase1_fast_paths); - imp->combine_32[PIXMAN_OP_OVER] = mips_dspase1_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER] = dspase1_combine_over_u; + imp->combine_32[PIXMAN_OP_ADD] = dspase1_combine_add_u; return imp; } |