author     Veli-Matti Valtonen <veli-matti.valtonen@movial.com>   2011-02-22 11:05:57 +0200
committer  Søren Sandmann Pedersen <ssp@redhat.com>               2011-02-24 06:36:25 -0500
commit     0eb98a3a6a4320619ce8d95eba9f884d8a7c9097 (patch)
tree       6d4ad59ac9d4a538ed4be6fef8821011849e22b9
parent     c45f7d46eabaadb8cad0ac6f229ed46c216e847b (diff)
DSPASE More cleanup, out reverse op. (branch: mips2)

MIPS: DSPASE Implemented DSPASE1_UN8x4_MUL_UN8 macro.
MIPS: DSPASE Implemented scanline out reverse.
MIPS: DSPASE over_n_8_8888 modified to use the macro bindings.
-rw-r--r--  pixman/pixman-mips-dspase1-asm.S  | 226
-rw-r--r--  pixman/pixman-mips-dspase1.c      |  50
2 files changed, 155 insertions(+), 121 deletions(-)
diff --git a/pixman/pixman-mips-dspase1-asm.S b/pixman/pixman-mips-dspase1-asm.S
index 596b38a4..0cb2293f 100644
--- a/pixman/pixman-mips-dspase1-asm.S
+++ b/pixman/pixman-mips-dspase1-asm.S
@@ -18,6 +18,26 @@
.size \fname, .-\fname
.endm
+# result register can be the same as any of the params
+# rb_half should contain 0x00800080
+.macro DSPASE1_UN8x4_MUL_UN8_head a, b, x, y
+ muleu_s.ph.qbl \x, \a, \b
+ muleu_s.ph.qbr \y, \a, \b
+.endm
+
+.macro DSPASE1_UN8x4_MUL_UN8_tail x, y, result, rb_half, tmp3, tmp4
+ addu \x, \x, \rb_half
+ addu \y, \y, \rb_half
+
+ preceu.ph.qbla \tmp3, \x
+ preceu.ph.qbla \tmp4, \y
+
+ addu \x, \x, \tmp3
+ addu \y, \y, \tmp4
+
+ precrq.qb.ph \result, \x, \y
+.endm
+
.set noreorder
.set nomacro
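
For reference, the DSPASE1_UN8x4_MUL_UN8 head/tail pair above computes
pixman's UN8x4_MUL_UN8 two halfwords at a time: every 8-bit channel of a
32-bit pixel is multiplied by an 8-bit factor with exact rounding. A scalar
C model of the same computation (the helper name is illustrative, not part
of the patch):

#include <stdint.h>

/* Multiply each 8-bit channel of x by the 8-bit value a, rounding
 * exactly as the assembly does: t = c*a + 0x80; t = (t + (t >> 8)) >> 8
 * computes round(c*a / 255) without a division. */
static uint32_t
un8x4_mul_un8 (uint32_t x, uint32_t a)
{
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t t = ((x >> shift) & 0xff) * a + 0x80;
        t = (t + (t >> 8)) >> 8;
        result |= t << shift;
    }
    return result;
}

In the assembly, the two muleu_s.ph.qb* multiplies produce four 16-bit
products, rb_half adds the 0x80 bias to each, preceu.ph.qbla extracts the
t >> 8 terms, and precrq.qb.ph performs the final shift while packing the
bytes back together.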
@@ -40,20 +60,13 @@ pixman_asm_func pixman_composite_scanline_over_asm_dspase1
srl $t2, $t2, 24 // ALPHA_8(~src)
ins $t2, $t2, 16, 8 // 0:a:0:a; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t0, $t2
- muleu_s.ph.qbr $t4, $t0, $t2
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
lw $t0, 4($a1) // dest[1] for next loop iteration
addiu $a1, $a1, 4 // dest++
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
- precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1
lwx $t1, $v0($a1) // src (dest + diff) for next loop iteration
@@ -88,35 +101,22 @@ pixman_asm_func pixman_composite_scanline_over_mask_asm_dspase1
srl $t8, $t8, 24 // mask >>= A_SHIFT
ins $t8, $t8, 16, 8 // 0:m:0:m; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t1, $t8
- muleu_s.ph.qbr $t4, $t1, $t8
+ DSPASE1_UN8x4_MUL_UN8_head $t1, $t8, $t3, $t4
lw $t0, 0($a1) // dest
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t1, $t3, $t4
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t9, $t5, $t6
not $t2, $t1 // ~src
srl $t2, $t2, 24 // ALPHA_8(~src)
ins $t2, $t2, 16, 8 // 0:a:0:a; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t0, $t2
- muleu_s.ph.qbr $t4, $t0, $t2
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
addiu $a1, $a1, 4 // dest++
+
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1
bne $a1, $a0, 0b
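
With the macro in place, both OVER scanlines reduce per pixel to
dest = src + dest * (255 - alpha(src)), with the final add saturating per
byte via addu_s.qb; the masked variant first replaces src with
un8x4_mul_un8(src, alpha(mask)). A scalar model, reusing un8x4_mul_un8
from the sketch above (helper names are illustrative):

/* Scalar model of the DSP addu_s.qb instruction: per-byte add with
 * saturation at 0xff. */
static uint32_t
addu_s_qb (uint32_t x, uint32_t y)
{
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t t = ((x >> shift) & 0xff) + ((y >> shift) & 0xff);
        result |= (t > 0xff ? 0xff : t) << shift;
    }
    return result;
}

/* OVER for one premultiplied a8r8g8b8 pixel, as in the loops above. */
static uint32_t
over_un8x4 (uint32_t src, uint32_t dest)
{
    return addu_s_qb (un8x4_mul_un8 (dest, ~src >> 24), src);
}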
@@ -197,28 +197,18 @@ pixman_asm_func pixman_composite_scanline_add_mask_asm_dspase1
$scanline_add_mask_loop:
lwx $t2, $a3($a1)
lwx $t1, $a2($a1)
- lw $t0, 0($a1)
-
- addiu $a1, $a1, 4
# based on pixman_composite_scanline_over_mask_asm_dspase1
- # converting these to macroes might make sense
srl $t2, $t2, 24
ins $t2, $t2, 16, 8 // 0:m:0:m; equivalent to replv.ph
-
- muleu_s.ph.qbl $t3, $t1, $t2
- muleu_s.ph.qbr $t4, $t1, $t2
-
- addu $t3, $t3, $t8 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t8 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
+ DSPASE1_UN8x4_MUL_UN8_head $t1, $t2, $t3, $t4
- precrq.qb.ph $t1, $t3, $t4
+ lw $t0, 0($a1)
+ addiu $a1, $a1, 4
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t8, $t5, $t6
+
addu_s.qb $t0, $t0, $t1
bne $a1, $t9, $scanline_add_mask_loop
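
The reshuffled loop also shows the LWX addressing idiom used throughout
this file: the byte differences src - dest and mask - dest are computed
once, so the single incrementing dest pointer acts as loop counter, store
address and, through lwx, base for the src and mask loads. A C analogue of
the add_mask scanline under that scheme (illustrative names, reusing the
helpers sketched above):

#include <stddef.h>   /* ptrdiff_t */

static void
add_mask_scanline (uint32_t *dest, const uint32_t *src,
                   const uint32_t *mask, int width)
{
    /* one byte offset per input buffer, computed once, as the asm does */
    ptrdiff_t sdiff = (const char *) src  - (const char *) dest;
    ptrdiff_t mdiff = (const char *) mask - (const char *) dest;
    uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t m = *(const uint32_t *) ((const char *) dest + mdiff);
        uint32_t s = *(const uint32_t *) ((const char *) dest + sdiff);

        /* dest += in(src, mask), saturating per byte */
        *dest = addu_s_qb (*dest, un8x4_mul_un8 (s, m >> 24));
        dest++;
    }
}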
@@ -229,60 +219,144 @@ $scanline_add_mask_exit:
pixman_end_func pixman_composite_scanline_add_mask_asm_dspase1
+# Scanline out reverse, no mask
+pixman_asm_func pixman_composite_scanline_out_reverse_asm_dspase1
+ beqz $a0, $scanline_out_reverse_exit
+ sll $a0, $a0, 2 # width in bytes (for addressing)
+
+ li $t8, 0x00800080
+
+ subu $a2, $a2, $a1 // sdiff = src - dest (for LWX)
+
+ addu $t9, $a1, $a0
+$scanline_out_reverse_loop:
+ lwx $t1, $a2($a1)
+ lw $t0, 0($a1)
+
+ not $t1, $t1
+ srl $t1, $t1, 24 # ALPHA_8(~src)
+ ins $t1, $t1, 16, 8 // 0:a:0:a; equivalent to replv.ph
+
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t1, $t3, $t4
+
+ addiu $a1, $a1, 4
+
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t0, $t8, $t5, $t6
+
+ bne $a1, $t9, $scanline_out_reverse_loop
+ sw $t0, -4($a1)
+$scanline_out_reverse_exit:
+ jr $ra
+ nop
+pixman_end_func pixman_composite_scanline_out_reverse_asm_dspase1
+
+# Scanline out reverse, mask
+pixman_asm_func pixman_composite_scanline_out_reverse_mask_asm_dspase1
+ beqz $a0, $scanline_out_reverse_mask_exit
+ sll $a0, $a0, 2 # width in bytes (for addressing)
+
+ li $t8, 0x00800080
+
+ subu $a2, $a2, $a1 // sdiff = src - dest (for LWX)
+ subu $a3, $a3, $a1 // mdiff = mask - dest (for LWX)
+
+ addu $t9, $a1, $a0
+$scanline_out_reverse_mask_loop:
+ lwx $t2, $a3($a1)
+ lwx $t1, $a2($a1)
+
+ # combine mask
+ srl $t2, $t2, 24 # ALPHA_8(mask)
+ srl $t1, $t1, 24 # ALPHA_8(src)
+
+ mul $t3, $t2, $t1
+
+ lw $t0, 0($a1)
+
+ addiu $t3, $t3, 0x80
+ srl $t4, $t3, 8
+ addu $t3, $t3, $t4
+ srl $t3, $t3, 8
+ # mask combined
+
+ not $t1, $t3
+ andi $t1, $t1, 0xff
+ ins $t1, $t1, 16, 8 // 0:a:0:a; equivalent to replv.ph
+
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t1, $t3, $t4
+
+ addiu $a1, $a1, 4
+
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t0, $t8, $t5, $t6
+
+ bne $a1, $t9, $scanline_out_reverse_mask_loop
+ sw $t0, -4($a1)
+$scanline_out_reverse_mask_exit:
+ jr $ra
+ nop
+pixman_end_func pixman_composite_scanline_out_reverse_mask_asm_dspase1
+
+
////////////////////////////////////////////////////////////////////////////////
-pixman_asm_func mips_dspase1_composite_over_n_8_8888_inner
- beqz $a3, 1f
- sll $a3, $a3, 2 // width <<= 2
+pixman_asm_func pixman_composite_over_n_8_8888_asm_dspase1
+ lw $v0, 16($sp) # src
+ # 20($sp) is unused
+ lw $v1, 24($sp) # mask
+ lw $t7, 28($sp) # mask_stride
+
+ beqz $a0, $over_n_8_8888_end
+ addiu $sp, $sp, -4
+
+ sw $s0, 0($sp)
+
+ subu $t7, $t7, $a0 # mask_stride - width (both in bytes, mask is 8bpp)
- addu $a3, $a0, $a3 // dest_end = dest + width
+ sll $a0, $a0, 2 // width <<= 2
+ sll $a3, $a3, 2 # dst_stride <<= 2 (words to bytes)
+
+ subu $a3, $a3, $a0 # dst stride - width
li $t9, 0x00800080
-0:
- lbu $t8, 0($a2) // mask
- lw $t0, 0($a0) // dest
+$over_n_8_8888_height_loop:
+ addu $s0, $a0, $a2 # dst end
+ addiu $a1, $a1, -1
+
+$over_n_8_8888_width_loop:
+ lbu $t8, 0($v1) // mask
+ lw $t0, 0($a2) // dest
ins $t8, $t8, 16, 8 // 0:m:0:m; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $a1, $t8
- muleu_s.ph.qbr $t4, $a1, $t8
+ DSPASE1_UN8x4_MUL_UN8_head $v0, $t8, $t3, $t4
- addiu $a0, $a0, 4 // dest++
- addiu $a2, $a2, 1 // mask++
+ addiu $a2, $a2, 4 // dest++
+ addiu $v1, $v1, 1 // mask++
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t1, $t3, $t4 // in(src,m)
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t9, $t5, $t6
not $t2, $t1 // ~in(src,m)
srl $t2, $t2, 24
ins $t2, $t2, 16, 8 // 0:a:0:a; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t0, $t2
- muleu_s.ph.qbr $t4, $t0, $t2
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1 // over(in(src,m),dest)
- bne $a0, $a3, 0b
- sw $t3, -4($a0) // dest
+ bne $a2, $s0, $over_n_8_8888_width_loop
+ sw $t3, -4($a2) // dest
-1:
+ addu $a2, $a2, $a3
+ bnez $a1, $over_n_8_8888_height_loop
+ addu $v1, $v1, $t7
+
+ lw $s0, 0($sp)
+$over_n_8_8888_end:
jr $ra
- nop
-
-pixman_end_func mips_dspase1_composite_over_n_8_8888_inner
+ addiu $sp, $sp, 4
+pixman_end_func pixman_composite_over_n_8_8888_asm_dspase1
pixman_asm_func pixman_composite_add_8888_8888_asm_dspase1
lw $v0, 16($sp) # src
@@ -403,7 +477,7 @@ $add_n_8888_no_main_loop:
$add_n_8888_leftover_loop:
lw $t2, 0($a2)
- addiu $a2, $a2, 4 # Moving this anywhere else will cause a stall in store
+ addiu $a2, $a2, 4
addu_s.qb $t2, $t2, $v0
bne $a2, $t8, $add_n_8888_leftover_loop
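
The new OUT_REVERSE scanlines keep only the part of dest not covered by
the source: dest = dest * (255 - alpha). In the masked variant the source
and mask alphas are first combined with the same rounded multiply,
performed there with plain mul/addiu/srl since only one channel is
involved. A scalar model (illustrative names, reusing un8x4_mul_un8 from
the earlier sketch):

/* out_reverse, no mask */
static uint32_t
out_reverse_un8x4 (uint32_t src, uint32_t dest)
{
    return un8x4_mul_un8 (dest, ~src >> 24);
}

/* out_reverse with a mask: alpha = round(src.a * mask.a / 255) first */
static uint32_t
out_reverse_mask_un8x4 (uint32_t src, uint32_t mask, uint32_t dest)
{
    uint32_t a = (src >> 24) * (mask >> 24) + 0x80;
    a = (a + (a >> 8)) >> 8;
    return un8x4_mul_un8 (dest, 0xff & ~a);
}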
diff --git a/pixman/pixman-mips-dspase1.c b/pixman/pixman-mips-dspase1.c
index 0ab3f87d..b53c3df5 100644
--- a/pixman/pixman-mips-dspase1.c
+++ b/pixman/pixman-mips-dspase1.c
@@ -8,10 +8,8 @@
// assembly-language functions
-void
-mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, uint32_t src,
- const uint8_t *mask, int width);
-
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(SKIP_ZERO_SRC, dspase1, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(dspase1, add_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888,
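
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (from pixman-arm-common.h) now
generates the C wrapper that the deleted hand-written function below used
to provide: it fetches the solid colour, returns early for a zero source
(SKIP_ZERO_SRC), resolves the dest and mask lines, and makes a single call
into the assembly. The argument order matches what the over_n_8_8888
prologue reads, roughly this declaration (a sketch inferred from the
prologue, not the macro's literal output):

void
pixman_composite_over_n_8_8888_asm_dspase1 (int32_t   w,
                                            int32_t   h,
                                            uint32_t *dst,
                                            int32_t   dst_stride,
                                            uint32_t  src,
                                            int32_t   unused,
                                            uint8_t  *mask,
                                            int32_t   mask_stride);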
@@ -19,46 +17,6 @@ PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888,
////////////////////////////////////////////////////////////////////////////////
-static void
-mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- mips_dspase1_composite_over_n_8_8888_inner(dst, src, mask, width);
- }
-}
-
#define BIND_COMBINE_U(name) \
void \
pixman_composite_scanline_##name##_mask_asm_dspase1 (int32_t w, \
@@ -88,13 +46,14 @@ dspase1_combine_##name##_u (pixman_implementation_t *imp, \
BIND_COMBINE_U (over)
BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
////////////////////////////////////////////////////////////////////////////////
static const pixman_fast_path_t mips_dspase1_fast_paths[] =
{
- PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, mips_dspase1_fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, dspase1_composite_over_n_8_8888 ),
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, dspase1_composite_add_n_8888 ),
PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, dspase1_composite_add_n_8888 ),
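
BIND_COMBINE_U (out_reverse) gives the new scanlines a combiner entry
point. Each expansion is, in outline, a dispatcher of the following shape
(a sketch assuming pixman's combiner signature of this era; the actual
body is the abridged macro above):

static void
dspase1_combine_out_reverse_u (pixman_implementation_t *imp,
                               pixman_op_t              op,
                               uint32_t                *dest,
                               const uint32_t          *src,
                               const uint32_t          *mask,
                               int                      width)
{
    if (mask)
        pixman_composite_scanline_out_reverse_mask_asm_dspase1 (width, dest,
                                                                src, mask);
    else
        pixman_composite_scanline_out_reverse_asm_dspase1 (width, dest, src);
}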
@@ -115,6 +74,7 @@ _pixman_implementation_create_mips_dspase1 (pixman_implementation_t *delegate)
imp->combine_32[PIXMAN_OP_OVER] = dspase1_combine_over_u;
imp->combine_32[PIXMAN_OP_ADD] = dspase1_combine_add_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = dspase1_combine_out_reverse_u;
return imp;
}