summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNemanja Lukic <nemanja.lukic@rt-rk.com>2012-10-14 11:58:50 +0200
committerSøren Sandmann Pedersen <ssp@redhat.com>2012-10-25 10:04:30 -0400
commit52d20e692ebc605077448ab6f52fd257f83481b2 (patch)
treea31b6af8cbedbe2034ef4edb1cdffe672eb864fa
parent9df645dfb04b5a790faabe1e9a84fc37287d91b0 (diff)
MIPS: DSPr2: Added fast-paths for ADD operation: - add_n_8_8 - add_n_8_8888 - add_8_8_8
Performance numbers before/after on MIPS-74kc @ 1GHz: lowlevel-blt-bench results Referent (before): add_n_8_8 = L1: 41.37 L2: 37.83 M: 30.38 ( 60.45%) HT: 23.70 VT: 22.85 R: 21.51 RT: 10.32 ( 45Kops/s) add_n_8_8888 = L1: 16.01 L2: 14.46 M: 11.64 ( 46.32%) HT: 5.50 VT: 5.18 R: 5.06 RT: 1.89 ( 18Kops/s) add_8_8_8 = L1: 13.26 L2: 12.47 M: 11.16 ( 29.61%) HT: 8.09 VT: 8.04 R: 7.68 RT: 3.90 ( 29Kops/s) Optimized: add_n_8_8 = L1: 96.03 L2: 79.37 M: 51.89 (103.31%) HT: 32.59 VT: 31.29 R: 28.52 RT: 11.08 ( 46Kops/s) add_n_8_8888 = L1: 53.61 L2: 46.92 M: 23.78 ( 94.70%) HT: 19.06 VT: 18.64 R: 17.30 RT: 9.15 ( 43Kops/s) add_8_8_8 = L1: 89.65 L2: 66.82 M: 37.10 ( 98.48%) HT: 22.10 VT: 21.74 R: 20.12 RT: 8.12 ( 41Kops/s)
-rw-r--r--pixman/pixman-mips-dspr2-asm.S252
-rw-r--r--pixman/pixman-mips-dspr2-asm.h22
-rw-r--r--pixman/pixman-mips-dspr2.c10
3 files changed, 284 insertions, 0 deletions
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 3a6b26a..614c628 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1209,6 +1209,258 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
END(pixman_composite_over_8888_8888_asm_mips)
+LEAF_MIPS_DSPR2(pixman_composite_add_8_8_8_asm_mips)
+/*
+ * a0 - dst (a8)
+ * a1 - src (a8)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, v0, v1
+ li t9, 0x00ff00ff
+ beqz a3, 3f
+ nop
+
+ srl v0, a3, 2 /* v0 = how many multiples of 4 dst pixels */
+ beqz v0, 1f /* branch if less than 4 src pixels */
+ nop
+
+0:
+ beqz v0, 1f
+ addiu v0, v0, -1
+ lbu t0, 0(a2)
+ lbu t1, 1(a2)
+ lbu t2, 2(a2)
+ lbu t3, 3(a2)
+ lbu t4, 0(a0)
+ lbu t5, 1(a0)
+ lbu t6, 2(a0)
+ lbu t7, 3(a0)
+
+ addiu a2, a2, 4
+
+ precr_sra.ph.w t1, t0, 0
+ precr_sra.ph.w t3, t2, 0
+ precr_sra.ph.w t5, t4, 0
+ precr_sra.ph.w t7, t6, 0
+
+ precr.qb.ph t0, t3, t1
+ precr.qb.ph t1, t7, t5
+
+ lbu t4, 0(a1)
+ lbu v1, 1(a1)
+ lbu t7, 2(a1)
+ lbu t8, 3(a1)
+
+ addiu a1, a1, 4
+
+ precr_sra.ph.w v1, t4, 0
+ precr_sra.ph.w t8, t7, 0
+
+ muleu_s.ph.qbl t2, t0, t8
+ muleu_s.ph.qbr t3, t0, v1
+ shra_r.ph t4, t2, 8
+ shra_r.ph t5, t3, 8
+ and t4, t4, t9
+ and t5, t5, t9
+ addq.ph t2, t2, t4
+ addq.ph t3, t3, t5
+ shra_r.ph t2, t2, 8
+ shra_r.ph t3, t3, 8
+ precr.qb.ph t0, t2, t3
+
+ addu_s.qb t2, t0, t1
+
+ sb t2, 0(a0)
+ srl t2, t2, 8
+ sb t2, 1(a0)
+ srl t2, t2, 8
+ sb t2, 2(a0)
+ srl t2, t2, 8
+ sb t2, 3(a0)
+ addiu a3, a3, -4
+ b 0b
+ addiu a0, a0, 4
+
+1:
+ beqz a3, 3f
+ nop
+2:
+ lbu t8, 0(a1)
+ lbu t0, 0(a2)
+ lbu t1, 0(a0)
+ addiu a1, a1, 1
+ addiu a2, a2, 1
+
+ mul t2, t0, t8
+ shra_r.ph t3, t2, 8
+ andi t3, t3, 0xff
+ addq.ph t2, t2, t3
+ shra_r.ph t2, t2, 8
+ andi t2, t2, 0xff
+
+ addu_s.qb t2, t2, t1
+ sb t2, 0(a0)
+ addiu a3, a3, -1
+ bnez a3, 2b
+ addiu a0, a0, 1
+
+3:
+ RESTORE_REGS_FROM_STACK 0, v0, v1
+ j ra
+ nop
+
+END(pixman_composite_add_8_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8_asm_mips)
+/*
+ * a0 - dst (a8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, v0
+ li t9, 0x00ff00ff
+ beqz a3, 3f
+ nop
+
+ srl v0, a3, 2 /* v0 = how many multiples of 4 dst pixels */
+ beqz v0, 1f /* branch if less than 4 src pixels */
+ nop
+
+ srl t8, a1, 24
+ replv.ph t8, t8
+
+0:
+ beqz v0, 1f
+ addiu v0, v0, -1
+ lbu t0, 0(a2)
+ lbu t1, 1(a2)
+ lbu t2, 2(a2)
+ lbu t3, 3(a2)
+ lbu t4, 0(a0)
+ lbu t5, 1(a0)
+ lbu t6, 2(a0)
+ lbu t7, 3(a0)
+
+ addiu a2, a2, 4
+
+ precr_sra.ph.w t1, t0, 0
+ precr_sra.ph.w t3, t2, 0
+ precr_sra.ph.w t5, t4, 0
+ precr_sra.ph.w t7, t6, 0
+
+ precr.qb.ph t0, t3, t1
+ precr.qb.ph t1, t7, t5
+
+ muleu_s.ph.qbl t2, t0, t8
+ muleu_s.ph.qbr t3, t0, t8
+ shra_r.ph t4, t2, 8
+ shra_r.ph t5, t3, 8
+ and t4, t4, t9
+ and t5, t5, t9
+ addq.ph t2, t2, t4
+ addq.ph t3, t3, t5
+ shra_r.ph t2, t2, 8
+ shra_r.ph t3, t3, 8
+ precr.qb.ph t0, t2, t3
+
+ addu_s.qb t2, t0, t1
+
+ sb t2, 0(a0)
+ srl t2, t2, 8
+ sb t2, 1(a0)
+ srl t2, t2, 8
+ sb t2, 2(a0)
+ srl t2, t2, 8
+ sb t2, 3(a0)
+ addiu a3, a3, -4
+ b 0b
+ addiu a0, a0, 4
+
+1:
+ beqz a3, 3f
+ nop
+ srl t8, a1, 24
+2:
+ lbu t0, 0(a2)
+ lbu t1, 0(a0)
+ addiu a2, a2, 1
+
+ mul t2, t0, t8
+ shra_r.ph t3, t2, 8
+ andi t3, t3, 0xff
+ addq.ph t2, t2, t3
+ shra_r.ph t2, t2, 8
+ andi t2, t2, 0xff
+
+ addu_s.qb t2, t2, t1
+ sb t2, 0(a0)
+ addiu a3, a3, -1
+ bnez a3, 2b
+ addiu a0, a0, 1
+
+3:
+ RESTORE_REGS_FROM_STACK 0, v0
+ j ra
+ nop
+
+END(pixman_composite_add_n_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, s0, s1, s2
+ li t4, 0x00ff00ff
+ beqz a3, 3f
+ nop
+ addiu t1, a3, -1
+ beqz t1, 2f
+ nop
+1:
+ /* a1 = source (32bit constant) */
+ lbu t0, 0(a2) /* t0 = mask (a8) */
+ lbu t1, 1(a2) /* t1 = mask (a8) */
+ lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+ lw t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+ addiu a2, a2, 2
+
+ MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 a1, a1, \
+ t0, t1, \
+ t2, t3, \
+ t5, t6, \
+ t4, t7, t8, t9, s0, s1, s2
+
+ sw t5, 0(a0)
+ sw t6, 4(a0)
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 8
+2:
+ beqz a3, 3f
+ nop
+ /* a1 = source (32bit constant) */
+ lbu t0, 0(a2) /* t0 = mask (a8) */
+ lw t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+ MIPS_UN8x4_MUL_UN8_ADD_UN8x4 a1, t0, t1, t2, t4, t3, t5, t6
+
+ sw t2, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+ j ra
+ nop
+
+END(pixman_composite_add_n_8_8888_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
/*
* a0 - *dst
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index 7327dc6..b330c0f 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -600,6 +600,28 @@ LEAF_MIPS32R2(symbol) \
addu_s.qb \out_8888, \out_8888, \d_8888
.endm
+.macro MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 s1_8888, \
+ s2_8888, \
+ m1_8, \
+ m2_8, \
+ d1_8888, \
+ d2_8888, \
+ out1_8888, \
+ out2_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, \
+ scratch4, scratch5, scratch6
+ MIPS_2xUN8x4_MUL_2xUN8 \s1_8888, \s2_8888, \
+ \m1_8, \m2_8, \
+ \out1_8888, \out2_8888, \
+ \maskLSR, \
+ \scratch1, \scratch2, \scratch3, \
+ \scratch4, \scratch5, \scratch6
+
+ addu_s.qb \out1_8888, \out1_8888, \d1_8888
+ addu_s.qb \out2_8888, \out2_8888, \d2_8888
+.endm
+
.macro BILINEAR_INTERPOLATE_SINGLE_PIXEL tl, tr, bl, br, \
scratch1, scratch2, \
alpha, red, green, blue \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index e80bbb6..30d2a85 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -59,6 +59,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8,
+ uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -67,6 +71,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565,
PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565,
uint16_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t, 1,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1,
uint8_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
@@ -271,6 +277,10 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mips_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mips_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mips_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mips_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, mips_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, mips_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, mips_composite_add_8_8_8),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),