ARM: reuse common NEON code for over_{n_8|8888_n|8888_8}_0565

Renamed supplementary macros from 'over_n_8_0565' to 'over_8888_8_0565', because they can actually support all variants of this operation: over_8888_8_0565/over_n_8_0565/over_8888_n_0565.

Also 'over_8888_8_0565' now uses the more optimized common code instead of its own variant, improving performance a bit. Even though this operation is still memory bandwidth limited, scaled variants of these fast paths may put more stress on the CPU later.

Benchmarked on ARM Cortex-A8 @500MHz:

== before ==
over_8888_8_0565 = L1: 67.10 L2: 53.82 M: 44.70 (105.17%) HT: 18.73 VT: 16.91 R: 14.25 RT: 4.80 (52Kops/s)

== after ==
over_8888_8_0565 = L1: 77.83 L2: 58.14 M: 44.82 (105.52%) HT: 20.58 VT: 17.44 R: 15.05 RT: 4.88 (52Kops/s)
author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-27 04:47:39 +0200
committer: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-12-03 15:37:19 +0200
commit: 3990931bf6197eff1cec06cf24bce53ddf9a539a (patch)
tree: 566a2626ad4f819aeb6dd2f75abfec6a5868d097
parent: a7c36681c0c1955ff9110b81f1789e56abb10a95 (diff)
1 files changed, 25 insertions, 36 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3e52a49f..4175144b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -791,7 +791,7 @@ generate_composite_function \
 
 /******************************************************************************/
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
     vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
     vmull.u8    q1,  d24, d9
     vmull.u8    q6,  d24, d10
@@ -816,7 +816,7 @@ generate_composite_function \
     vmull.u8    q10, d3, d30
 .endm
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
     /* 3 cycle bubble (after vmull.u8) */
     vrshr.u16   q13, q8,  #8
     vrshr.u16   q11, q9,  #8
@@ -835,7 +835,7 @@ generate_composite_function \
     vsri.u16    q14, q9,  #11
 .endm
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
     vld1.16     {d4, d5}, [DST_R, :128]!
     vshrn.u16   d6,  q2,  #8
     fetch_mask_pixblock
@@ -880,6 +880,23 @@ generate_composite_function \
     vmull.u8    q10, d3,  d30
 .endm
 
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
 /*
  * This function needs a special initialization of solid mask.
  * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
@@ -911,9 +928,9 @@ generate_composite_function \
     5, /* prefetch distance */ \
     pixman_composite_over_n_8_0565_init, \
     pixman_composite_over_n_8_0565_cleanup, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
 
 /******************************************************************************/
 
@@ -935,36 +952,8 @@ generate_composite_function \
     5, /* prefetch distance */ \
     pixman_composite_over_8888_n_0565_init, \
     pixman_composite_over_8888_n_0565_cleanup, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
-    vld1.16     {d4, d5}, [DST_R, :128]!
-    pixman_composite_over_n_8_0565_process_pixblock_tail
-    fetch_src_pixblock
-    cache_preload 8, 8
-    fetch_mask_pixblock
-    pixman_composite_over_n_8_0565_process_pixblock_head
-    vst1.16     {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
     28, /* dst_w_basereg */ \
     4,  /* dst_r_basereg */ \
author	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2010-11-27 04:47:39 +0200
committer	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2010-12-03 15:37:19 +0200
commit	3990931bf6197eff1cec06cf24bce53ddf9a539a (patch)
tree	566a2626ad4f819aeb6dd2f75abfec6a5868d097
parent	a7c36681c0c1955ff9110b81f1789e56abb10a95 (diff)