diff options
author | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-11-27 04:47:39 +0200 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-12-03 15:37:19 +0200 |
commit | 3990931bf6197eff1cec06cf24bce53ddf9a539a (patch) | |
tree | 566a2626ad4f819aeb6dd2f75abfec6a5868d097 | |
parent | a7c36681c0c1955ff9110b81f1789e56abb10a95 (diff) |
ARM: reuse common NEON code for over_{n_8|8888_n|8888_8}_0565
Renamed supplementary macros from 'over_n_8_0565' to 'over_8888_8_0565',
because they can actually support all variants of this operation:
over_8888_8_0565/over_n_8_0565/over_8888_n_0565.
Also, 'over_8888_8_0565' now uses the more optimized common code instead of
its own variant, improving performance a bit. Even though this operation is
still memory bandwidth limited, scaled variants of these fast paths may
put more stress on the CPU later.
Benchmarked on ARM Cortex-A8 @500MHz:
== before ==
over_8888_8_0565 = L1: 67.10 L2: 53.82 M: 44.70 (105.17%)
HT: 18.73 VT: 16.91 R: 14.25 RT: 4.80 (52Kops/s)
== after ==
over_8888_8_0565 = L1: 77.83 L2: 58.14 M: 44.82 (105.52%)
HT: 20.58 VT: 17.44 R: 15.05 RT: 4.88 (52Kops/s)
-rw-r--r-- | pixman/pixman-arm-neon-asm.S | 61 |
1 files changed, 25 insertions, 36 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 3e52a49f..4175144b 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -791,7 +791,7 @@ generate_composite_function \ /******************************************************************************/ -.macro pixman_composite_over_n_8_0565_process_pixblock_head +.macro pixman_composite_over_8888_8_0565_process_pixblock_head vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ vmull.u8 q1, d24, d9 vmull.u8 q6, d24, d10 @@ -816,7 +816,7 @@ generate_composite_function \ vmull.u8 q10, d3, d30 .endm -.macro pixman_composite_over_n_8_0565_process_pixblock_tail +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail /* 3 cycle bubble (after vmull.u8) */ vrshr.u16 q13, q8, #8 vrshr.u16 q11, q9, #8 @@ -835,7 +835,7 @@ generate_composite_function \ vsri.u16 q14, q9, #11 .endm -.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head vld1.16 {d4, d5}, [DST_R, :128]! vshrn.u16 d6, q2, #8 fetch_mask_pixblock @@ -880,6 +880,23 @@ generate_composite_function \ vmull.u8 q10, d3, d30 .endm +generate_composite_function \ + pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + /* * This function needs a special initialization of solid mask. 
* Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET @@ -911,9 +928,9 @@ generate_composite_function \ 5, /* prefetch distance */ \ pixman_composite_over_n_8_0565_init, \ pixman_composite_over_n_8_0565_cleanup, \ - pixman_composite_over_n_8_0565_process_pixblock_head, \ - pixman_composite_over_n_8_0565_process_pixblock_tail, \ - pixman_composite_over_n_8_0565_process_pixblock_tail_head + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head /******************************************************************************/ @@ -935,36 +952,8 @@ generate_composite_function \ 5, /* prefetch distance */ \ pixman_composite_over_8888_n_0565_init, \ pixman_composite_over_8888_n_0565_cleanup, \ - pixman_composite_over_n_8_0565_process_pixblock_head, \ - pixman_composite_over_n_8_0565_process_pixblock_tail, \ - pixman_composite_over_n_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head - vld1.16 {d4, d5}, [DST_R, :128]! - pixman_composite_over_n_8_0565_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_over_n_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
-.endm - -generate_composite_function \ - pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_n_8_0565_process_pixblock_head, \ - pixman_composite_over_n_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ 4, /* dst_r_basereg */ \ |