diff options
-rw-r--r-- | pixman/pixman-access.c | 8 | ||||
-rw-r--r-- | pixman/pixman-arm-neon-asm.S | 161 | ||||
-rw-r--r-- | pixman/pixman-arm-neon-asm.h | 6 | ||||
-rw-r--r-- | pixman/pixman-arm-neon.c | 55 | ||||
-rw-r--r-- | test/blitters-test-bisect.rb | 6 | ||||
-rw-r--r-- | test/blitters-test.c | 57 |
6 files changed, 240 insertions, 53 deletions
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c index 5f672579..5e72b421 100644 --- a/pixman/pixman-access.c +++ b/pixman/pixman-access.c @@ -1088,20 +1088,20 @@ fetch_scanline_ayuv (pixman_image_t *image, const uint8_t *bits = (const uint8_t *) (image->bits.bits + image->bits.rowstride * line + x); int i; - + for (i = 0; i < width; i++) { int32_t a, y, u, v, r, g, b; - + a = bits[0]; y = bits[1]; u = bits[2]; v = bits[3]; - + YUV2RGB_CHROMA (r, g, b, u, v); YUV2RGB_ADD (r, g, b, y); YUV2RGB_STORE_ALPHA (*buffer, a, r, g, b); - + buffer++; bits += 4; } diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index e8ccf77a..b2298481 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1049,3 +1049,164 @@ generate_composite_function \ 0, /* dst_r_basereg */ \ 0, /* src_basereg */ \ 0 /* mask_basereg */ + +/******************************************************************************/ + +/* + * YUV->RGB conversion, optimized for the use of fast 8-bit NEON multiplications, + * but aiming to minimize precision loss as much as possible. + * + * C pseudocode: + * + * y = clamp_range(y, 16, 235) - 16; + * u = clamp_range(u, 16, 240) - 16; + * v = clamp_range(v, 16, 240) - 16; + * + * r = clamp((int16_t)(((int16_t)(149 * y) - (int16_t)(22840 - 204 * v - (v >> 1))) >> 1) >> 6); + * g = clamp((int16_t)(((int16_t)(149 * y) + (int16_t)(17312 - 50 * u - 104 * v)) >> 1) >> 6); + * b = clamp((int16_t)(((int16_t)(149 * y) - (int16_t)(28832 - 258 * u)) >> 1) >> 6); + * + * The use of unsigned multiplications gains one extra bit of precision (sign is hidden in + * the ADD or SUB operations which are done with the result). Also the use of VHADD/VHSUB + * instructions allows to preserve one more bit in intermediate calculations (two 16-bit + * values are added together to get intermediate 17-bit result and shifted right by one + * bit). + */ + +/* + * Supplementary macro for packed YUV->RGB conversion + * + * Registers: + * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data + * q2, q3 : d4, d5, d6, d7 - together with d0-d3 are used for storing + * converted RGB data (actual layout + * depends on macro arguments) + * q4-q5 : d8, d9, d10, d11 - constants for clamping input YUV data + * q6 - used for temporary storage + * + * q7 - reserved + * + * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data + * q10 : d20, d21 + * q11 : d22, d23 + * q12 : d24, d25 + * q13 : d26, d27 + * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154) + */ + +.macro packed_yuv_to_rgb_helper is_uyvy_format, r_1, g_1, b_1, r_2, g_2, b_2 + /* convert from 'packed' to 'planar' representation */ +.if is_uyvy_format /* uyvy */ + vuzp.8 d0, d1 /* d1 - separated Y data (first 8 bytes) */ + vuzp.8 d2, d3 /* d3 - separated Y data (next 8 bytes) */ +.else /* yuy2 (TODO: get rid of vswp) */ + vswp d0, d1 + vswp d2, d3 + vuzp.8 d1, d0 /* d1 - separated Y data (first 8 bytes) */ + vuzp.8 d3, d2 /* d3 - separated Y data (next 8 bytes) */ +.endif + vuzp.8 d0, d2 /* d0 - separated U data, d2 - separated V data */ + /* split even and odd Y color components */ + vuzp.8 d1, d3 /* d1 - evenY, d3 - oddY */ + /* clamp Y to [16, 235] range, U/V to [16, 240] and subtract 16 */ + vqadd.u8 q0, q0, q4 + vqadd.u8 q1, q1, q4 + vqsub.u8 q0, q0, q5 + vqsub.u8 q1, q1, q5 + /* perform the conversion */ + adrl DUMMY, yuv_rgb_acc_r + vshr.u8 d4, d2, #1 /* d4 = V >> 1 */ + vmull.u8 q8, d1, d27 /* q8 = evenY * 149 */ + vmull.u8 q9, d3, d27 /* q9 = oddY * 149 */ + vld1.16 {d20, d21}, [DUMMY, :128]! /* q10 - initialize accumulator for red */ + vsubw.u8 q10, q10, d4 /* red acc -= (V >> 1) */ + vmlsl.u8 q10, d2, d28 /* red acc -= V * 204 */ + vld1.16 {d22, d23}, [DUMMY, :128]! /* q11 - initialize accumulator for green */ + vmlsl.u8 q11, d2, d30 /* green acc -= V * 104 */ + vmlsl.u8 q11, d0, d29 /* green acc -= U * 50 */ + vld1.16 {d24, d25}, [DUMMY, :128]! /* q12 - initialize accumulator for blue */ + vmlsl.u8 q12, d0, d30 /* blue acc -= U * 104 */ + vmlsl.u8 q12, d0, d31 /* blue acc -= U * 154 */ + vhsub.s16 q6, q8, q10 /* calculate even red components */ + vhsub.s16 q10, q9, q10 /* calculate odd red components */ + vqshrun.s16 r_1, q6, #6 /* right shift, narrow and saturate even red components */ + vqshrun.s16 r_2, q10, #6 /* right shift, narrow and saturate odd red components */ + vhadd.s16 q6, q8, q11 /* calculate even green components */ + vhadd.s16 q11, q9, q11 /* calculate odd green components */ + vqshrun.s16 g_1, q6, #6 /* right shift, narrow and saturate even green components */ + vqshrun.s16 g_2, q11, #6 /* right shift, narrow and saturate odd green components */ + vhsub.s16 q6, q8, q12 /* calculate even blue components */ + vhsub.s16 q12, q9, q12 /* calculate odd blue components */ + vqshrun.s16 b_1, q6, #6 /* right shift, narrow and saturate even blue components */ + vqshrun.s16 b_2, q12, #6 /* right shift, narrow and saturate odd blue components */ + vzip.8 r_1, r_2 /* join even and odd red components */ + vzip.8 g_1, g_2 /* join even and odd green components */ + vzip.8 b_1, b_2 /* join even and odd blue components */ +.endm + + .balign 32 +yuv_rgb_acc_r: + .rept 8 + .hword 22840 + .endr +yuv_rgb_acc_g: + .rept 8 + .hword 17312 + .endr +yuv_rgb_acc_b: + .rept 8 + .hword 28832 + .endr + +/******************************************************************************/ + +.macro pixman_composite_src_yuy2_0888_process_pixblock_head + packed_yuv_to_rgb_helper 0, d0, d1, d2, d3, d4, d5 +.endm + +.macro pixman_composite_src_yuy2_0888_process_pixblock_tail +.endm + +.macro pixman_composite_src_yuy2_0888_process_pixblock_tail_head + pixman_composite_src_yuy2_0888_process_pixblock_tail + vst3.8 {d0, d1, d2}, [DST_W]! + vst3.8 {d3, d4, d5}, [DST_W]! + vld1.16 {d0, d1, d2, d3}, [SRC]! + pixman_composite_src_yuy2_0888_process_pixblock_head + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_yuy2_0888_init + vpush {d8-d15} + /* Initialize some constants */ + vmov.u8 d8, #15 /* add this to U/V to saturate upper boundary */ + vmov.u8 d9, #20 /* add this to Y to saturate upper boundary */ + vmov.u8 d10, #31 /* sub this from U/V to saturate lower boundary */ + vmov.u8 d11, #36 /* sub this from Y to saturate lower boundary */ + vmov.u8 d26, #16 + vmov.u8 d27, #149 + vmov.u8 d28, #204 + vmov.u8 d29, #50 + vmov.u8 d30, #104 + vmov.u8 d31, #154 +.endm + +.macro pixman_composite_src_yuy2_0888_cleanup + vpop {d8-d15} +.endm + + +generate_composite_function \ + pixman_composite_src_yuy2_0888_asm_neon, 16, 0, 24, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 16, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_yuy2_0888_init, \ + pixman_composite_src_yuy2_0888_cleanup, \ + pixman_composite_src_yuy2_0888_process_pixblock_head, \ + pixman_composite_src_yuy2_0888_process_pixblock_tail, \ + pixman_composite_src_yuy2_0888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h index e7be5cdd..ecdf045a 100644 --- a/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman-arm-neon-asm.h @@ -148,6 +148,9 @@ .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 16) + pixldst3 vld3, 8, %(basereg+6), %(basereg+7), %(basereg+8), mem_operand + pixldst3 vld3, 8, %(basereg+9), %(basereg+10), %(basereg+11), mem_operand .elseif (bpp == 24) && (numpix == 8) pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand .elseif (bpp == 24) && (numpix == 4) @@ -171,6 +174,9 @@ .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 16) + pixldst3 vst3, 8, %(basereg+6), %(basereg+7), %(basereg+8), mem_operand + pixldst3 vst3, 8, %(basereg+9), %(basereg+10), %(basereg+11), mem_operand .elseif (bpp == 24) && (numpix == 8) pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand .elseif (bpp == 24) && (numpix == 4) diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 2ed8b4bd..bfcce8b0 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -228,6 +228,60 @@ BIND_SRC_N_DST(composite_over_8888_n_8888, uint32_t, 1, uint32_t, 1) BIND_SRC_MASK_DST(composite_add_8_8_8, uint8_t, 1, uint8_t, 1, uint8_t, 1) void +pixman_composite_src_yuy2_0888_asm_neon (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint16_t *src, + int32_t src_stride); + +static void +neon_composite_src_yuy2_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line; + uint16_t *src_line; + int32_t dst_stride, src_stride; + + /* TODO: handle all cases in assembly */ + if ((src_x & 1) != 0 || (width & 1) != 0) + { + /* + * TODO: if fallback is still going to be used, maybe call + * 'general_composite_rect' directly? + */ + _pixman_implementation_composite (imp->delegate, op, + src_image, mask_image, dst_image, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); + return; + } + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, + dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, + src_stride, src_line, 1); + + pixman_composite_src_yuy2_0888_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride); +} + + +void pixman_composite_src_n_8_asm_neon (int32_t w, int32_t h, uint8_t *dst, @@ -305,6 +359,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] = { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888 }, { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888 }, { PIXMAN_OP_SRC, PIXMAN_r8g8b8, PIXMAN_null, PIXMAN_r8g8b8, neon_composite_src_0888_0888 }, + { PIXMAN_OP_SRC, PIXMAN_yuy2, PIXMAN_null, PIXMAN_b8g8r8, neon_composite_src_yuy2_0888 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888 }, diff --git a/test/blitters-test-bisect.rb b/test/blitters-test-bisect.rb index 62ff782e..2db78336 100644 --- a/test/blitters-test-bisect.rb +++ b/test/blitters-test-bisect.rb @@ -27,8 +27,8 @@ end base = 1 while true do # run infinitely, processing 100000 test cases per iteration - printf("running tests %d-%d\n", base, base + 100000 - 1); - res = test_range(base, base + 100000 - 1) + printf("running tests %d-%d\n", base, base + 10000 - 1); + res = test_range(base, base + 10000 - 1) if res then printf("-- ref --\n") printf("%s\n", `#{ARGV[0]} -#{res}`) @@ -39,5 +39,5 @@ while true do printf("#{ARGV[1]} -%d\n", res) exit(1) end - base += 100000 + base += 10000 end diff --git a/test/blitters-test.c b/test/blitters-test.c index ac816eba..cbf7fd69 100644 --- a/test/blitters-test.c +++ b/test/blitters-test.c @@ -192,53 +192,16 @@ static pixman_op_t op_list[] = { #endif }; +static pixman_format_code_t src_img_fmt_list[] = { + PIXMAN_yuy2, + -1 +}; + static pixman_format_code_t img_fmt_list[] = { PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, - PIXMAN_r5g6b5, - PIXMAN_r3g3b2, - PIXMAN_a8, - PIXMAN_a8b8g8r8, - PIXMAN_x8b8g8r8, - PIXMAN_b8g8r8a8, - PIXMAN_b8g8r8x8, - PIXMAN_r8g8b8, PIXMAN_b8g8r8, PIXMAN_r5g6b5, - PIXMAN_b5g6r5, - PIXMAN_x2r10g10b10, - PIXMAN_a2r10g10b10, - PIXMAN_x2b10g10r10, - PIXMAN_a2b10g10r10, - PIXMAN_a1r5g5b5, - PIXMAN_x1r5g5b5, - PIXMAN_a1b5g5r5, - PIXMAN_x1b5g5r5, - PIXMAN_a4r4g4b4, - PIXMAN_x4r4g4b4, - PIXMAN_a4b4g4r4, - PIXMAN_x4b4g4r4, - PIXMAN_a8, - PIXMAN_r3g3b2, - PIXMAN_b2g3r3, - PIXMAN_a2r2g2b2, - PIXMAN_a2b2g2r2, -#if 0 /* using these crashes the test */ - PIXMAN_c8, - PIXMAN_g8, - PIXMAN_x4c4, - PIXMAN_x4g4, - PIXMAN_c4, - PIXMAN_g4, - PIXMAN_g1, -#endif - PIXMAN_x4a4, - PIXMAN_a4, - PIXMAN_r1g2b1, - PIXMAN_b1g2r1, - PIXMAN_a1r1g1b1, - PIXMAN_a1b1g1r1, - PIXMAN_a1, -1 }; @@ -293,7 +256,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose) if (lcg_rand_n (8)) { /* normal image */ - src_img = create_random_image (img_fmt_list, max_width, max_height, + src_img = create_random_image (src_img_fmt_list, max_width, max_height, max_extra_stride, &src_fmt); } else @@ -397,6 +360,8 @@ test_composite (uint32_t initcrc, int testnum, int verbose) return crc32; } +#define N 100000 + int main (int argc, char *argv[]) { @@ -416,7 +381,7 @@ main (int argc, char *argv[]) else { n1 = 1; - n2 = 2000000; + n2 = N; } if (n2 < 0) @@ -435,12 +400,12 @@ main (int argc, char *argv[]) } printf ("crc32=%08X\n", crc); - if (n2 == 2000000) + if (n2 == N) { /* Predefined value for running with all the fastpath functions disabled. It needs to be updated every time when changes are introduced to this program or behavior of pixman changes! */ - if (crc == 0x1911E2C3) + if (crc == 0x3D271F0E) { printf ("blitters test passed\n"); } |