author     Søren Sandmann Pedersen <sandmann@redhat.com>   2009-11-21 21:57:10 +0100
committer  Søren Sandmann Pedersen <sandmann@redhat.com>   2009-11-21 21:57:10 +0100
commit     31666460ed3fa926e8876585c85b72009f9fa81e (patch)
tree       a7068fd84cc20c5a0254c7984b60bf57069273b2
parent     cf178e6e009df0e970e45882fe244dc125e929fc (diff)
parent     d96bea4f6e4ad248693dcf0bd6a45d51d6f6cf8a (diff)

Merge commit 'ssvb/yuv-arm-neon-wip' into company-yuv

Conflicts:
	pixman/pixman-access.c
-rw-r--r--  pixman/pixman-access.c       |   8
-rw-r--r--  pixman/pixman-arm-neon-asm.S | 161
-rw-r--r--  pixman/pixman-arm-neon-asm.h |   6
-rw-r--r--  pixman/pixman-arm-neon.c     |  55
-rw-r--r--  test/blitters-test-bisect.rb |   6
-rw-r--r--  test/blitters-test.c         |  57
6 files changed, 240 insertions(+), 53 deletions(-)
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
index 5f672579..5e72b421 100644
--- a/pixman/pixman-access.c
+++ b/pixman/pixman-access.c
@@ -1088,20 +1088,20 @@ fetch_scanline_ayuv (pixman_image_t *image,
const uint8_t *bits = (const uint8_t *)
(image->bits.bits + image->bits.rowstride * line + x);
int i;
-
+
for (i = 0; i < width; i++)
{
int32_t a, y, u, v, r, g, b;
-
+
a = bits[0];
y = bits[1];
u = bits[2];
v = bits[3];
-
+
YUV2RGB_CHROMA (r, g, b, u, v);
YUV2RGB_ADD (r, g, b, y);
YUV2RGB_STORE_ALPHA (*buffer, a, r, g, b);
-
+
buffer++;
bits += 4;
}
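
For reference, each iteration of the loop above converts one packed AYUV sample through the YUV2RGB_* macros. Their definitions are not part of this hunk, so the sketch below is only a guess at their shape, based on their names and on the BT.601 coefficients pixman uses in its other YUV fetch paths; FIX, clamp8, and ayuv_pixel_to_argb are illustrative names, not code from the patch:

    #include <stdint.h>

    /* 16.16 fixed-point representation of a floating-point coefficient */
    #define FIX(x) ((int32_t) ((x) * 65536.0 + 0.5))

    static inline uint8_t
    clamp8 (int32_t x)
    {
        x = (x + 0x8000) >> 16;   /* round back from 16.16 fixed point */
        return x < 0 ? 0 : (x > 255 ? 255 : (uint8_t) x);
    }

    /* Hypothetical expansion of one loop iteration (BT.601):
     *   R = 1.164 (Y - 16) + 1.596 (V - 128)
     *   G = 1.164 (Y - 16) - 0.813 (V - 128) - 0.391 (U - 128)
     *   B = 1.164 (Y - 16) + 2.018 (U - 128)
     */
    static uint32_t
    ayuv_pixel_to_argb (uint8_t a, uint8_t y8, uint8_t u8, uint8_t v8)
    {
        int32_t y = y8 - 16, u = u8 - 128, v = v8 - 128;

        /* YUV2RGB_CHROMA: contributions that depend only on U/V */
        int32_t r =  FIX (1.596) * v;
        int32_t g = -FIX (0.813) * v - FIX (0.391) * u;
        int32_t b =  FIX (2.018) * u;

        /* YUV2RGB_ADD: add the common luma term */
        r += FIX (1.164) * y;
        g += FIX (1.164) * y;
        b += FIX (1.164) * y;

        /* YUV2RGB_STORE_ALPHA: clamp, pack, keep alpha untouched */
        return ((uint32_t) a << 24)          |
               ((uint32_t) clamp8 (r) << 16) |
               ((uint32_t) clamp8 (g) << 8)  |
                (uint32_t) clamp8 (b);
    }
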
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index e8ccf77a..b2298481 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1049,3 +1049,164 @@ generate_composite_function \
0, /* dst_r_basereg */ \
0, /* src_basereg */ \
0 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * YUV->RGB conversion, optimized to use fast 8-bit NEON multiplications
+ * while keeping precision loss as small as possible.
+ *
+ * C pseudocode:
+ *
+ * y = clamp_range(y, 16, 235) - 16;
+ * u = clamp_range(u, 16, 240) - 16;
+ * v = clamp_range(v, 16, 240) - 16;
+ *
+ * r = clamp((int16_t)(((int16_t)(149 * y) - (int16_t)(22840 - 204 * v - (v >> 1))) >> 1) >> 6);
+ * g = clamp((int16_t)(((int16_t)(149 * y) + (int16_t)(17312 - 50 * u - 104 * v)) >> 1) >> 6);
+ * b = clamp((int16_t)(((int16_t)(149 * y) - (int16_t)(28832 - 258 * u)) >> 1) >> 6);
+ *
+ * The use of unsigned multiplications gains one extra bit of precision (the sign is
+ * hidden in the ADD or SUB operations performed on the result). The use of VHADD/VHSUB
+ * instructions also preserves one more bit in intermediate calculations: two 16-bit
+ * values are added together into an intermediate 17-bit result, which is then shifted
+ * right by one bit.
+ */
+
+/*
+ * Supplementary macro for packed YUV->RGB conversion
+ *
+ * Registers:
+ * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data
+ * q2, q3 : d4, d5, d6, d7 - together with d0-d3 are used for storing
+ * converted RGB data (actual layout
+ * depends on macro arguments)
+ * q4-q5 : d8, d9, d10, d11 - constants for clamping input YUV data
+ * q6 - used for temporary storage
+ *
+ * q7 - reserved
+ *
+ * q8, q9 : d16, d17, d18, d19 - widened Y data (the 16-bit Y * 149 products)
+ * q10 : d20, d21 - red accumulator
+ * q11 : d22, d23 - green accumulator
+ * q12 : d24, d25 - blue accumulator
+ * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154)
+ */
+
+.macro packed_yuv_to_rgb_helper is_uyvy_format, r_1, g_1, b_1, r_2, g_2, b_2
+ /* convert from 'packed' to 'planar' representation */
+.if is_uyvy_format /* uyvy */
+ vuzp.8 d0, d1 /* d1 - separated Y data (first 8 bytes) */
+ vuzp.8 d2, d3 /* d3 - separated Y data (next 8 bytes) */
+.else /* yuy2 (TODO: get rid of vswp) */
+ vswp d0, d1
+ vswp d2, d3
+ vuzp.8 d1, d0 /* d1 - separated Y data (first 8 bytes) */
+ vuzp.8 d3, d2 /* d3 - separated Y data (next 8 bytes) */
+.endif
+ vuzp.8 d0, d2 /* d0 - separated U data, d2 - separated V data */
+ /* split even and odd Y color components */
+ vuzp.8 d1, d3 /* d1 - evenY, d3 - oddY */
+ /* clamp Y to [16, 235] range, U/V to [16, 240] and subtract 16 */
+ vqadd.u8 q0, q0, q4
+ vqadd.u8 q1, q1, q4
+ vqsub.u8 q0, q0, q5
+ vqsub.u8 q1, q1, q5
+ /* perform the conversion */
+ adrl DUMMY, yuv_rgb_acc_r
+ vshr.u8 d4, d2, #1 /* d4 = V >> 1 */
+ vmull.u8 q8, d1, d27 /* q8 = evenY * 149 */
+ vmull.u8 q9, d3, d27 /* q9 = oddY * 149 */
+ vld1.16 {d20, d21}, [DUMMY, :128]! /* q10 - initialize accumulator for red */
+ vsubw.u8 q10, q10, d4 /* red acc -= (V >> 1) */
+ vmlsl.u8 q10, d2, d28 /* red acc -= V * 204 */
+ vld1.16 {d22, d23}, [DUMMY, :128]! /* q11 - initialize accumulator for green */
+ vmlsl.u8 q11, d2, d30 /* green acc -= V * 104 */
+ vmlsl.u8 q11, d0, d29 /* green acc -= U * 50 */
+ vld1.16 {d24, d25}, [DUMMY, :128]! /* q12 - initialize accumulator for blue */
+ vmlsl.u8 q12, d0, d30 /* blue acc -= U * 104 */
+ vmlsl.u8 q12, d0, d31 /* blue acc -= U * 154 */
+ vhsub.s16 q6, q8, q10 /* calculate even red components */
+ vhsub.s16 q10, q9, q10 /* calculate odd red components */
+ vqshrun.s16 r_1, q6, #6 /* right shift, narrow and saturate even red components */
+ vqshrun.s16 r_2, q10, #6 /* right shift, narrow and saturate odd red components */
+ vhadd.s16 q6, q8, q11 /* calculate even green components */
+ vhadd.s16 q11, q9, q11 /* calculate odd green components */
+ vqshrun.s16 g_1, q6, #6 /* right shift, narrow and saturate even green components */
+ vqshrun.s16 g_2, q11, #6 /* right shift, narrow and saturate odd green components */
+ vhsub.s16 q6, q8, q12 /* calculate even blue components */
+ vhsub.s16 q12, q9, q12 /* calculate odd blue components */
+ vqshrun.s16 b_1, q6, #6 /* right shift, narrow and saturate even blue components */
+ vqshrun.s16 b_2, q12, #6 /* right shift, narrow and saturate odd blue components */
+ vzip.8 r_1, r_2 /* join even and odd red components */
+ vzip.8 g_1, g_2 /* join even and odd green components */
+ vzip.8 b_1, b_2 /* join even and odd blue components */
+.endm
+
+ .balign 32
+yuv_rgb_acc_r:
+ .rept 8
+ .hword 22840
+ .endr
+yuv_rgb_acc_g:
+ .rept 8
+ .hword 17312
+ .endr
+yuv_rgb_acc_b:
+ .rept 8
+ .hword 28832
+ .endr
+
+/******************************************************************************/
+
+.macro pixman_composite_src_yuy2_0888_process_pixblock_head
+ packed_yuv_to_rgb_helper 0, d0, d1, d2, d3, d4, d5
+.endm
+
+.macro pixman_composite_src_yuy2_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_yuy2_0888_process_pixblock_tail_head
+ pixman_composite_src_yuy2_0888_process_pixblock_tail
+ vst3.8 {d0, d1, d2}, [DST_W]!
+ vst3.8 {d3, d4, d5}, [DST_W]!
+ vld1.16 {d0, d1, d2, d3}, [SRC]!
+ pixman_composite_src_yuy2_0888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_yuy2_0888_init
+ vpush {d8-d15}
+ /* Initialize some constants */
+    vmov.u8 d8, #15  /* add to U/V: saturates the upper boundary (240 + 15 = 255) */
+    vmov.u8 d9, #20  /* add to Y: saturates the upper boundary (235 + 20 = 255) */
+    vmov.u8 d10, #31 /* sub from U/V: clamps the lower boundary and removes the bias (15 + 16) */
+    vmov.u8 d11, #36 /* sub from Y: clamps the lower boundary and removes the bias (20 + 16) */
+ vmov.u8 d26, #16
+ vmov.u8 d27, #149
+ vmov.u8 d28, #204
+ vmov.u8 d29, #50
+ vmov.u8 d30, #104
+ vmov.u8 d31, #154
+.endm
+
+.macro pixman_composite_src_yuy2_0888_cleanup
+ vpop {d8-d15}
+.endm
+
+
+generate_composite_function \
+ pixman_composite_src_yuy2_0888_asm_neon, 16, 0, 24, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 16, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_yuy2_0888_init, \
+ pixman_composite_src_yuy2_0888_cleanup, \
+ pixman_composite_src_yuy2_0888_process_pixblock_head, \
+ pixman_composite_src_yuy2_0888_process_pixblock_tail, \
+ pixman_composite_src_yuy2_0888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
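
To make the comment block at the top of this addition easier to verify, here is a scalar C model of the same fixed-point pipeline. It mirrors the accumulator tables (22840/17312/28832), the VSUBW/VMLSL steps, and the combined >>7 performed by VHADD/VHSUB plus VQSHRUN #6. The names are illustrative, not part of the patch, and the sketch assumes arithmetic right shift on negative values, as on mainstream compilers:

    #include <stdint.h>

    static inline int32_t
    clamp_range (int32_t x, int32_t lo, int32_t hi)
    {
        return x < lo ? lo : (x > hi ? hi : x);
    }

    static inline uint8_t
    clamp_u8 (int32_t x)   /* what VQSHRUN's saturation does after the shift */
    {
        return x < 0 ? 0 : (x > 255 ? 255 : (uint8_t) x);
    }

    /* Scalar model of packed_yuv_to_rgb_helper for a single Y/U/V sample */
    static void
    yuv_to_rgb_scalar (int32_t y, int32_t u, int32_t v,
                       uint8_t *r, uint8_t *g, uint8_t *b)
    {
        /* the VQADD/VQSUB pairs: clamp to the video range and remove the
         * offset (e.g. Y: saturating +20 then -36 == clamp to [16, 235]
         * followed by -16) */
        y = clamp_range (y, 16, 235) - 16;
        u = clamp_range (u, 16, 240) - 16;
        v = clamp_range (v, 16, 240) - 16;

        /* accumulators: the yuv_rgb_acc_{r,g,b} tables minus the VSUBW/VMLSL
         * steps; 258 * u is split into 104 * u + 154 * u in the assembly
         * because NEON multiplier operands are limited to 8 bits */
        int32_t acc_r = 22840 - 204 * v - (v >> 1);
        int32_t acc_g = 17312 -  50 * u - 104 * v;
        int32_t acc_b = 28832 - 258 * u;

        /* VHSUB/VHADD (halving add/sub, i.e. >>1 without losing the 17th
         * intermediate bit) followed by VQSHRUN #6: a total division by
         * 128, so all the coefficients above are in units of 1/128 */
        *r = clamp_u8 (((149 * y - acc_r) >> 1) >> 6);
        *g = clamp_u8 (((149 * y + acc_g) >> 1) >> 6);
        *b = clamp_u8 (((149 * y - acc_b) >> 1) >> 6);
    }
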
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index e7be5cdd..ecdf045a 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -148,6 +148,9 @@
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
%(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 16)
+ pixldst3 vld3, 8, %(basereg+6), %(basereg+7), %(basereg+8), mem_operand
+ pixldst3 vld3, 8, %(basereg+9), %(basereg+10), %(basereg+11), mem_operand
.elseif (bpp == 24) && (numpix == 8)
pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
@@ -171,6 +174,9 @@
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
%(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 16)
+ pixldst3 vst3, 8, %(basereg+6), %(basereg+7), %(basereg+8), mem_operand
+ pixldst3 vst3, 8, %(basereg+9), %(basereg+10), %(basereg+11), mem_operand
.elseif (bpp == 24) && (numpix == 8)
pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
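
The new 16-pixel case simply issues the existing 8-pixel vld3/vst3 pattern twice over consecutive d-register triples. Expressed with NEON intrinsics (an illustrative sketch only; the real code stays in the assembly macros above), the load side looks roughly like this:

    #include <arm_neon.h>

    /* Load 16 packed 24bpp (r8g8b8) pixels into separate R/G/B planes,
     * mirroring the two vld3.8 issued by pixld for bpp==24, numpix==16. */
    static void
    load_16_rgb888 (const uint8_t *src,
                    uint8x16_t *r, uint8x16_t *g, uint8x16_t *b)
    {
        uint8x8x3_t lo = vld3_u8 (src);        /* first 8 pixels (24 bytes) */
        uint8x8x3_t hi = vld3_u8 (src + 24);   /* next 8 pixels (24 bytes) */

        *r = vcombine_u8 (lo.val[0], hi.val[0]);
        *g = vcombine_u8 (lo.val[1], hi.val[1]);
        *b = vcombine_u8 (lo.val[2], hi.val[2]);
    }
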
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 2ed8b4bd..bfcce8b0 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -228,6 +228,60 @@ BIND_SRC_N_DST(composite_over_8888_n_8888, uint32_t, 1, uint32_t, 1)
BIND_SRC_MASK_DST(composite_add_8_8_8, uint8_t, 1, uint8_t, 1, uint8_t, 1)
void
+pixman_composite_src_yuy2_0888_asm_neon (int32_t w,
+ int32_t h,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint16_t *src,
+ int32_t src_stride);
+
+static void
+neon_composite_src_yuy2_0888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line;
+ uint16_t *src_line;
+ int32_t dst_stride, src_stride;
+
+ /* TODO: handle all cases in assembly */
+ if ((src_x & 1) != 0 || (width & 1) != 0)
+ {
+ /*
+ * TODO: if fallback is still going to be used, maybe call
+ * 'general_composite_rect' directly?
+ */
+ _pixman_implementation_composite (imp->delegate, op,
+ src_image, mask_image, dst_image,
+ src_x, src_y,
+ mask_x, mask_y,
+ dest_x, dest_y,
+ width, height);
+ return;
+ }
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t,
+ dst_stride, dst_line, 3);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t,
+ src_stride, src_line, 1);
+
+ pixman_composite_src_yuy2_0888_asm_neon (width, height,
+ dst_line, dst_stride,
+ src_line, src_stride);
+}
+
+
+void
pixman_composite_src_n_8_asm_neon (int32_t w,
int32_t h,
uint8_t *dst,
@@ -305,6 +359,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] =
{ PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888 },
{ PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888 },
{ PIXMAN_OP_SRC, PIXMAN_r8g8b8, PIXMAN_null, PIXMAN_r8g8b8, neon_composite_src_0888_0888 },
+ { PIXMAN_OP_SRC, PIXMAN_yuy2, PIXMAN_null, PIXMAN_b8g8r8, neon_composite_src_yuy2_0888 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888 },
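
For context, the new entry fires on a plain SRC composite from a PIXMAN_yuy2 image to a PIXMAN_b8g8r8 one. A minimal caller might look like the sketch below (assuming the pixman_image_create_bits / pixman_image_composite API of this era; sizes and strides are illustrative):

    #include <pixman.h>
    #include <stdint.h>

    int
    main (void)
    {
        /* YUY2 stores one U/V pair per two pixels, which is why
         * neon_composite_src_yuy2_0888 falls back to the delegate
         * implementation for odd src_x or odd width. */
        static uint16_t yuv[64 * 64];        /* 64x64 yuy2, 2 bytes/pixel */
        static uint8_t  rgb[64 * 64 * 3];    /* 64x64 b8g8r8 */

        pixman_image_t *src = pixman_image_create_bits (
            PIXMAN_yuy2, 64, 64, (uint32_t *) yuv, 64 * 2);
        pixman_image_t *dst = pixman_image_create_bits (
            PIXMAN_b8g8r8, 64, 64, (uint32_t *) rgb, 64 * 3);

        pixman_image_composite (PIXMAN_OP_SRC, src, NULL, dst,
                                0, 0, 0, 0, 0, 0, 64, 64);

        pixman_image_unref (src);
        pixman_image_unref (dst);
        return 0;
    }
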
diff --git a/test/blitters-test-bisect.rb b/test/blitters-test-bisect.rb
index 62ff782e..2db78336 100644
--- a/test/blitters-test-bisect.rb
+++ b/test/blitters-test-bisect.rb
@@ -27,8 +27,8 @@ end
base = 1
while true do
- # run infinitely, processing 100000 test cases per iteration
+ # run infinitely, processing 10000 test cases per iteration
- printf("running tests %d-%d\n", base, base + 100000 - 1);
- res = test_range(base, base + 100000 - 1)
+ printf("running tests %d-%d\n", base, base + 10000 - 1);
+ res = test_range(base, base + 10000 - 1)
if res then
printf("-- ref --\n")
printf("%s\n", `#{ARGV[0]} -#{res}`)
@@ -39,5 +39,5 @@ while true do
printf("#{ARGV[1]} -%d\n", res)
exit(1)
end
- base += 100000
+ base += 10000
end
diff --git a/test/blitters-test.c b/test/blitters-test.c
index ac816eba..cbf7fd69 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -192,53 +192,16 @@ static pixman_op_t op_list[] = {
#endif
};
+static pixman_format_code_t src_img_fmt_list[] = {
+ PIXMAN_yuy2,
+ -1
+};
+
static pixman_format_code_t img_fmt_list[] = {
PIXMAN_a8r8g8b8,
PIXMAN_x8r8g8b8,
- PIXMAN_r5g6b5,
- PIXMAN_r3g3b2,
- PIXMAN_a8,
- PIXMAN_a8b8g8r8,
- PIXMAN_x8b8g8r8,
- PIXMAN_b8g8r8a8,
- PIXMAN_b8g8r8x8,
- PIXMAN_r8g8b8,
PIXMAN_b8g8r8,
PIXMAN_r5g6b5,
- PIXMAN_b5g6r5,
- PIXMAN_x2r10g10b10,
- PIXMAN_a2r10g10b10,
- PIXMAN_x2b10g10r10,
- PIXMAN_a2b10g10r10,
- PIXMAN_a1r5g5b5,
- PIXMAN_x1r5g5b5,
- PIXMAN_a1b5g5r5,
- PIXMAN_x1b5g5r5,
- PIXMAN_a4r4g4b4,
- PIXMAN_x4r4g4b4,
- PIXMAN_a4b4g4r4,
- PIXMAN_x4b4g4r4,
- PIXMAN_a8,
- PIXMAN_r3g3b2,
- PIXMAN_b2g3r3,
- PIXMAN_a2r2g2b2,
- PIXMAN_a2b2g2r2,
-#if 0 /* using these crashes the test */
- PIXMAN_c8,
- PIXMAN_g8,
- PIXMAN_x4c4,
- PIXMAN_x4g4,
- PIXMAN_c4,
- PIXMAN_g4,
- PIXMAN_g1,
-#endif
- PIXMAN_x4a4,
- PIXMAN_a4,
- PIXMAN_r1g2b1,
- PIXMAN_b1g2r1,
- PIXMAN_a1r1g1b1,
- PIXMAN_a1b1g1r1,
- PIXMAN_a1,
-1
};
@@ -293,7 +256,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
if (lcg_rand_n (8))
{
/* normal image */
- src_img = create_random_image (img_fmt_list, max_width, max_height,
+ src_img = create_random_image (src_img_fmt_list, max_width, max_height,
max_extra_stride, &src_fmt);
}
else
@@ -397,6 +360,8 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
return crc32;
}
+#define N 100000
+
int
main (int argc, char *argv[])
{
@@ -416,7 +381,7 @@ main (int argc, char *argv[])
else
{
n1 = 1;
- n2 = 2000000;
+ n2 = N;
}
if (n2 < 0)
@@ -435,12 +400,12 @@ main (int argc, char *argv[])
}
printf ("crc32=%08X\n", crc);
- if (n2 == 2000000)
+ if (n2 == N)
{
/* Predefined value for running with all the fastpath functions
disabled. It needs to be updated every time when changes are
introduced to this program or behavior of pixman changes! */
- if (crc == 0x1911E2C3)
+ if (crc == 0x3D271F0E)
{
printf ("blitters test passed\n");
}
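
For reference, the guarded constant works because main chains every test case's output into a single running CRC, roughly like this (simplified from the unchanged part of main):

    uint32_t crc = 0;
    int i;

    /* each test case feeds its result into the running CRC, so a
     * behavioral change in any of the N cases moves the final value
     * and the reference constant has to be regenerated */
    for (i = n1; i <= n2; i++)
        crc = test_composite (crc, i, 0);
    printf ("crc32=%08X\n", crc);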