summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorM Joonas Pihlaja <jpihlaja@cc.helsinki.fi>2009-01-13 19:33:06 +0200
committerM Joonas Pihlaja <jpihlaja@cc.helsinki.fi>2009-01-13 19:33:06 +0200
commitb264fbcbe6f12d9f25a57f13f3747e2a51b2c2d9 (patch)
tree8e84d115d76e50518de0f9d0fcd890815d3e5a63
parenta30c4f05e916db2b04613dc22e357ab63235a776 (diff)
Separate unpremultiplier methods, clean them up, and kill the bad ones.
-rw-r--r--unpremultiply-div.c56
-rw-r--r--unpremultiply-inv32.c125
-rw-r--r--unpremultiply-lut.c79
-rw-r--r--unpremultiply-sse2.S1
-rw-r--r--unpremultiply.c378
5 files changed, 297 insertions, 342 deletions
diff --git a/unpremultiply-div.c b/unpremultiply-div.c
new file mode 100644
index 0000000..514aa28
--- /dev/null
+++ b/unpremultiply-div.c
@@ -0,0 +1,56 @@
+/* Reference implementation using divisions. Since the slow path is
+ * so very slow this version specialises runs of constant or solid
+ * pixels. */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components. Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+/* Masks are unsigned: 255 << 24 does not fit in a 32 bit signed int. */
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* Unpremultiply n pixels from src[] into dst[] using one integer
+ * division per colour component.  Solid pixels (alpha 255) are copied
+ * through, and the most recently converted pixel is remembered so a
+ * repeat of the same input skips the divisions. */
+void
+unpremultiply_with_div(
+    uint32_t * restrict dst,
+    uint32_t const * restrict src,
+    size_t n)
+{
+    /* One-entry cache: last non-solid input and its result. */
+    uint32_t cached_in = 0;
+    uint32_t cached_out = 0;
+
+    for (size_t k = 0; k < n; k++) {
+        uint32_t const pixel = src[k];
+        uint32_t const alpha = (pixel & AMASK) >> ASHIFT;
+
+        if (alpha == 255) {
+            dst[k] = pixel;
+            continue;
+        }
+        if (pixel != cached_in) {
+            if (alpha == 0) {
+                /* Fully transparent: output is all zero. */
+                cached_in = cached_out = 0;
+            } else {
+                uint32_t red   = (pixel >> RSHIFT) & 0xFF;
+                uint32_t green = (pixel >> GSHIFT) & 0xFF;
+                uint32_t blue  = (pixel >> BSHIFT) & 0xFF;
+                red   = red*255 / alpha;
+                green = green*255 / alpha;
+                blue  = blue*255 / alpha;
+                cached_in = pixel;
+                cached_out = (red<<RSHIFT) | (green<<GSHIFT) |
+                             (blue<<BSHIFT) | (alpha<<ASHIFT);
+            }
+        }
+        dst[k] = cached_out;
+    }
+}
diff --git a/unpremultiply-inv32.c b/unpremultiply-inv32.c
new file mode 100644
index 0000000..e9505de
--- /dev/null
+++ b/unpremultiply-inv32.c
@@ -0,0 +1,125 @@
+/* An unpremultiplier using reciprocal multiplication. It specialises
+ * constant runs and solid runs of pixels with low overhead loops and
+ * uses only a 1KB table of reciprocals. */
+/* gcc -c -W -Wall -O3 -funroll-all-loops -fomit-frame-pointer -std=c99 unpremultiply-inv32.c */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components. Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* Shift x left by y bits. Supports negative y for right shifts. */
+#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
+/* a / b rounded up, for a >= 0 and b > 0.  Fully parenthesised so it
+ * stays one operand inside a larger expression. */
+#define ceil_div(a,b) (((a) + (b)-1) / (b))
+
+/* The reciprocal_table[i] entries are defined by
+ *
+ * 0 when i = 0
+ * 255 / i when i > 0
+ *
+ * represented in fixed point format with RECIPROCAL_BITS fractional
+ * bits and rounded up, i.e. ceil(255 * 2^RECIPROCAL_BITS / i).  The
+ * R1..R3 helper macros expand R() over all 256 indices so the table
+ * is a compile-time constant. */
+#define RECIPROCAL_BITS 16
+static uint32_t const reciprocal_table[256] = {
+# define R(i) ((i) ? ceil_div(255*(1<<RECIPROCAL_BITS), (i)) : 0)
+# define R1(i) R(i), R(i+1), R(i+2), R(i+3)
+# define R2(i) R1(i), R1(i+4), R1(i+8), R1(i+12)
+# define R3(i) R2(i), R2(i+16), R2(i+32), R2(i+48)
+ R3(0), R3(64), R3(128), R3(192)
+};
+
+/* Unpremultiply num_pixels pixels from src[] into dst[] by multiplying
+ * each colour component with reciprocal_table[alpha].  Pixels are
+ * handled in short probe periods of PROBE_LENGTH pixels; when a probe
+ * is all solid (alpha 255) or all identical, a special case loop
+ * handles the rest of that run.  src and dst are restrict-qualified. */
+void
+unpremultiply_with_inv32(
+    uint32_t * restrict dst,
+    uint32_t const * restrict src,
+    size_t num_pixels)
+{
+#define PROBE_LENGTH 2
+
+    for (size_t i = 0; i < num_pixels; ) {
+        /* We want to identify long runs of constant input pixels and
+         * cache the unpremultiplied result. */
+        uint32_t const_in, const_out;
+
+        /* Diff is the or of all bitwise differences from const_in
+         * during the probe period.  If it is zero after the probe
+         * period then every input pixel was identical in the
+         * probe. */
+        unsigned diff = 0;
+
+        /* Accumulator for all alphas of the probe period pixels,
+         * biased to make the sum zero if every probed pixel is
+         * solid (alpha 255). */
+        unsigned accu = -PROBE_LENGTH*255;
+
+        unsigned j;
+
+        /* The first iteration of the probe period initialises
+         * const_in and const_out. */
+        {
+            uint32_t rgba = const_in = src[i];
+            uint32_t a = (rgba >> ASHIFT) & 255;
+            accu += a;
+            uint32_t r = (rgba >> RSHIFT) & 255;
+            uint32_t g = (rgba >> GSHIFT) & 255;
+            uint32_t b = (rgba >> BSHIFT) & 255;
+            uint32_t recip = reciprocal_table[a];
+            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+            dst[i] = const_out =
+                (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+        }
+
+        for (j = 1; j < PROBE_LENGTH; j++) {
+            if (i + j >= num_pixels)
+                return;
+            uint32_t rgba = src[i+j];
+            diff |= rgba ^ const_in;
+            uint32_t a = (rgba >> ASHIFT) & 255;
+            accu += a;
+            uint32_t r = (rgba >> RSHIFT) & 255;
+            uint32_t g = (rgba >> GSHIFT) & 255;
+            uint32_t b = (rgba >> BSHIFT) & 255;
+            uint32_t recip = reciprocal_table[a];
+            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+            dst[i+j] =
+                (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+        }
+        i += PROBE_LENGTH;
+
+        /* Fall into special cases if we have special
+         * circumstances. */
+        if (0 != (accu & diff)) continue;
+
+        /* The probe may have consumed the last pixel, so the bound is
+         * checked before every read of src[i] in the run loops. */
+        if (0 == accu) { /* a run of solid pixels. */
+            uint32_t in;
+            while (i < num_pixels &&
+                   AMASK == ((in = src[i]) & AMASK)) {
+                dst[i++] = in;
+            }
+        } else if (0 == diff) { /* a run of constant pixels. */
+            while (i < num_pixels && src[i] == const_in) {
+                dst[i++] = const_out;
+            }
+        }
+    }
+}
diff --git a/unpremultiply-lut.c b/unpremultiply-lut.c
new file mode 100644
index 0000000..048eefa
--- /dev/null
+++ b/unpremultiply-lut.c
@@ -0,0 +1,79 @@
+/* An unpremultiplier using a 64KB division table. It specialises
+ * constant runs and solid runs of pixels, but not quite as well as
+ * unpremultiply-inv32 (but it could given some more effort.) It has
+ * the advantage over the others that it can clamp the result into
+ * range without any cost so that the output doesn't overflow should
+ * there be superluminant pixels in the input. It's also reasonably
+ * fast if you have enough L1. */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components. Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+/* Masks are unsigned: 255 << 24 does not fit in a 32 bit signed int. */
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* The function to tabulate in the lookup table. Could be anything
+ * really. */
+#define div_func(a,b) (255*(a) / (b))
+/* Limit x to at most 255. */
+#define clamp(x) ((x) > 255 ? 255 : (x))
+
+/* The entries division_table[b][a] are defined by
+ *
+ * 0 when b = 0
+ * min(255, 255*a / b) when b > 0 (integer division, rounds down)
+ *
+ * Rows are indexed by the divisor b and columns by the dividend a:
+ * each S(b) below emits one row covering a = 0..255.
+ */
+static uint8_t const division_table[256][256] = {
+
+#define R(a,b) ((b) > 0 ? clamp(div_func((a),(b))) : 0)
+#define R1(a,i) R(a, i), R(a+1, i), R(a+2, i), R(a+3, i)
+#define R2(a,i) R1(a, i), R1(a+4, i), R1(a+8, i), R1(a+12, i)
+#define R3(a,i) R2(a, i), R2(a+16, i), R2(a+32, i), R2(a+48, i)
+
+#define S(b) { R3(0, b), R3(64, b), R3(128, b), R3(192, b) }
+#define S1(b) S(b), S(b+1), S(b+2), S(b+3)
+#define S2(b) S1(b), S1(b+4), S1(b+8), S1(b+12)
+#define S3(b) S2(b), S2(b+16), S2(b+32), S2(b+48)
+
+ S3(0), S3(64), S3(128), S3(192)
+};
+
+/* Unpremultiply n pixels from src[] into dst[] by looking each colour
+ * component up in division_table, using the row selected by the
+ * pixel's alpha.  Solid pixels (alpha 255) are copied through, and the
+ * most recently converted pixel is remembered so a repeat of the same
+ * input skips the lookups. */
+void
+unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+    /* One-entry cache: last non-solid input and its result. */
+    uint32_t cached_in = 0;
+    uint32_t cached_out = 0;
+
+    for (size_t k = 0; k < n; k++) {
+        uint32_t const pixel = src[k];
+        uint32_t const alpha = (pixel >> ASHIFT) & 0xFF;
+
+        if (alpha == 255) {
+            dst[k] = pixel;
+            continue;
+        }
+        if (pixel != cached_in) {
+            uint8_t const *row = division_table[alpha];
+            uint32_t const red   = row[(pixel >> RSHIFT) & 0xFF];
+            uint32_t const green = row[(pixel >> GSHIFT) & 0xFF];
+            uint32_t const blue  = row[(pixel >> BSHIFT) & 0xFF];
+            cached_in = pixel;
+            cached_out = (red<<RSHIFT) | (green<<GSHIFT) |
+                         (blue<<BSHIFT) | (alpha<<ASHIFT);
+        }
+        dst[k] = cached_out;
+    }
+}
diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S
index 8ecee17..1ffbf5f 100644
--- a/unpremultiply-sse2.S
+++ b/unpremultiply-sse2.S
@@ -9,6 +9,7 @@
;;; uint32_t const *src,
;;; unsigned long num_pixels);
;;;
+;;; Tested with nasm 2.06rc2.
section .text
; We're only using rax-rbp in this file so that
diff --git a/unpremultiply.c b/unpremultiply.c
index 6d904d7..201a59c 100644
--- a/unpremultiply.c
+++ b/unpremultiply.c
@@ -1,288 +1,42 @@
#define RUN_ME /*
nasm -g -f elf64 unpremultiply-sse2.S
-gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o $0
+CFLAGS="-W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g"
+gcc $CFLAGS -c *.c
+gcc $CFLAGS -o `basename $0 .c` *.o
exit $?
*/
+/* Test driver for unpremultipliers. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-
#include <sys/types.h>
-#include <sys/time.h>
+#include <sys/time.h> /* gettimeofday */
-#if 1
+/* Pixel format config for a 32 bit pixel with 8 bit components. Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
# define ASHIFT 24
-# define RSHIFT 16
-# define GSHIFT 8
-# define BSHIFT 0
-#else
-# define RSHIFT 24
-# define GSHIFT 16
-# define BSHIFT 8
-# define ASHIFT 0
#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
#define AMASK (255 << ASHIFT)
#define RMASK (255 << RSHIFT)
#define GMASK (255 << GSHIFT)
#define BMASK (255 << BSHIFT)
+/* The methods we have available. */
void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_inv32(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_lut(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_div(uint32_t *dst, uint32_t const *src, size_t n);
-static void __attribute__((noinline))
-unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
- uint32_t prev_in = 0;
- uint32_t prev_out = 0;
- size_t i;
-
- for (i=0; i<n; i++) {
- uint32_t rgba = src[i];
- uint32_t a = (rgba & AMASK) >> ASHIFT;
- if (a == 255) {
- dst[i] = rgba;
- continue;
- }
- if (prev_in == rgba) {
- dst[i] = prev_out;
- continue;
- }
- if (a) {
- uint32_t r = (rgba >> RSHIFT) & 0xFF;
- uint32_t g = (rgba >> GSHIFT) & 0xFF;
- uint32_t b = (rgba >> BSHIFT) & 0xFF;
- r = r*255 / a;
- g = g*255 / a;
- b = b*255 / a;
- assert(r < 256);
- assert(g < 256);
- assert(b < 256);
- assert(a < 256);
- prev_in = rgba;
- prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
- } else {
- prev_in = prev_out = 0;
- }
- dst[i] = prev_out;
- }
-}
-
-static uint8_t division_table[65536];
-
-static void __attribute__((noinline))
-unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
- size_t i;
- uint32_t prev_in = 0;
- uint32_t prev_out = 0;
- for (i=0; i<n; i++) {
- uint32_t rgba = src[i];
- uint32_t a = (rgba >> ASHIFT) & 0xFF;
- if (a == 255) {
- dst[i] = rgba;
- }
- else if (prev_in == rgba) {
- dst[i] = prev_out;
- }
- else {
- uint32_t r = (rgba >> RSHIFT) & 0xFF;
- uint32_t g = (rgba >> GSHIFT) & 0xFF;
- uint32_t b = (rgba >> BSHIFT) & 0xFF;
- r = division_table[a*256 + r];
- g = division_table[a*256 + g];
- b = division_table[a*256 + b];
-
- prev_in = rgba;
- prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
- dst[i] = prev_out;
- }
- }
-}
-
-#define RECIPROCAL_BITS 8
-
-static uint32_t reciprocal_table_A[256];
-static uint64_t reciprocal_table_B[256];
-
-#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
-
-static void __attribute__((noinline))
-unpremultiply_with_inv32(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
- size_t i;
- uint32_t prev_in = 0;
- uint32_t prev_out = 0;
- for (i=0; i<n; i++) {
- uint32_t rgba = src[i];
- uint32_t a = (rgba >> ASHIFT) & 255;
- if (a == 255) {
- dst[i] = rgba;
- }
- else if (prev_in == rgba) {
- dst[i] = prev_out;
- }
- else {
- prev_in = rgba;
- uint32_t r = (rgba >> RSHIFT) & 255;
- uint32_t g = (rgba >> GSHIFT) & 255;
- uint32_t b = (rgba >> BSHIFT) & 255;
- uint32_t recip = reciprocal_table_A[a];
- r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
- g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
- b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
- prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
- dst[i] = prev_out;
- }
- }
-}
-
-#define INNER_UNROLL (2)
-
-static void __attribute__((noinline))
-unpremultiply_with_inv32_bis(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
- for (size_t i = 0; i < n; ) {
- uint32_t prev_in, prev_out;
- /* 0 iff all inputs are the same */
- unsigned delta = 0;
- /* adds to 0 if all inputs are opaque */
- unsigned opaque_count = - INNER_UNROLL*255;
-
- {
- uint32_t rgba = prev_in = src[i];
- uint32_t a = (rgba >> ASHIFT) & 255;
- opaque_count += a;
- uint32_t r = (rgba >> RSHIFT) & 255;
- uint32_t g = (rgba >> GSHIFT) & 255;
- uint32_t b = (rgba >> BSHIFT) & 255;
- uint32_t recip = reciprocal_table_A[a];
- r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
- g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
- b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
- dst[i] = prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
- }
-
- /* UNROLL, bitch! */
- for (unsigned j = 1; j < INNER_UNROLL; j++) {
- if (i+j>=n) return;
- uint32_t rgba = src[i+j];
- delta |= rgba ^ prev_in;
- uint32_t a = (rgba >> ASHIFT) & 255;
- opaque_count += a;
- uint32_t r = (rgba >> RSHIFT) & 255;
- uint32_t g = (rgba >> GSHIFT) & 255;
- uint32_t b = (rgba >> BSHIFT) & 255;
- uint32_t recip = reciprocal_table_A[a];
- r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
- g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
- b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
- dst[i+j] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
- }
-
- i += INNER_UNROLL;
-
- /* switch to special case after k of them
- * + minimize cost when unapplicable
- */
-
- if (0 != (delta & opaque_count)) continue;
-
- if (0 == opaque_count) {
- uint32_t in;
- while ((255U << ASHIFT) == ((in = src[i]) & (255U << ASHIFT))) {
- dst[i++] = in;
- if (i >= n) return;
- }
- } else if (0 == delta) {
- while (src[i] == prev_in) {
- dst[i++] = prev_out;
- if (i >= n) return;
- }
- }
- }
-}
-#undef INNER_UNROLL
-
-static void __attribute__((noinline))
-unpremultiply_with_inv32_nocache(
- uint32_t * restrict dst,
- uint32_t const * restrict src,
- size_t n)
-{
- size_t i;
- for (i=0; i<n; i++) {
- uint32_t rgba = src[i];
- uint32_t a = (rgba >> ASHIFT) & 255;
- uint32_t r = (rgba >> RSHIFT) & 255;
- uint32_t g = (rgba >> GSHIFT) & 255;
- uint32_t b = (rgba >> BSHIFT) & 255;
- uint32_t recip = reciprocal_table_A[a];
- r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
- g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
- b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
- dst[i] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
- }
-}
-
-static void __attribute__((noinline))
-unpremultiply_with_inv64(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
- size_t i;
- uint32_t prev_in = 0;
- uint32_t prev_out = 0;
- for (i=0; i<n; i++) {
- uint32_t rgba = src[i];
- uint32_t a = (rgba >> ASHIFT) & 255;
- if (a == 255) {
- dst[i] = rgba;
- }
- else if (prev_in == rgba) {
- dst[i] = prev_out;
- }
- else {
- prev_in = rgba;
- uint64_t r = rgba & RMASK;
- uint64_t g = rgba & GMASK;
- uint64_t b = rgba & BMASK;
- uint64_t recip = reciprocal_table_B[a];
- r = r * recip;
- g = g * recip;
- b = b * recip;
- prev_out = (rgba & AMASK) |
- (((r & ((uint64_t)RMASK << 32)) |
- (g & ((uint64_t)GMASK << 32)) |
- (b & ((uint64_t)BMASK << 32))) >> 32);
- dst[i] = prev_out;
- }
- }
-}
-
-static void __attribute__((noinline))
-unpremultiply_with_inv64_nocache(
- uint32_t * restrict dst,
- uint32_t const * restrict src,
- size_t n)
-{
- size_t i;
- for (i=0; i<n; i++) {
- uint32_t rgba = src[i];
- uint32_t a = (rgba & AMASK) >> ASHIFT;
- uint64_t r = rgba & RMASK;
- uint64_t g = rgba & GMASK;
- uint64_t b = rgba & BMASK;
- uint64_t recip = reciprocal_table_B[a];
- r = r * recip;
- g = g * recip;
- b = b * recip;
- dst[i] = (rgba & AMASK) |
- (((r & ((uint64_t)RMASK << 32)) |
- (g & ((uint64_t)GMASK << 32)) |
- (b & ((uint64_t)BMASK << 32))) >> 32);
- }
-}
-
+/*
+ * Some non-functional methods to gauge the computation overhead.
+ */
static void __attribute__((noinline))
unpremultiply_with_memcpy(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
{
@@ -309,47 +63,9 @@ unpremultiply_with_read(uint32_t * restrict dst, uint32_t const * restrict src,
read_sum = sum;
}
-static void
-make_division_table()
-{
- unsigned a;
- for (a=1; a<256; a++) {
- unsigned x;
- for (x=0; x<256; x++) {
- unsigned y = x*255/a;
- y = y < 255 ? y : 255;
- division_table[a*256 + x] = y;
- }
- }
- for (a=0; a<256; a++) {
- division_table[a] = 0;
- }
-}
-
-static void
-make_reciprocal_table_A()
-{
- unsigned a;
- reciprocal_table_A[0] = 0;
- for (a=1; a<256; a++) {
- uint32_t r = 255U*(1<<RECIPROCAL_BITS) / a;
- while ((a*r) >> RECIPROCAL_BITS < 255) r++;
- reciprocal_table_A[a] = r;
- }
-}
-
-static void
-make_reciprocal_table_B()
-{
- unsigned a;
- reciprocal_table_B[0] = 0;
- for (a=1; a<256; a++) {
- uint64_t r = 255*(1ULL<<32)/a;
- while ((a*r) >> 32 < 255) r++;
- reciprocal_table_B[a] = r;
- }
-}
-
+/* Make sure we don't have superluminant pixels by clamping the colour
+ * components of n pixels at buf[] to be less than or equal to the
+ * pixels' alpha values. */
static void
saturate(void *buf, size_t n)
{
@@ -361,9 +77,9 @@ saturate(void *buf, size_t n)
* about the fact that it's accessing uint32_t values. The
* confusion is currently enough to make it doubt the
* alignment of the buf pointer and not let the vectoriser at
- * this loop. The new vectoriser aggressively uses aligned
- * accesses, and sometimes this loop is called with unaligned
- * addresses. */
+ * this loop. The new vectoriser aggressively depends on naturally
+ * aligned pointers, but sometimes we want to test with unaligned
+ * addresses too. */
union {
uint32_t u32;
uint8_t u8[4];
@@ -393,6 +109,9 @@ saturate(void *buf, size_t n)
}
}
+/*
+ * Methods to create pixel patterns to test.
+ */
static void
fill_random(uint32_t *buf, size_t n)
{
@@ -428,6 +147,10 @@ fill_empty(void *buf, size_t n)
memset(buf, 0, 4*n);
}
+/*
+ * Main tester.
+ */
+
static long
getenvlong(char const *name, long default_value)
{
@@ -446,10 +169,9 @@ now_ms()
int
main(int argc, char **argv)
{
- long nloops = getenvlong("loops", 50);
+ long nloops = getenvlong("loops", 500);
size_t n = getenvlong("pixels", 2*1024*1024);
- long offset = getenvlong("offset", 0);
- /* non-zero is liable to segfault due to gcc alignment breakage */
+ long offset = getenvlong("offset", 0); /* byte offset */
union {
uint32_t *u32;
char *u8;
@@ -465,10 +187,6 @@ main(int argc, char **argv)
#define dst udst.u32
#define src usrc.u32
- make_division_table();
- make_reciprocal_table_A();
- make_reciprocal_table_B();
-
for (i=1; i<argc; i++) {
if (0 == strcmp(argv[i], "verify")) {
verify = 1;
@@ -491,10 +209,6 @@ main(int argc, char **argv)
else if (0 == strcmp(argv[i], "div") ||
0 == strcmp(argv[i], "lut") ||
0 == strcmp(argv[i], "inv32") ||
- 0 == strcmp(argv[i], "inv32-bis") ||
- 0 == strcmp(argv[i], "inv64") ||
- 0 == strcmp(argv[i], "inv32-nocache") ||
- 0 == strcmp(argv[i], "inv64-nocache") ||
0 == strcmp(argv[i], "sse2") ||
0 == strcmp(argv[i], "copy") ||
0 == strcmp(argv[i], "read") ||
@@ -508,7 +222,7 @@ main(int argc, char **argv)
return 1;
}
}
- saturate(src, n);
+ saturate(src, n); /* don't deal with superluminant pixels. */
if (verify) {
ref = malloc(n*4);
@@ -531,24 +245,9 @@ main(int argc, char **argv)
unpremultiply_with_inv32(dst, src, n);
}
}
- else if (0 == strcmp(method, "inv32-bis")) {
- while (nloops-- > 0) {
- unpremultiply_with_inv32_bis(dst, src, n);
- }
- }
- else if (0 == strcmp(method, "inv64")) {
- while (nloops-- > 0) {
- unpremultiply_with_inv64(dst, src, n);
- }
- }
- else if (0 == strcmp(method, "inv32-nocache")) {
- while (nloops-- > 0) {
- unpremultiply_with_inv32_nocache(dst, src, n);
- }
- }
- else if (0 == strcmp(method, "inv64-nocache")) {
+ else if (0 == strcmp(method, "sse2")) {
while (nloops-- > 0) {
- unpremultiply_with_inv64_nocache(dst, src, n);
+ unpremultiply_with_sse2(dst, src, n);
}
}
else if (0 == strcmp(method, "copy")) {
@@ -566,11 +265,6 @@ main(int argc, char **argv)
unpremultiply_with_read(dst, src, n);
}
}
- else if (0 == strcmp(method, "sse2")) {
- while (nloops-- > 0) {
- unpremultiply_with_sse2(dst, src, n);
- }
- }
else if (0 == strcmp(method, "noop")) {
/* do nothing. */
} else {