Separate unpremultiplier methods, clean them up, and kill the bad ones.

author: M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> 2009-01-13 19:33:06 +0200
committer: M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> 2009-01-13 19:33:06 +0200
commit: b264fbcbe6f12d9f25a57f13f3747e2a51b2c2d9 (patch)
tree: 8e84d115d76e50518de0f9d0fcd890815d3e5a63
parent: a30c4f05e916db2b04613dc22e357ab63235a776 (diff)
5 files changed, 297 insertions, 342 deletions
diff --git a/unpremultiply-div.c b/unpremultiply-div.c
new file mode 100644
index 0000000..514aa28
--- /dev/null
+++ b/unpremultiply-div.c
@@ -0,0 +1,56 @@
+/* Reference implementation using divisions.  Since the slow path is
+ * so very slow this version specialises runs of constant or solid
+ * pixels. */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components.  Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+
+#define AMASK (255U << ASHIFT)	/* unsigned: 255 << 24 overflows int */
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* Write the unpremultiplied versions of the n pixels at src[] to dst[]:
+ * each colour component c becomes c*255/alpha rounded down.  Solid pixels
+ * are copied and alpha 0 gives 0.  The last divided pixel is cached so
+ * constant runs skip the divisions.  Components are not clamped. */
+void
+unpremultiply_with_div(
+    uint32_t * restrict       dst,
+    uint32_t const * restrict src,
+    size_t                    n)
+{
+    uint32_t prev_in = 0;
+    uint32_t prev_out = 0;
+
+    for (size_t i = 0; i < n; i++) {
+	uint32_t rgba = src[i];
+	uint32_t a = (rgba & AMASK) >> ASHIFT;
+	if (a == 255) {			/* solid: nothing to undo */
+	    dst[i] = rgba;
+	    continue;
+	}
+	if (prev_in == rgba) {		/* same pixel as the cached one */
+	    dst[i] = prev_out;
+	    continue;
+	}
+	if (a) {
+	    uint32_t r = ((rgba >> RSHIFT) & 0xFF) * 255 / a;
+	    uint32_t g = ((rgba >> GSHIFT) & 0xFF) * 255 / a;
+	    uint32_t b = ((rgba >> BSHIFT) & 0xFF) * 255 / a;
+	    prev_in = rgba;
+	    prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+	} else {			/* alpha 0: avoid dividing by zero */
+	    prev_in = prev_out = 0;
+	}
+	dst[i] = prev_out;
+    }
+}
diff --git a/unpremultiply-inv32.c b/unpremultiply-inv32.c
new file mode 100644
index 0000000..e9505de
--- /dev/null
+++ b/unpremultiply-inv32.c
@@ -0,0 +1,125 @@
+/* An unpremultiplier using reciprocal multiplication.  It specialises
+ * constant runs and solid runs of pixels with low overhead loops and
+ * uses only a 1KB table of reciprocals. */
+/* gcc -c -W -Wall -O3 -funroll-all-loops -fomit-frame-pointer -std=c99 unpremultiply-inv32.c */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components.  Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* Shift x left by y bits.  Supports negative y for right shifts. */
+#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
+
+#define ceil_div(a,b) (((a) + (b)-1) / (b))	/* a/b rounded up */
+
+/* The reciprocal_table[i] entries are defined by
+ *
+ *      0               when i = 0
+ *      255 / i         when i > 0
+ *
+ * represented in fixed point format with RECIPROCAL_BITS of
+ * precision and errors rounded up. */
+#define RECIPROCAL_BITS 16
+static uint32_t const reciprocal_table[256] = {
+# define R(i)  ((i) ? ceil_div(255*(1<<RECIPROCAL_BITS), (i)) : 0)
+# define R1(i) R(i),  R(i+1),   R(i+2),   R(i+3)
+# define R2(i) R1(i), R1(i+4),  R1(i+8),  R1(i+12)
+# define R3(i) R2(i), R2(i+16), R2(i+32), R2(i+48)
+               R3(0), R3(64),   R3(128),  R3(192)
+};
+
+/* Transfer num_pixels unpremultiplied pixels from src[] to dst[].
+ * This version uses a short probe period of a few pixels to identify
+ * runs of constant or solid pixels.  When a run is identified it
+ * falls into a special case loop for the duration of the run. */
+void
+unpremultiply_with_inv32(
+    uint32_t       * restrict dst,
+    uint32_t const * restrict src,
+    size_t                    num_pixels)
+{
+#define PROBE_LENGTH 2
+
+    for (size_t i = 0; i < num_pixels; ) {
+        /* We want to identify long runs of constant input pixels and
+         * cache the unpremultiplied result. */
+        uint32_t const_in, const_out;
+
+        /* Diff is the or of all bitwise differences from const_in
+         * during the probe period.  If it is zero after the probe
+         * period then every input pixel was identical in the
+         * probe. */
+        unsigned diff = 0;
+
+        /* Accumulator for all alphas of the probe period pixels,
+         * biased to make the sum zero if they are all solid. */
+        unsigned accu = -PROBE_LENGTH*255;
+
+        /* The first iteration of the probe period initialises const_in. */
+        {
+            uint32_t rgba = const_in = src[i];
+            uint32_t a = (rgba >> ASHIFT) & 255;
+            accu += a;
+            uint32_t r = (rgba >> RSHIFT) & 255;
+            uint32_t g = (rgba >> GSHIFT) & 255;
+            uint32_t b = (rgba >> BSHIFT) & 255;
+            uint32_t recip = reciprocal_table[a];
+            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+            dst[i] = const_out =
+                (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+        }
+
+        for (unsigned j = 1; j < PROBE_LENGTH; j++) {
+            if (i + j >= num_pixels)
+                return;
+            uint32_t rgba = src[i+j];
+            diff |= rgba ^ const_in;
+            uint32_t a = (rgba >> ASHIFT) & 255;
+            accu += a;
+            uint32_t r = (rgba >> RSHIFT) & 255;
+            uint32_t g = (rgba >> GSHIFT) & 255;
+            uint32_t b = (rgba >> BSHIFT) & 255;
+            uint32_t recip = reciprocal_table[a];
+            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+            dst[i+j] =
+                (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+        }
+        i += PROBE_LENGTH;
+
+        /* The probe may have consumed the last pixels, and the run loops
+         * below read src[i] before testing i against num_pixels. */
+        if (i >= num_pixels) return;
+
+        /* Fall into special cases if we have special circumstances. */
+        if (0 != (accu & diff)) continue;
+
+        if (0 == accu) {	/* a run of solid pixels. */
+            uint32_t in;
+            while (AMASK == ((in = src[i]) & AMASK)) {
+                dst[i++] = in;
+                if (i >= num_pixels) return;
+            }
+        } else if (0 == diff) {	/* a run of constant pixels. */
+            while (src[i] == const_in) {
+                dst[i++] = const_out;
+                if (i >= num_pixels) return;
+            }
+        }
+    }
+}
diff --git a/unpremultiply-lut.c b/unpremultiply-lut.c
new file mode 100644
index 0000000..048eefa
--- /dev/null
+++ b/unpremultiply-lut.c
@@ -0,0 +1,79 @@
+/* An unpremultiplier using a 64KB division table.  It specialises
+ * constant runs and solid runs of pixels, but not quite as well as
+ * unpremultiply-inv32 (but it could given some more effort.)  It has
+ * the advantage over the others that it can clamp the result into
+ * range without any cost so that the output doesn't overflow should
+ * there be superluminant pixels in the input.  It's also reasonably
+ * fast if you have enough L1. */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components.  Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+
+#define AMASK (255U << ASHIFT)	/* unsigned: 255 << 24 overflows int */
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* The function to tabulate in the lookup table.  Could be anything
+ * really. */
+#define div_func(a,b)	(255*(a) / (b))
+
+#define clamp(x)	((x) > 255 ? 255 : (x))
+
+/* The entries division_table[b][a], indexed by divisor b then
+ * dividend a, are defined by
+ *   0					when b = 0
+ *   min(255, floor(255*a / b))	when b > 0
+ */
+static uint8_t const division_table[256][256] = {
+
+#define R(a,b)		((b) > 0 ? clamp(div_func((a),(b))) : 0)
+#define R1(a,i)		R(a, i),  R(a+1, i),   R(a+2, i),   R(a+3, i)
+#define R2(a,i)		R1(a, i), R1(a+4, i),  R1(a+8, i),  R1(a+12, i)
+#define R3(a,i)		R2(a, i), R2(a+16, i), R2(a+32, i), R2(a+48, i)
+
+#define S(b)	      { R3(0, b), R3(64, b),   R3(128, b),  R3(192, b) }
+#define S1(b)		S(b),	  S(b+1),      S(b+2),      S(b+3)
+#define S2(b)		S1(b),	  S1(b+4),     S1(b+8),	    S1(b+12)
+#define S3(b)		S2(b),	  S2(b+16),    S2(b+32),    S2(b+48)
+
+			S3(0),	  S3(64),      S3(128),	    S3(192)
+};
+
+/* Write the unpremultiplied versions of the n pixels at src[] to dst[].
+ * Solid pixels are copied; others have each colour component looked up
+ * in division_table[alpha][component], so results are clamped to 255.
+ * The last looked-up pixel is cached to shortcut constant runs. */
+void
+unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+    size_t i;
+    uint32_t prev_in = 0;
+    uint32_t prev_out = 0;
+    for (i=0; i<n; i++) {
+	uint32_t rgba = src[i];
+	uint32_t a = (rgba >> ASHIFT) & 0xFF;
+	if (a == 255) {
+	    dst[i] = rgba;
+	}
+	else if (prev_in == rgba) {
+	    dst[i] = prev_out;
+	}
+	else {
+	    uint32_t r = division_table[a][(rgba >> RSHIFT) & 0xFF];
+	    uint32_t g = division_table[a][(rgba >> GSHIFT) & 0xFF];
+	    uint32_t b = division_table[a][(rgba >> BSHIFT) & 0xFF];
+	    prev_in = rgba;
+	    prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+	    dst[i] = prev_out;
+	}
+    }
+}
diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S
index 8ecee17..1ffbf5f 100644
--- a/unpremultiply-sse2.S
+++ b/unpremultiply-sse2.S
@@ -9,6 +9,7 @@
 ;;;		uint32_t const  *src,
 ;;;		unsigned long    num_pixels);
 ;;;
+;;; Tested with nasm 2.06rc2.
 	section .text
 
 ; We're only using rax-rbp in this file so that
diff --git a/unpremultiply.c b/unpremultiply.c
index 6d904d7..201a59c 100644
--- a/unpremultiply.c
+++ b/unpremultiply.c
@@ -1,288 +1,42 @@
 #define RUN_ME /*
 nasm -g -f elf64 unpremultiply-sse2.S
-gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o $0
+CFLAGS="-W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g"
+gcc $CFLAGS -c *.c
+gcc $CFLAGS -o `basename $0 .c` *.o
 exit $?
 */
+/* Test driver for unpremultipliers. */
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include <sys/types.h>
-#include <sys/time.h>
+#include <sys/time.h>		/* gettimeofday */
 
-#if 1
+/* Pixel format config for a 32 bit pixel with 8 bit components.  Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
 # define ASHIFT 24
-# define RSHIFT 16
-# define GSHIFT  8
-# define BSHIFT  0
-#else
-# define RSHIFT 24
-# define GSHIFT 16
-# define BSHIFT  8
-# define ASHIFT  0
 #endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
 
 #define AMASK (255 << ASHIFT)
 #define RMASK (255 << RSHIFT)
 #define GMASK (255 << GSHIFT)
 #define BMASK (255 << BSHIFT)
 
+/* The methods we have available. */
 void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_inv32(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_lut(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_div(uint32_t *dst, uint32_t const *src, size_t n);
 
-static void __attribute__((noinline))
-unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
-    uint32_t prev_in = 0;
-    uint32_t prev_out = 0;
-    size_t i;
-
-    for (i=0; i<n; i++) {
-	uint32_t rgba = src[i];
-	uint32_t a = (rgba & AMASK) >> ASHIFT;
-	if (a == 255) {
-	    dst[i] = rgba;
-	    continue;
-	}
-	if (prev_in == rgba) {
-	    dst[i] = prev_out;
-	    continue;
-	}
-	if (a) {
-	    uint32_t r = (rgba >> RSHIFT) & 0xFF;
-	    uint32_t g = (rgba >> GSHIFT) & 0xFF;
-	    uint32_t b = (rgba >> BSHIFT) & 0xFF;
-	    r = r*255 / a;
-	    g = g*255 / a;
-	    b = b*255 / a;
-	    assert(r < 256);
-	    assert(g < 256);
-	    assert(b < 256);
-	    assert(a < 256);
-	    prev_in = rgba;
-	    prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
-	} else {
-	    prev_in = prev_out = 0;
-	}
-	dst[i] = prev_out;
-    }
-}
-
-static uint8_t division_table[65536];
-
-static void __attribute__((noinline))
-unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
-    size_t i;
-    uint32_t prev_in = 0;
-    uint32_t prev_out = 0;
-    for (i=0; i<n; i++) {
-	uint32_t rgba = src[i];
-	uint32_t a = (rgba >> ASHIFT) & 0xFF;
-	if (a == 255) {
-	    dst[i] = rgba;
-	}
-	else if (prev_in == rgba) {
-	    dst[i] = prev_out;
-	}
-	else {
-	    uint32_t r = (rgba >> RSHIFT) & 0xFF;
-	    uint32_t g = (rgba >> GSHIFT) & 0xFF;
-	    uint32_t b = (rgba >> BSHIFT) & 0xFF;
-	    r = division_table[a*256 + r];
-	    g = division_table[a*256 + g];
-	    b = division_table[a*256 + b];
-
-	    prev_in = rgba;
-	    prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
-	    dst[i] = prev_out;
-	}
-    }
-}
-
-#define RECIPROCAL_BITS 8
-
-static uint32_t reciprocal_table_A[256];
-static uint64_t reciprocal_table_B[256];
-
-#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
-
-static void __attribute__((noinline))
-unpremultiply_with_inv32(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
-    size_t i;
-    uint32_t prev_in = 0;
-    uint32_t prev_out = 0;
-    for (i=0; i<n; i++) {
-	uint32_t rgba = src[i];
-	uint32_t a = (rgba >> ASHIFT) & 255;
-	if (a == 255) {
-	    dst[i] = rgba;
-	}
- 	else if (prev_in == rgba) {
- 	    dst[i] = prev_out;
- 	}
- 	else { 
- 	    prev_in = rgba;
-	    uint32_t r = (rgba >> RSHIFT) & 255;
-	    uint32_t g = (rgba >> GSHIFT) & 255;
-	    uint32_t b = (rgba >> BSHIFT) & 255;
-	    uint32_t recip = reciprocal_table_A[a];
-	    r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
-	    g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
-	    b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
-	    prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
-	    dst[i] = prev_out;
-	}
-    }
-}
-
-#define INNER_UNROLL (2)
-
-static void __attribute__((noinline))
-unpremultiply_with_inv32_bis(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
-    for (size_t i = 0; i < n; ) {
-        uint32_t prev_in, prev_out;
-        /* 0 iff all inputs are the same */
-        unsigned delta = 0; 
-        /* adds to 0 if all inputs are opaque */
-        unsigned opaque_count = - INNER_UNROLL*255;
-
-        {
-            uint32_t rgba = prev_in = src[i];
-            uint32_t a = (rgba >> ASHIFT) & 255;
-            opaque_count += a;
-            uint32_t r = (rgba >> RSHIFT) & 255;
-            uint32_t g = (rgba >> GSHIFT) & 255;
-            uint32_t b = (rgba >> BSHIFT) & 255;
-            uint32_t recip = reciprocal_table_A[a];
-            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
-            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
-            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
-            dst[i] = prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
-        }
-
-        /* UNROLL, bitch! */
-        for (unsigned j = 1; j < INNER_UNROLL; j++) {
-            if (i+j>=n) return;
-            uint32_t rgba = src[i+j];
-            delta |= rgba ^ prev_in;
-            uint32_t a = (rgba >> ASHIFT) & 255;
-            opaque_count += a;
-            uint32_t r = (rgba >> RSHIFT) & 255;
-            uint32_t g = (rgba >> GSHIFT) & 255;
-            uint32_t b = (rgba >> BSHIFT) & 255;
-            uint32_t recip = reciprocal_table_A[a];
-            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
-            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
-            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
-            dst[i+j] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
-        }
-
-        i += INNER_UNROLL;
-
-        /* switch to special case after k of them
-         * + minimize cost when unapplicable
-         */
-
-        if (0 != (delta & opaque_count)) continue;
-
-        if (0 == opaque_count) {
-            uint32_t in;
-            while ((255U << ASHIFT) == ((in = src[i]) & (255U << ASHIFT))) {
-                dst[i++] = in;
-                if (i >= n) return;
-            }
-        } else if (0 == delta) {
-            while (src[i] == prev_in) {
-                dst[i++] = prev_out;
-                if (i >= n) return;
-            }
-        }
-    }
-}
-#undef INNER_UNROLL
-
-static void __attribute__((noinline))
-unpremultiply_with_inv32_nocache(
-    uint32_t * restrict dst,
-    uint32_t const * restrict src,
-    size_t n)
-{
-    size_t i;
-    for (i=0; i<n; i++) {
-	uint32_t rgba = src[i];
-	uint32_t a = (rgba >> ASHIFT) & 255;
-	uint32_t r = (rgba >> RSHIFT) & 255;
-	uint32_t g = (rgba >> GSHIFT) & 255;
-	uint32_t b = (rgba >> BSHIFT) & 255;
-	uint32_t recip = reciprocal_table_A[a];
-	r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
-	g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
-	b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
-	dst[i] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
-    }
-}
-
-static void __attribute__((noinline))
-unpremultiply_with_inv64(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
-{
-    size_t i;
-    uint32_t prev_in = 0;
-    uint32_t prev_out = 0;
-    for (i=0; i<n; i++) {
-	uint32_t rgba = src[i];
-	uint32_t a = (rgba >> ASHIFT) & 255;
-	if (a == 255) {
-	    dst[i] = rgba;
-	}
- 	else if (prev_in == rgba) {
- 	    dst[i] = prev_out;
- 	}
- 	else { 
- 	    prev_in = rgba;
-	    uint64_t r = rgba & RMASK;
-	    uint64_t g = rgba & GMASK;
-	    uint64_t b = rgba & BMASK;
-	    uint64_t recip = reciprocal_table_B[a];
-	    r = r * recip;
-	    g = g * recip;
-	    b = b * recip;
-	    prev_out = (rgba & AMASK) |
-		(((r & ((uint64_t)RMASK << 32)) |
-		  (g & ((uint64_t)GMASK << 32)) |
-		  (b & ((uint64_t)BMASK << 32))) >> 32);
-	    dst[i] = prev_out;
-	}
-    }
-}
-
-static void __attribute__((noinline))
-unpremultiply_with_inv64_nocache(
-    uint32_t * restrict dst,
-    uint32_t const * restrict src,
-    size_t n)
-{
-    size_t i;
-    for (i=0; i<n; i++) {
-	uint32_t rgba = src[i];
-	uint32_t a = (rgba & AMASK) >> ASHIFT;
-	uint64_t r = rgba & RMASK;
-	uint64_t g = rgba & GMASK;
-	uint64_t b = rgba & BMASK;
-	uint64_t recip = reciprocal_table_B[a];
-	r = r * recip;
-	g = g * recip;
-	b = b * recip;
-	dst[i] = (rgba & AMASK) |
-	    (((r & ((uint64_t)RMASK << 32)) |
-	      (g & ((uint64_t)GMASK << 32)) |
-	      (b & ((uint64_t)BMASK << 32))) >> 32);
-    }
-}
-
+/*
+ * Some non-functional methods to gauge the computation overhead.
+ */
 static void __attribute__((noinline))
 unpremultiply_with_memcpy(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
 {
@@ -309,47 +63,9 @@ unpremultiply_with_read(uint32_t * restrict dst, uint32_t const * restrict src,
     read_sum = sum;
 }
 
-static void
-make_division_table()
-{
-    unsigned a;
-    for (a=1; a<256; a++) {
-	unsigned x;
-	for (x=0; x<256; x++) {
-	    unsigned y = x*255/a;
-	    y = y < 255 ? y : 255;
-	    division_table[a*256 + x] = y;
-	}
-    }
-    for (a=0; a<256; a++) {
-	division_table[a] = 0;
-    }
-}
-
-static void
-make_reciprocal_table_A()
-{
-    unsigned a;
-    reciprocal_table_A[0] = 0;
-    for (a=1; a<256; a++) {
-	uint32_t r = 255U*(1<<RECIPROCAL_BITS) / a;
-	while ((a*r) >> RECIPROCAL_BITS < 255) r++;
-	reciprocal_table_A[a] = r;
-    }
-}
-
-static void
-make_reciprocal_table_B()
-{
-    unsigned a;
-    reciprocal_table_B[0] = 0;
-    for (a=1; a<256; a++) {
-	uint64_t r = 255*(1ULL<<32)/a;
-	while ((a*r) >> 32 < 255) r++;
-	reciprocal_table_B[a] = r;
-    }
-}
-
+/* Make sure we don't have superluminant pixels by clamping the colour
+ * components of n pixels at buf[] to be less than or equal to the
+ * pixels' alpha values. */
 static void
 saturate(void *buf, size_t n)
 {
@@ -361,9 +77,9 @@ saturate(void *buf, size_t n)
 	 * about the fact that it's accessing uint32_t values.  The
 	 * confusion is currently enough to make it doubt the
 	 * alignment of the buf pointer and not let the vectoriser at
-	 * this loop.  The new vectoriser aggressively uses aligned
-	 * accesses, and sometimes this loop is called with unaligned
-	 * addresses. */
+	 * this loop.  The new vectoriser aggressively depends on naturally
+	 * aligned pointers, but sometimes we want to test with unaligned
+	 * addresses too. */
 	union {
 	    uint32_t u32;
 	    uint8_t u8[4];
@@ -393,6 +109,9 @@ saturate(void *buf, size_t n)
     }
 }
 
+/*
+ * Methods to create pixel patterns to test.
+ */
 static void
 fill_random(uint32_t *buf, size_t n)
 {
@@ -428,6 +147,10 @@ fill_empty(void *buf, size_t n)
     memset(buf, 0, 4*n);
 }
 
+/*
+ * Main tester.
+ */
+
 static long
 getenvlong(char const *name, long default_value)
 {
@@ -446,10 +169,9 @@ now_ms()
 int
 main(int argc, char **argv)
 {
-    long nloops = getenvlong("loops", 50);
+    long nloops = getenvlong("loops", 500);
     size_t n = getenvlong("pixels", 2*1024*1024);
-    long offset = getenvlong("offset", 0);
-    /* non-zero is liable to segfault due to gcc alignment breakage */
+    long offset = getenvlong("offset", 0); /* byte offset */
     union {
 	uint32_t *u32;
 	char *u8;
@@ -465,10 +187,6 @@ main(int argc, char **argv)
 #define dst udst.u32
 #define src usrc.u32
 
-    make_division_table();
-    make_reciprocal_table_A();
-    make_reciprocal_table_B();
-
     for (i=1; i<argc; i++) {
 	if (0 == strcmp(argv[i], "verify")) {
 	    verify = 1;
@@ -491,10 +209,6 @@ main(int argc, char **argv)
 	else if (0 == strcmp(argv[i], "div") ||
 		 0 == strcmp(argv[i], "lut") ||
 		 0 == strcmp(argv[i], "inv32") ||
-		 0 == strcmp(argv[i], "inv32-bis") ||
-		 0 == strcmp(argv[i], "inv64") ||
-		 0 == strcmp(argv[i], "inv32-nocache") ||
-		 0 == strcmp(argv[i], "inv64-nocache") ||
 		 0 == strcmp(argv[i], "sse2") ||
 		 0 == strcmp(argv[i], "copy") ||
 		 0 == strcmp(argv[i], "read") ||
@@ -508,7 +222,7 @@ main(int argc, char **argv)
 	    return 1;
 	}
     }
-    saturate(src, n);
+    saturate(src, n);	   /* don't deal with superluminant pixels. */
 
     if (verify) {
 	ref = malloc(n*4);
@@ -531,24 +245,9 @@ main(int argc, char **argv)
 	    unpremultiply_with_inv32(dst, src, n);
 	}
     }
-    else if (0 == strcmp(method, "inv32-bis")) {
-	while (nloops-- > 0) {
-	    unpremultiply_with_inv32_bis(dst, src, n);
-	}
-    }
-    else if (0 == strcmp(method, "inv64")) {
-	while (nloops-- > 0) {
-	    unpremultiply_with_inv64(dst, src, n);
-	}
-    }
-    else if (0 == strcmp(method, "inv32-nocache")) {
-	while (nloops-- > 0) {
-	    unpremultiply_with_inv32_nocache(dst, src, n);
-	}
-    }
-    else if (0 == strcmp(method, "inv64-nocache")) {
+    else if (0 == strcmp(method, "sse2")) {
 	while (nloops-- > 0) {
-	    unpremultiply_with_inv64_nocache(dst, src, n);
+	    unpremultiply_with_sse2(dst, src, n);
 	}
     }
     else if (0 == strcmp(method, "copy")) {
@@ -566,11 +265,6 @@ main(int argc, char **argv)
 	    unpremultiply_with_read(dst, src, n);
 	}
     }
-    else if (0 == strcmp(method, "sse2")) {
-	while (nloops-- > 0) {
-	    unpremultiply_with_sse2(dst, src, n);
-	}
-    }
     else if (0 == strcmp(method, "noop")) {
 	/* do nothing. */
     } else {
author	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-13 19:33:06 +0200
committer	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-13 19:33:06 +0200
commit	b264fbcbe6f12d9f25a57f13f3747e2a51b2c2d9 (patch)
tree	8e84d115d76e50518de0f9d0fcd890815d3e5a63
parent	a30c4f05e916db2b04613dc22e357ab63235a776 (diff)