Separate special cased unpremultipliers from just the basic method.

author: M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> 2009-01-14 18:12:55 +0200
committer: M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> 2009-01-14 18:12:55 +0200
commit: e4058babc81f509082da126c1052b1ce389f4e3b (patch)
tree: d90edd5493b5fa7e443dbaa3508918df7dba9c9a
parent: b264fbcbe6f12d9f25a57f13f3747e2a51b2c2d9 (diff)
8 files changed, 322 insertions, 131 deletions
diff --git a/.gitignore b/.gitignore
index a70237a..b81b911 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.o
 *~
+unpremultiply
diff --git a/unpremultiply-div.c b/unpremultiply-div.c
index 514aa28..543149c 100644
--- a/unpremultiply-div.c
+++ b/unpremultiply-div.c
@@ -24,33 +24,21 @@ unpremultiply_with_div(
     uint32_t const * restrict src,
     size_t                    n)
 {
-    uint32_t prev_in = 0;
-    uint32_t prev_out = 0;
     size_t i;
-
+    
     for (i=0; i<n; i++) {
 	uint32_t rgba = src[i];
-	uint32_t a = (rgba & AMASK) >> ASHIFT;
-	if (a == 255) {
-	    dst[i] = rgba;
-	    continue;
-	}
-	if (prev_in == rgba) {
-	    dst[i] = prev_out;
-	    continue;
-	}
-	if (a) {
+	if (rgba & AMASK) {
+	    uint32_t a = (rgba >> ASHIFT) & 0xFF;
 	    uint32_t r = (rgba >> RSHIFT) & 0xFF;
 	    uint32_t g = (rgba >> GSHIFT) & 0xFF;
 	    uint32_t b = (rgba >> BSHIFT) & 0xFF;
 	    r = r*255 / a;
 	    g = g*255 / a;
 	    b = b*255 / a;
-	    prev_in = rgba;
-	    prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+	    dst[i] = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a << ASHIFT);
 	} else {
-	    prev_in = prev_out = 0;
+	    dst[i] = 0;
 	}
-	dst[i] = prev_out;
     }
 }
diff --git a/unpremultiply-inv32.c b/unpremultiply-inv32.c
index e9505de..fbb5572 100644
--- a/unpremultiply-inv32.c
+++ b/unpremultiply-inv32.c
@@ -1,7 +1,5 @@
-/* An unpremultiplier using reciprocal multiplication.  It specialises
- * constant runs and solid runs of pixels with low overhead loops and
- * uses only a 1KB table of reciprocals. */
-/* gcc -c -W -Wall -O3 -funroll-all-loops -fomit-frame-pointer -std=c99 unpremultiply-inv32.c */
+/* Basic unpremultiplier using reciprocal multiplication from a 1KB
+ * table of reciprocals. */
 #include <stdint.h>
 #include <stddef.h>
 
@@ -19,6 +17,9 @@
 #define GMASK (255U << GSHIFT)
 #define BMASK (255U << BSHIFT)
 
+/* Set to 1 if the input can have superluminant pixels. */
+#define DO_CLAMP_INPUT 0
+
 /* Shift x left by y bits.  Supports negative y for right shifts. */
 #define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
 
@@ -40,86 +41,29 @@ static uint32_t const reciprocal_table[256] = {
                R3(0), R3(64),   R3(128),  R3(192)
 };
 
-/* Transfer num_pixels unpremultiplied pixels from src[] to dst[].
- * This version uses a short probe period of a few pixels to identify
- * runs of constant or solid pixels.  When a run is identified it
- * falls into a special case loop for the duration of the run. */
 void
 unpremultiply_with_inv32(
     uint32_t       * restrict dst,
     uint32_t const * restrict src,
     size_t                    num_pixels)
 {
-#define PROBE_LENGTH 2
-
-   for (size_t i = 0; i < num_pixels; ) {
-	/* We want to identify long runs of constant input pixels and
-	 * cache the unpremultiplied.  */
-        uint32_t const_in, const_out;
-
-        /* Diff is the or of all bitwise differences from const_in
-	 * during the probe period.  If it is zero after the probe
-	 * period then every input pixel was identical in the
-	 * probe. */
-        unsigned diff = 0; 
-
-        /* Accumulator for all alphas of the probe period pixels,
-	 * biased to make the sum zero if the */
-        unsigned accu = -PROBE_LENGTH*255;
-
-	unsigned j;
-
-	/* The first iteration of the probe period initialises
-	 * const_in. */
-        {
-            uint32_t rgba = const_in = src[i];
-            uint32_t a = (rgba >> ASHIFT) & 255;
-            accu += a;
-            uint32_t r = (rgba >> RSHIFT) & 255;
-            uint32_t g = (rgba >> GSHIFT) & 255;
-            uint32_t b = (rgba >> BSHIFT) & 255;
-            uint32_t recip = reciprocal_table[a];
-            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
-            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
-            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
-            dst[i] = const_out = 
-		(r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
-        }
-
-        for (j = 1; j < PROBE_LENGTH; j++) {
-            if (i + j >= num_pixels)
-		return;
-            uint32_t rgba = src[i+j];
-            diff |= rgba ^ const_in;
-            uint32_t a = (rgba >> ASHIFT) & 255;
-            accu += a;
-            uint32_t r = (rgba >> RSHIFT) & 255;
-            uint32_t g = (rgba >> GSHIFT) & 255;
-            uint32_t b = (rgba >> BSHIFT) & 255;
-            uint32_t recip = reciprocal_table[a];
-            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
-            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
-            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
-            dst[i+j] =
-		(r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
-        }
-        i += PROBE_LENGTH;
-
-	/* Fall into special cases if we have special
-	 * circumstances. */
-        if (0 != (accu & diff)) continue;
-
-        if (0 == accu) {	/* a run of solid pixels. */
-            uint32_t in;
-            while (AMASK == ((in = src[i]) & AMASK)) {
-                dst[i++] = in;
-                if (i >= num_pixels) return;
-            }
-        } else if (0 == diff) {	/* a run of constant pixels. */
-            while (src[i] == const_in) {
-                dst[i++] = const_out;
-                if (i >= num_pixels) return;
-            }
-        }
+    size_t i;
+    for (i = 0; i < num_pixels; i++) {
+	uint32_t rgba = src[i];
+	uint32_t a = (rgba >> ASHIFT) & 255;
+	uint32_t r = (rgba >> RSHIFT) & 255;
+	uint32_t g = (rgba >> GSHIFT) & 255;
+	uint32_t b = (rgba >> BSHIFT) & 255;
+	uint32_t recip = reciprocal_table[a];
+#if DO_CLAMP_INPUT
+	r = r < a ? r : a;
+	g = g < a ? g : a;
+	b = b < a ? b : a;
+#endif
+	r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+	g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+	b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+	dst[i] =
+	    (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
     }
 }
diff --git a/unpremultiply-inv32b.c b/unpremultiply-inv32b.c
new file mode 100644
index 0000000..4735a78
--- /dev/null
+++ b/unpremultiply-inv32b.c
@@ -0,0 +1,138 @@
+/* An unpremultiplier using reciprocal multiplication.  It specialises
+ * constant runs and solid runs of pixels with low overhead loops and
+ * uses only a 1KB table of reciprocals. */
+/* gcc -c -W -Wall -O3 -funroll-all-loops -fomit-frame-pointer -std=c99 unpremultiply-inv32b.c */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components.  Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* Set to 1 if the input can have superluminant pixels. */
+#define DO_CLAMP_INPUT 0
+
+/* Shift x left by y bits.  Supports negative y for right shifts. */
+#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
+
+#define ceil_div(a,b) (((a) + (b)-1) / (b))	/* a/b rounded up; a >= 0, b > 0 */
+
+/* The reciprocal_table[i] entries are defined by
+ *
+ * 	0		when i = 0
+ *	255 / i		when i > 0
+ *
+ * represented in fixed point format with RECIPROCAL_BITS of
+ * precision and errors rounded up. */
+#define RECIPROCAL_BITS 16
+static uint32_t const reciprocal_table[256] = {
+# define R(i)  ((i) ? ceil_div(255*(1<<RECIPROCAL_BITS), (i)) : 0)
+# define R1(i) R(i),  R(i+1),   R(i+2),   R(i+3)
+# define R2(i) R1(i), R1(i+4),  R1(i+8),  R1(i+12)
+# define R3(i) R2(i), R2(i+16), R2(i+32), R2(i+48)
+               R3(0), R3(64),   R3(128),  R3(192)
+};
+
+/* Transfer num_pixels unpremultiplied pixels from src[] to dst[].
+ * This version probes two pixels at a time to identify runs of
+ * constant or solid pixels.  When a run is identified it falls into
+ * a special case loop that never reads past the end of src[]. */
+void
+unpremultiply_with_inv32b(
+    uint32_t       * restrict dst,
+    uint32_t const * restrict src,
+    size_t                    num_pixels)
+{
+    size_t i = 0;
+    while (i < num_pixels) {
+	/* We want to identify long runs of constant input pixels and
+	 * cache the unpremultiplied result.  */
+        uint32_t const_in, const_out;
+
+        /* Diff is the bitwise difference of the second probe pixel
+	 * from const_in.  If it is zero after the probe period
+	 * then both input pixels of the probe period were
+	 * identical. */
+        unsigned diff = 0;
+
+        /* Accumulator for all alphas of the probe period pixels,
+	 * biased to make the sum zero if both of them are solid. */
+        unsigned accu = -2*255;
+
+        {
+	    uint32_t rgba, a, r, g, b, recip;
+            rgba = const_in = src[i];
+            a = (rgba >> ASHIFT) & 255;
+            accu += a;
+            r = (rgba >> RSHIFT) & 255;
+            g = (rgba >> GSHIFT) & 255;
+            b = (rgba >> BSHIFT) & 255;
+            recip = reciprocal_table[a];
+#if DO_CLAMP_INPUT
+	    r = r < a ? r : a;
+	    g = g < a ? g : a;
+	    b = b < a ? b : a;
+#endif
+            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+            dst[i] = const_out =
+		(r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+        }
+
+	if (i + 1 == num_pixels)
+	    return;
+
+	{
+	    uint32_t rgba, a, r, g, b, recip;
+            rgba = src[i+1];
+            a = (rgba >> ASHIFT) & 255;
+            accu += a;
+            r = (rgba >> RSHIFT) & 255;
+            g = (rgba >> GSHIFT) & 255;
+            b = (rgba >> BSHIFT) & 255;
+            recip = reciprocal_table[a];
+#if DO_CLAMP_INPUT
+	    r = r < a ? r : a;
+	    g = g < a ? g : a;
+	    b = b < a ? b : a;
+#endif
+            diff = rgba ^ const_in;
+            r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+            g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+            b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+            dst[i+1] =
+		(r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+        }
+
+        i += 2;
+
+	/* Fall into special cases if we have special
+	 * circumstances. */
+        if (0 != (accu & diff))
+	    continue;
+
+        if (0 == accu) {	/* a run of solid pixels. */
+            uint32_t in;
+            /* Check the bound before reading src[i]: the probe may
+             * have consumed the last pixels of the input. */
+            while (i < num_pixels && AMASK == ((in = src[i]) & AMASK)) {
+                dst[i++] = in;
+            }
+        } else if (0 == diff) {	/* a run of constant pixels. */
+            while (i < num_pixels && src[i] == const_in) {
+                dst[i++] = const_out;
+            }
+        }
+    }
+}
diff --git a/unpremultiply-lut.c b/unpremultiply-lut.c
index 048eefa..4954f51 100644
--- a/unpremultiply-lut.c
+++ b/unpremultiply-lut.c
@@ -1,10 +1,7 @@
-/* An unpremultiplier using a 64KB division table.  It specialises
- * constant runs and solid runs of pixels, but not quite as well as
- * unpremultiply-inv32 (but it could given some more effort.)  It has
- * the advantage over the others that it can clamp the result into
- * range without any cost so that the output doesn't overflow should
- * there be superluminant pixels in the input.  It's also reasonably
- * fast if you have enough L1. */
+/* Basic unpremultiplier using a 64KB division table.  It has the
+ * advantage over the others that it can clamp the result into range
+ * without any cost, so it is safe to pass it superluminant input
+ * pixels. */
 #include <stddef.h>
 #include <stdint.h>
 
@@ -17,10 +14,10 @@
 #define GSHIFT ((16 + ASHIFT) % 32)
 #define BSHIFT (( 8 + ASHIFT) % 32)
 
-#define AMASK (255 << ASHIFT)
-#define RMASK (255 << RSHIFT)
-#define GMASK (255 << GSHIFT)
-#define BMASK (255 << BSHIFT)
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
 
 /* The function to tabulate in the lookup table.  Could be anything
  * really. */
@@ -53,27 +50,15 @@ void
 unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
 {
     size_t i;
-    uint32_t prev_in = 0;
-    uint32_t prev_out = 0;
     for (i=0; i<n; i++) {
 	uint32_t rgba = src[i];
 	uint32_t a = (rgba >> ASHIFT) & 0xFF;
-	if (a == 255) {
-	    dst[i] = rgba;
-	}
-	else if (prev_in == rgba) {
-	    dst[i] = prev_out;
-	}
-	else {
-	    uint32_t r = (rgba >> RSHIFT) & 0xFF;
-	    uint32_t g = (rgba >> GSHIFT) & 0xFF;
-	    uint32_t b = (rgba >> BSHIFT) & 0xFF;
-	    r = division_table[a][r];
-	    g = division_table[a][g];
-	    b = division_table[a][b];
-	    prev_in = rgba;
-	    prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
-	    dst[i] = prev_out;
-	}
+	uint32_t r = (rgba >> RSHIFT) & 0xFF;
+	uint32_t g = (rgba >> GSHIFT) & 0xFF;
+	uint32_t b = (rgba >> BSHIFT) & 0xFF;
+	r = division_table[a][r];
+	g = division_table[a][g];
+	b = division_table[a][b];
+	dst[i] = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a << ASHIFT);
     }
 }
diff --git a/unpremultiply-lutb.c b/unpremultiply-lutb.c
new file mode 100644
index 0000000..bdbe039
--- /dev/null
+++ b/unpremultiply-lutb.c
@@ -0,0 +1,120 @@
+/* An unpremultiplier using a 64KB division table.  It specialises
+ * constant runs and solid runs of pixels, but not quite as well as
+ * unpremultiply-inv32b (but it could given some more effort.)  It has
+ * the advantage over the others that it can clamp the result into
+ * range without any cost so that the output doesn't overflow should
+ * there be superluminant pixels in the input.  It's also reasonably
+ * fast if you have enough L1. */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Pixel format config for a 32 bit pixel with 8 bit components.  Only
+ * the location of alpha matters. */
+#ifndef ASHIFT
+# define ASHIFT 24
+#endif
+#define RSHIFT ((24 + ASHIFT) % 32)
+#define GSHIFT ((16 + ASHIFT) % 32)
+#define BSHIFT (( 8 + ASHIFT) % 32)
+
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+/* The function to tabulate in the lookup table.  Could be anything
+ * really. */
+#define div_func(a,b)	(255*(a) / (b))
+
+#define clamp(x)	((x) > 255 ? 255 : (x))
+
+/* The entries division_table[b][a] are defined by
+ *
+ *   0				when b = 0
+ *   min(255, floor(255*a / b))	when b > 0
+ *
+ * i.e. the first index is the alpha and the second the component. */
+static uint8_t const division_table[256][256] = {
+
+#define R(a,b)		((b) > 0 ? clamp(div_func((a),(b))) : 0)
+#define R1(a,i)		R(a, i),  R(a+1, i),   R(a+2, i),   R(a+3, i)
+#define R2(a,i)		R1(a, i), R1(a+4, i),  R1(a+8, i),  R1(a+12, i)
+#define R3(a,i)		R2(a, i), R2(a+16, i), R2(a+32, i), R2(a+48, i)
+
+#define S(b)	      { R3(0, b), R3(64, b),   R3(128, b),  R3(192, b) }
+#define S1(b)		S(b),	  S(b+1),      S(b+2),      S(b+3)
+#define S2(b)		S1(b),	  S1(b+4),     S1(b+8),	    S1(b+12)
+#define S3(b)		S2(b),	  S2(b+16),    S2(b+32),    S2(b+48)
+
+			S3(0),	  S3(64),      S3(128),	    S3(192)
+};
+
+/* Unpremultiply num_pixels pixels from src[] into dst[] using the
+ * division table.  Pixels are probed in pairs to detect runs of solid
+ * or constant pixels, which are then copied without table lookups. */
+void
+unpremultiply_with_lutb(
+    uint32_t       * restrict dst,
+    uint32_t const * restrict src,
+    size_t                    num_pixels)
+{
+    size_t i = 0;
+    while (i < num_pixels) {
+	uint32_t const_in, const_out;	/* First probe pixel and its result. */
+	uint32_t accu, diff;
+	{
+	    uint32_t rgba, r, g, b, a;
+	    rgba = const_in = src[i];
+	    a = (rgba >> ASHIFT) & 0xFF;
+	    accu = a;			/* Sum of the probed alphas. */
+	    r = (rgba >> RSHIFT) & 0xFF;
+	    g = (rgba >> GSHIFT) & 0xFF;
+	    b = (rgba >> BSHIFT) & 0xFF;
+	    r = division_table[a][r];
+	    g = division_table[a][g];
+	    b = division_table[a][b];
+	    dst[i] = const_out =
+		(r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+	}
+
+	if (i+1 >= num_pixels)
+	    return;
+
+	{
+	    uint32_t rgba, r, g, b, a;
+	    rgba = src[i+1];
+	    a = (rgba >> ASHIFT) & 0xFF;
+	    accu += a;
+	    r = (rgba >> RSHIFT) & 0xFF;
+	    g = (rgba >> GSHIFT) & 0xFF;
+	    b = (rgba >> BSHIFT) & 0xFF;
+	    diff = const_in ^ rgba;	/* Zero iff both probe pixels match. */
+	    r = division_table[a][r];
+	    g = division_table[a][g];
+	    b = division_table[a][b];
+	    dst[i+1] =
+		(r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+	}
+
+	accu -= 2*255;			/* Now zero iff both alphas were 255. */
+	i += 2;
+	/* A shared set bit means both are nonzero: no special case. */
+	if (0 != (diff & accu))
+	    continue;
+	if (0 == accu) {
+	    /* A run of solid pixels: copy them verbatim.  The bound
+	     * is tested before src[i] is read because the probe may
+	     * have consumed the last pixels of the input. */
+	    uint32_t in;
+	    while (i < num_pixels && AMASK == ((in = src[i]) & AMASK)) {
+		dst[i++] = in;
+	    }
+	}
+	else if (0 == diff) {
+	    /* A run of constant pixels: reuse the cached result. */
+	    while (i < num_pixels && src[i] == const_in) {
+		dst[i++] = const_out;
+	    }
+	}
+    }
+}
diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S
index 1ffbf5f..9596876 100644
--- a/unpremultiply-sse2.S
+++ b/unpremultiply-sse2.S
@@ -84,7 +84,7 @@ unpremultiply_single_pixels:
 	mov	ebx, eax
 	mov	ebp, eax		; Initialise result pixel register.
 	and	ebp, 0xFF000000		; Mask off non-alpha from result pix.
-	jz	.next
+;	jz	.next
 	shr	ebx, 24			; Load alpha.
 	mov	ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
 
@@ -263,7 +263,7 @@ unpremultiply_with_sse2:
 	; If we don't have enough pixels for at least a few iterations
 	; of blocked unpremultiplication then do the pixels one at a time.
 	cmp	rdx, 3+4*4+3		; Max. pre/post align + 4 blocks.
-	jae	.do_blocked
+;	jae	.do_blocked
 	 mov	rcx, rdx		; Pixel count.
 	 call	unpremultiply_single_pixels
 	 jmp	.out
@@ -289,8 +289,8 @@ unpremultiply_with_sse2:
 	; the image is fairly small then use movdqa writes.
 	cmp	rdi, rsi		; Use movdqa for aliased src, dst.
 	jz	.1
-	cmp	rdx, 8192		; ... or if the src and dest are small.
-	jc	.1
+;	cmp	rdx, 128		; ... or if the src and dest are small.
+;	jc	.1
 	 unpremultiply_pixel_blocks movntdq
 	 jmp	.do_leftovers
 .1:
diff --git a/unpremultiply.c b/unpremultiply.c
index 201a59c..322367b 100644
--- a/unpremultiply.c
+++ b/unpremultiply.c
@@ -1,6 +1,7 @@
 #define RUN_ME /*
 nasm -g -f elf64 unpremultiply-sse2.S
-CFLAGS="-W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g"
+CFLAGS="-W -Wall -Wextra -std=c99 -O2 -g"
+CFLAGS="$CFLAGS -O3 -fomit-frame-pointer -funroll-all-loops"
 gcc $CFLAGS -c *.c
 gcc $CFLAGS -o `basename $0 .c` *.o
 exit $?
@@ -31,7 +32,9 @@ exit $?
 /* The methods we have available. */
 void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
 void unpremultiply_with_inv32(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_inv32b(uint32_t *dst, uint32_t const *src, size_t n);
 void unpremultiply_with_lut(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_lutb(uint32_t *dst, uint32_t const *src, size_t n);
 void unpremultiply_with_div(uint32_t *dst, uint32_t const *src, size_t n);
 
 /*
@@ -208,7 +211,9 @@ main(int argc, char **argv)
 	}
 	else if (0 == strcmp(argv[i], "div") ||
 		 0 == strcmp(argv[i], "lut") ||
+		 0 == strcmp(argv[i], "lutb") ||
 		 0 == strcmp(argv[i], "inv32") ||
+		 0 == strcmp(argv[i], "inv32b") ||
 		 0 == strcmp(argv[i], "sse2") ||
 		 0 == strcmp(argv[i], "copy") ||
 		 0 == strcmp(argv[i], "read") ||
@@ -240,11 +245,21 @@ main(int argc, char **argv)
 	    unpremultiply_with_lut(dst, src, n);
 	}
     }
+    else if (0 == strcmp(method, "lutb")) {
+	while (nloops-- > 0) {
+	    unpremultiply_with_lutb(dst, src, n);
+	}
+    }
     else if (0 == strcmp(method, "inv32")) {
 	while (nloops-- > 0) {
 	    unpremultiply_with_inv32(dst, src, n);
 	}
     }
+    else if (0 == strcmp(method, "inv32b")) {
+	while (nloops-- > 0) {
+	    unpremultiply_with_inv32b(dst, src, n);
+	}
+    }
     else if (0 == strcmp(method, "sse2")) {
 	while (nloops-- > 0) {
 	    unpremultiply_with_sse2(dst, src, n);
author	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-14 18:12:55 +0200
committer	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-14 18:12:55 +0200
commit	e4058babc81f509082da126c1052b1ce389f4e3b (patch)
tree	d90edd5493b5fa7e443dbaa3508918df7dba9c9a
parent	b264fbcbe6f12d9f25a57f13f3747e2a51b2c2d9 (diff)