diff options
author | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-13 19:33:06 +0200 |
---|---|---|
committer | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-13 19:33:06 +0200 |
commit | b264fbcbe6f12d9f25a57f13f3747e2a51b2c2d9 (patch) | |
tree | 8e84d115d76e50518de0f9d0fcd890815d3e5a63 | |
parent | a30c4f05e916db2b04613dc22e357ab63235a776 (diff) |
Separate unpremultiplier methods, clean them up, and kill the bad ones.
-rw-r--r-- | unpremultiply-div.c | 56 | ||||
-rw-r--r-- | unpremultiply-inv32.c | 125 | ||||
-rw-r--r-- | unpremultiply-lut.c | 79 | ||||
-rw-r--r-- | unpremultiply-sse2.S | 1 | ||||
-rw-r--r-- | unpremultiply.c | 378 |
5 files changed, 297 insertions, 342 deletions
diff --git a/unpremultiply-div.c b/unpremultiply-div.c new file mode 100644 index 0000000..514aa28 --- /dev/null +++ b/unpremultiply-div.c @@ -0,0 +1,56 @@ +/* Reference implementation using divisions. Since the slow path is + * so very slow this version specialises runs of constant or solid + * pixels. */ +#include <stddef.h> +#include <stdint.h> + +/* Pixel format config for a 32 bit pixel with 8 bit components. Only + * the location of alpha matters. */ +#ifndef ASHIFT +# define ASHIFT 24 +#endif +#define RSHIFT ((24 + ASHIFT) % 32) +#define GSHIFT ((16 + ASHIFT) % 32) +#define BSHIFT (( 8 + ASHIFT) % 32) +/* NOTE(review): 255 is a signed int here, so the mask whose shift is + * 24 (AMASK with the default ASHIFT) shifts into the sign bit, which + * C leaves undefined for signed operands. unpremultiply-inv32.c + * spells the same masks with 255U. */ +#define AMASK (255 << ASHIFT) +#define RMASK (255 << RSHIFT) +#define GMASK (255 << GSHIFT) +#define BMASK (255 << BSHIFT) +/* Write the unpremultiplied form of the n pixels at src[] to dst[]. + * Each colour component c of a pixel with alpha a becomes c*255 / a. + * Pixels with alpha 255 are copied as is, pixels with alpha 0 become + * 0, and a pixel equal to the last one converted reuses the cached + * result instead of dividing again. */ +void +unpremultiply_with_div( + uint32_t * restrict dst, + uint32_t const * restrict src, + size_t n) +{ + uint32_t prev_in = 0; + uint32_t prev_out = 0; + size_t i; + + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba & AMASK) >> ASHIFT; + if (a == 255) { + dst[i] = rgba; + continue; + } + if (prev_in == rgba) { + dst[i] = prev_out; + continue; + } + if (a) { + uint32_t r = (rgba >> RSHIFT) & 0xFF; + uint32_t g = (rgba >> GSHIFT) & 0xFF; + uint32_t b = (rgba >> BSHIFT) & 0xFF; /* No clamping: each quotient below fits in 8 bits only when the + * component is <= a. NOTE(review): presumably inputs are + * saturated first, as the test driver does -- confirm. */ + r = r*255 / a; + g = g*255 / a; + b = b*255 / a; + prev_in = rgba; + prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); + } else { + prev_in = prev_out = 0; + } + dst[i] = prev_out; + } +} diff --git a/unpremultiply-inv32.c b/unpremultiply-inv32.c new file mode 100644 index 0000000..e9505de --- /dev/null +++ b/unpremultiply-inv32.c @@ -0,0 +1,125 @@ +/* An unpremultiplier using reciprocal multiplication. It specialises + * constant runs and solid runs of pixels with low overhead loops and + * uses only a 1KB table of reciprocals. 
*/ +/* gcc -c -W -Wall -O3 -funroll-all-loops -fomit-frame-pointer -std=c99 unpremultiply-inv32.c */ +#include <stdint.h> +#include <stddef.h> + +/* Pixel format config for a 32 bit pixel with 8 bit components. Only + * the location of alpha matters. */ +#ifndef ASHIFT +# define ASHIFT 24 +#endif +#define RSHIFT ((24 + ASHIFT) % 32) +#define GSHIFT ((16 + ASHIFT) % 32) +#define BSHIFT (( 8 + ASHIFT) % 32) + +#define AMASK (255U << ASHIFT) +#define RMASK (255U << RSHIFT) +#define GMASK (255U << GSHIFT) +#define BMASK (255U << BSHIFT) + +/* Shift x left by y bits. Supports negative y for right shifts. */ +#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y)) +/* Quotient a / b rounded up, for a >= 0 and b > 0. + * NOTE(review): the expansion has no outer parentheses, so it is only + * safe where no neighbouring operator can bind into it (the single + * use in R() below is such a place). */ +#define ceil_div(a,b) ((a) + (b)-1) / (b) + +/* The reciprocal_table[i] entries are defined by + * + * 0 when i = 0 + * 255 / i when i > 0 + * + * represented in fixed point format with RECIPROCAL_BITS of + * precision and errors rounded up. */ +#define RECIPROCAL_BITS 16 +static uint32_t const reciprocal_table[256] = { +# define R(i) ((i) ? ceil_div(255*(1<<RECIPROCAL_BITS), (i)) : 0) +# define R1(i) R(i), R(i+1), R(i+2), R(i+3) +# define R2(i) R1(i), R1(i+4), R1(i+8), R1(i+12) +# define R3(i) R2(i), R2(i+16), R2(i+32), R2(i+48) + R3(0), R3(64), R3(128), R3(192) +}; + +/* Unpremultiply num_pixels pixels from src[] and store them in dst[]. + * Colour components are multiplied by reciprocal_table[alpha] instead + * of being divided by alpha. This version uses a short probe period + * of PROBE_LENGTH pixels to identify runs of constant or solid + * pixels. When a run is identified it falls into a special case + * loop for the duration of the run. */ +void +unpremultiply_with_inv32( + uint32_t * restrict dst, + uint32_t const * restrict src, + size_t num_pixels) +{ +#define PROBE_LENGTH 2 + + for (size_t i = 0; i < num_pixels; ) { + /* We want to identify long runs of constant input pixels and + * cache the unpremultiplied result. */ + uint32_t const_in, const_out; + + /* Diff is the OR of all bitwise differences from const_in + * during the probe period. 
If it is zero after the probe + * period then every input pixel was identical in the + * probe. */ + unsigned diff = 0; + + /* Accumulator for all alphas of the probe period pixels, + * biased to make the sum zero if every pixel in the probe + * is solid (alpha 255). */ + unsigned accu = -PROBE_LENGTH*255; + + unsigned j; + + /* The first iteration of the probe period initialises + * const_in. */ + { + uint32_t rgba = const_in = src[i]; + uint32_t a = (rgba >> ASHIFT) & 255; + accu += a; + uint32_t r = (rgba >> RSHIFT) & 255; + uint32_t g = (rgba >> GSHIFT) & 255; + uint32_t b = (rgba >> BSHIFT) & 255; + uint32_t recip = reciprocal_table[a]; + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i] = const_out = + (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + } + + for (j = 1; j < PROBE_LENGTH; j++) { + if (i + j >= num_pixels) + return; + uint32_t rgba = src[i+j]; + diff |= rgba ^ const_in; + uint32_t a = (rgba >> ASHIFT) & 255; + accu += a; + uint32_t r = (rgba >> RSHIFT) & 255; + uint32_t g = (rgba >> GSHIFT) & 255; + uint32_t b = (rgba >> BSHIFT) & 255; + uint32_t recip = reciprocal_table[a]; + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i+j] = + (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + } + i += PROBE_LENGTH; + + /* Fall into special cases if we have special + * circumstances. + * NOTE(review): both run loops below read src[i] before + * comparing i with num_pixels, so src[num_pixels] is read + * when the probe ends exactly at the end of the input and + * found a solid or constant run -- confirm and guard. */ + if (0 != (accu & diff)) continue; + + if (0 == accu) { /* a run of solid pixels. */ + uint32_t in; + while (AMASK == ((in = src[i]) & AMASK)) { + dst[i++] = in; + if (i >= num_pixels) return; + } + } else if (0 == diff) { /* a run of constant pixels. 
*/ + while (src[i] == const_in) { + dst[i++] = const_out; + if (i >= num_pixels) return; + } + } + } +} diff --git a/unpremultiply-lut.c b/unpremultiply-lut.c new file mode 100644 index 0000000..048eefa --- /dev/null +++ b/unpremultiply-lut.c @@ -0,0 +1,79 @@ +/* An unpremultiplier using a 64KB division table. It specialises + * constant runs and solid runs of pixels, but not quite as well as + * unpremultiply-inv32 (but it could given some more effort.) It has + * the advantage over the others that it can clamp the result into + * range without any cost so that the output doesn't overflow should + * there be superluminant pixels in the input. It's also reasonably + * fast if you have enough L1. */ +#include <stddef.h> +#include <stdint.h> + +/* Pixel format config for a 32 bit pixel with 8 bit components. Only + * the location of alpha matters. */ +#ifndef ASHIFT +# define ASHIFT 24 +#endif +#define RSHIFT ((24 + ASHIFT) % 32) +#define GSHIFT ((16 + ASHIFT) % 32) +#define BSHIFT (( 8 + ASHIFT) % 32) + +#define AMASK (255 << ASHIFT) +#define RMASK (255 << RSHIFT) +#define GMASK (255 << GSHIFT) +#define BMASK (255 << BSHIFT) + +/* The function to tabulate in the lookup table. Could be anything + * really. */ +#define div_func(a,b) (255*(a) / (b)) + +#define clamp(x) ((x) > 255 ? 255 : (x)) + +/* The entries division_table[b][a] are defined by + * + * 0 when b = 0 + * min(255, 255*a / b) when b > 0 + * + * where the quotient is truncated by integer division. The first + * index b is the divisor (the pixel's alpha) and the second index a + * is the colour component being divided. + */ +static uint8_t const division_table[256][256] = { + +#define R(a,b) ((b) > 0 ? 
clamp(div_func((a),(b))) : 0) +#define R1(a,i) R(a, i), R(a+1, i), R(a+2, i), R(a+3, i) +#define R2(a,i) R1(a, i), R1(a+4, i), R1(a+8, i), R1(a+12, i) +#define R3(a,i) R2(a, i), R2(a+16, i), R2(a+32, i), R2(a+48, i) + +#define S(b) { R3(0, b), R3(64, b), R3(128, b), R3(192, b) } +#define S1(b) S(b), S(b+1), S(b+2), S(b+3) +#define S2(b) S1(b), S1(b+4), S1(b+8), S1(b+12) +#define S3(b) S2(b), S2(b+16), S2(b+32), S2(b+48) + + S3(0), S3(64), S3(128), S3(192) +}; +/* Write the unpremultiplied form of the n pixels at src[] to dst[], + * looking each colour component up in division_table[alpha]. Pixels + * with alpha 255 are copied as is, and a pixel equal to the last one + * converted reuses the cached result instead of repeating the + * lookups. */ +void +unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + size_t i; + uint32_t prev_in = 0; + uint32_t prev_out = 0; + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba >> ASHIFT) & 0xFF; + if (a == 255) { + dst[i] = rgba; + } + else if (prev_in == rgba) { + dst[i] = prev_out; + } + else { + uint32_t r = (rgba >> RSHIFT) & 0xFF; + uint32_t g = (rgba >> GSHIFT) & 0xFF; + uint32_t b = (rgba >> BSHIFT) & 0xFF; + r = division_table[a][r]; + g = division_table[a][g]; + b = division_table[a][b]; + prev_in = rgba; + prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); + dst[i] = prev_out; + } + } +} diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S index 8ecee17..1ffbf5f 100644 --- a/unpremultiply-sse2.S +++ b/unpremultiply-sse2.S @@ -9,6 +9,7 @@ ;;; uint32_t const *src, ;;; unsigned long num_pixels); ;;; +;;; Tested with nasm 2.06rc2. section .text ; We're only using rax-rbp in this file so that diff --git a/unpremultiply.c b/unpremultiply.c index 6d904d7..201a59c 100644 --- a/unpremultiply.c +++ b/unpremultiply.c @@ -1,288 +1,42 @@ #define RUN_ME /* nasm -g -f elf64 unpremultiply-sse2.S -gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o $0 +CFLAGS="-W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g" +gcc $CFLAGS -c *.c +gcc $CFLAGS -o `basename $0 .c` *.o exit $? */ +/* Test driver for unpremultipliers. 
*/ #include <assert.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> - #include <sys/types.h> -#include <sys/time.h> +#include <sys/time.h> /* gettimeofday */ -#if 1 +/* Pixel format config for a 32 bit pixel with 8 bit components. Only + * the location of alpha matters. */ +#ifndef ASHIFT # define ASHIFT 24 -# define RSHIFT 16 -# define GSHIFT 8 -# define BSHIFT 0 -#else -# define RSHIFT 24 -# define GSHIFT 16 -# define BSHIFT 8 -# define ASHIFT 0 #endif +#define RSHIFT ((24 + ASHIFT) % 32) +#define GSHIFT ((16 + ASHIFT) % 32) +#define BSHIFT (( 8 + ASHIFT) % 32) #define AMASK (255 << ASHIFT) #define RMASK (255 << RSHIFT) #define GMASK (255 << GSHIFT) #define BMASK (255 << BSHIFT) +/* The methods we have available. */ void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_inv32(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_lut(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_div(uint32_t *dst, uint32_t const *src, size_t n); -static void __attribute__((noinline)) -unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) -{ - uint32_t prev_in = 0; - uint32_t prev_out = 0; - size_t i; - - for (i=0; i<n; i++) { - uint32_t rgba = src[i]; - uint32_t a = (rgba & AMASK) >> ASHIFT; - if (a == 255) { - dst[i] = rgba; - continue; - } - if (prev_in == rgba) { - dst[i] = prev_out; - continue; - } - if (a) { - uint32_t r = (rgba >> RSHIFT) & 0xFF; - uint32_t g = (rgba >> GSHIFT) & 0xFF; - uint32_t b = (rgba >> BSHIFT) & 0xFF; - r = r*255 / a; - g = g*255 / a; - b = b*255 / a; - assert(r < 256); - assert(g < 256); - assert(b < 256); - assert(a < 256); - prev_in = rgba; - prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); - } else { - prev_in = prev_out = 0; - } - dst[i] = prev_out; - } -} - -static uint8_t division_table[65536]; - -static void __attribute__((noinline)) -unpremultiply_with_lut(uint32_t * 
restrict dst, uint32_t const * restrict src, size_t n) -{ - size_t i; - uint32_t prev_in = 0; - uint32_t prev_out = 0; - for (i=0; i<n; i++) { - uint32_t rgba = src[i]; - uint32_t a = (rgba >> ASHIFT) & 0xFF; - if (a == 255) { - dst[i] = rgba; - } - else if (prev_in == rgba) { - dst[i] = prev_out; - } - else { - uint32_t r = (rgba >> RSHIFT) & 0xFF; - uint32_t g = (rgba >> GSHIFT) & 0xFF; - uint32_t b = (rgba >> BSHIFT) & 0xFF; - r = division_table[a*256 + r]; - g = division_table[a*256 + g]; - b = division_table[a*256 + b]; - - prev_in = rgba; - prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); - dst[i] = prev_out; - } - } -} - -#define RECIPROCAL_BITS 8 - -static uint32_t reciprocal_table_A[256]; -static uint64_t reciprocal_table_B[256]; - -#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y)) - -static void __attribute__((noinline)) -unpremultiply_with_inv32(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) -{ - size_t i; - uint32_t prev_in = 0; - uint32_t prev_out = 0; - for (i=0; i<n; i++) { - uint32_t rgba = src[i]; - uint32_t a = (rgba >> ASHIFT) & 255; - if (a == 255) { - dst[i] = rgba; - } - else if (prev_in == rgba) { - dst[i] = prev_out; - } - else { - prev_in = rgba; - uint32_t r = (rgba >> RSHIFT) & 255; - uint32_t g = (rgba >> GSHIFT) & 255; - uint32_t b = (rgba >> BSHIFT) & 255; - uint32_t recip = reciprocal_table_A[a]; - r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); - g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); - b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); - prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); - dst[i] = prev_out; - } - } -} - -#define INNER_UNROLL (2) - -static void __attribute__((noinline)) -unpremultiply_with_inv32_bis(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) -{ - for (size_t i = 0; i < n; ) { - uint32_t prev_in, prev_out; - /* 0 iff all inputs are the same */ - unsigned delta = 0; - /* adds to 0 if all inputs are opaque */ - unsigned 
opaque_count = - INNER_UNROLL*255; - - { - uint32_t rgba = prev_in = src[i]; - uint32_t a = (rgba >> ASHIFT) & 255; - opaque_count += a; - uint32_t r = (rgba >> RSHIFT) & 255; - uint32_t g = (rgba >> GSHIFT) & 255; - uint32_t b = (rgba >> BSHIFT) & 255; - uint32_t recip = reciprocal_table_A[a]; - r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); - g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); - b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); - dst[i] = prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); - } - - /* UNROLL, bitch! */ - for (unsigned j = 1; j < INNER_UNROLL; j++) { - if (i+j>=n) return; - uint32_t rgba = src[i+j]; - delta |= rgba ^ prev_in; - uint32_t a = (rgba >> ASHIFT) & 255; - opaque_count += a; - uint32_t r = (rgba >> RSHIFT) & 255; - uint32_t g = (rgba >> GSHIFT) & 255; - uint32_t b = (rgba >> BSHIFT) & 255; - uint32_t recip = reciprocal_table_A[a]; - r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); - g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); - b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); - dst[i+j] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); - } - - i += INNER_UNROLL; - - /* switch to special case after k of them - * + minimize cost when unapplicable - */ - - if (0 != (delta & opaque_count)) continue; - - if (0 == opaque_count) { - uint32_t in; - while ((255U << ASHIFT) == ((in = src[i]) & (255U << ASHIFT))) { - dst[i++] = in; - if (i >= n) return; - } - } else if (0 == delta) { - while (src[i] == prev_in) { - dst[i++] = prev_out; - if (i >= n) return; - } - } - } -} -#undef INNER_UNROLL - -static void __attribute__((noinline)) -unpremultiply_with_inv32_nocache( - uint32_t * restrict dst, - uint32_t const * restrict src, - size_t n) -{ - size_t i; - for (i=0; i<n; i++) { - uint32_t rgba = src[i]; - uint32_t a = (rgba >> ASHIFT) & 255; - uint32_t r = (rgba >> RSHIFT) & 255; - uint32_t g = (rgba >> GSHIFT) & 255; - uint32_t b = (rgba >> BSHIFT) & 255; - uint32_t recip = reciprocal_table_A[a]; - r = SHIFT(r 
* recip, RSHIFT - RECIPROCAL_BITS); - g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); - b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); - dst[i] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); - } -} - -static void __attribute__((noinline)) -unpremultiply_with_inv64(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) -{ - size_t i; - uint32_t prev_in = 0; - uint32_t prev_out = 0; - for (i=0; i<n; i++) { - uint32_t rgba = src[i]; - uint32_t a = (rgba >> ASHIFT) & 255; - if (a == 255) { - dst[i] = rgba; - } - else if (prev_in == rgba) { - dst[i] = prev_out; - } - else { - prev_in = rgba; - uint64_t r = rgba & RMASK; - uint64_t g = rgba & GMASK; - uint64_t b = rgba & BMASK; - uint64_t recip = reciprocal_table_B[a]; - r = r * recip; - g = g * recip; - b = b * recip; - prev_out = (rgba & AMASK) | - (((r & ((uint64_t)RMASK << 32)) | - (g & ((uint64_t)GMASK << 32)) | - (b & ((uint64_t)BMASK << 32))) >> 32); - dst[i] = prev_out; - } - } -} - -static void __attribute__((noinline)) -unpremultiply_with_inv64_nocache( - uint32_t * restrict dst, - uint32_t const * restrict src, - size_t n) -{ - size_t i; - for (i=0; i<n; i++) { - uint32_t rgba = src[i]; - uint32_t a = (rgba & AMASK) >> ASHIFT; - uint64_t r = rgba & RMASK; - uint64_t g = rgba & GMASK; - uint64_t b = rgba & BMASK; - uint64_t recip = reciprocal_table_B[a]; - r = r * recip; - g = g * recip; - b = b * recip; - dst[i] = (rgba & AMASK) | - (((r & ((uint64_t)RMASK << 32)) | - (g & ((uint64_t)GMASK << 32)) | - (b & ((uint64_t)BMASK << 32))) >> 32); - } -} - +/* + * Some non-functional methods to gauge the computation overhead. 
+ */ static void __attribute__((noinline)) unpremultiply_with_memcpy(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) { @@ -309,47 +63,9 @@ unpremultiply_with_read(uint32_t * restrict dst, uint32_t const * restrict src, read_sum = sum; } -static void -make_division_table() -{ - unsigned a; - for (a=1; a<256; a++) { - unsigned x; - for (x=0; x<256; x++) { - unsigned y = x*255/a; - y = y < 255 ? y : 255; - division_table[a*256 + x] = y; - } - } - for (a=0; a<256; a++) { - division_table[a] = 0; - } -} - -static void -make_reciprocal_table_A() -{ - unsigned a; - reciprocal_table_A[0] = 0; - for (a=1; a<256; a++) { - uint32_t r = 255U*(1<<RECIPROCAL_BITS) / a; - while ((a*r) >> RECIPROCAL_BITS < 255) r++; - reciprocal_table_A[a] = r; - } -} - -static void -make_reciprocal_table_B() -{ - unsigned a; - reciprocal_table_B[0] = 0; - for (a=1; a<256; a++) { - uint64_t r = 255*(1ULL<<32)/a; - while ((a*r) >> 32 < 255) r++; - reciprocal_table_B[a] = r; - } -} - +/* Make sure we don't have superluminant pixels by clamping the colour + * components of n pixels at buf[] to be less than or equal to the + * pixels' alpha values. */ static void saturate(void *buf, size_t n) { @@ -361,9 +77,9 @@ saturate(void *buf, size_t n) * about the fact that it's accessing uint32_t values. The * confusion is currently enough to make it doubt the * alignment of the buf pointer and not let the vectoriser at - * this loop. The new vectoriser aggressively uses aligned - * accesses, and sometimes this loop is called with unaligned - * addresses. */ + * this loop. The new vectoriser aggressively depends on naturally + * aligned pointers, but sometimes we want to test with unaligned + * addresses too. */ union { uint32_t u32; uint8_t u8[4]; @@ -393,6 +109,9 @@ saturate(void *buf, size_t n) } } +/* + * Methods to create pixel patterns to test. 
+ */ static void fill_random(uint32_t *buf, size_t n) { @@ -428,6 +147,10 @@ fill_empty(void *buf, size_t n) memset(buf, 0, 4*n); } +/* + * Main tester. + */ + static long getenvlong(char const *name, long default_value) { @@ -446,10 +169,9 @@ now_ms() int main(int argc, char **argv) { - long nloops = getenvlong("loops", 50); + long nloops = getenvlong("loops", 500); size_t n = getenvlong("pixels", 2*1024*1024); - long offset = getenvlong("offset", 0); - /* non-zero is liable to segfault due to gcc alignment breakage */ + long offset = getenvlong("offset", 0); /* byte offset */ union { uint32_t *u32; char *u8; @@ -465,10 +187,6 @@ main(int argc, char **argv) #define dst udst.u32 #define src usrc.u32 - make_division_table(); - make_reciprocal_table_A(); - make_reciprocal_table_B(); - for (i=1; i<argc; i++) { if (0 == strcmp(argv[i], "verify")) { verify = 1; @@ -491,10 +209,6 @@ main(int argc, char **argv) else if (0 == strcmp(argv[i], "div") || 0 == strcmp(argv[i], "lut") || 0 == strcmp(argv[i], "inv32") || - 0 == strcmp(argv[i], "inv32-bis") || - 0 == strcmp(argv[i], "inv64") || - 0 == strcmp(argv[i], "inv32-nocache") || - 0 == strcmp(argv[i], "inv64-nocache") || 0 == strcmp(argv[i], "sse2") || 0 == strcmp(argv[i], "copy") || 0 == strcmp(argv[i], "read") || @@ -508,7 +222,7 @@ main(int argc, char **argv) return 1; } } - saturate(src, n); + saturate(src, n); /* don't deal with superluminant pixels. 
*/ if (verify) { ref = malloc(n*4); @@ -531,24 +245,9 @@ main(int argc, char **argv) unpremultiply_with_inv32(dst, src, n); } } - else if (0 == strcmp(method, "inv32-bis")) { - while (nloops-- > 0) { - unpremultiply_with_inv32_bis(dst, src, n); - } - } - else if (0 == strcmp(method, "inv64")) { - while (nloops-- > 0) { - unpremultiply_with_inv64(dst, src, n); - } - } - else if (0 == strcmp(method, "inv32-nocache")) { - while (nloops-- > 0) { - unpremultiply_with_inv32_nocache(dst, src, n); - } - } - else if (0 == strcmp(method, "inv64-nocache")) { + else if (0 == strcmp(method, "sse2")) { while (nloops-- > 0) { - unpremultiply_with_inv64_nocache(dst, src, n); + unpremultiply_with_sse2(dst, src, n); } } else if (0 == strcmp(method, "copy")) { @@ -566,11 +265,6 @@ main(int argc, char **argv) unpremultiply_with_read(dst, src, n); } } - else if (0 == strcmp(method, "sse2")) { - while (nloops-- > 0) { - unpremultiply_with_sse2(dst, src, n); - } - } else if (0 == strcmp(method, "noop")) { /* do nothing. */ } else { |