diff options
author | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-14 18:12:55 +0200 |
---|---|---|
committer | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-14 18:12:55 +0200 |
commit | e4058babc81f509082da126c1052b1ce389f4e3b (patch) | |
tree | d90edd5493b5fa7e443dbaa3508918df7dba9c9a | |
parent | b264fbcbe6f12d9f25a57f13f3747e2a51b2c2d9 (diff) |
Separate special cased unpremultipliers from just the basic method.
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | unpremultiply-div.c | 22 | ||||
-rw-r--r-- | unpremultiply-inv32.c | 102 | ||||
-rw-r--r-- | unpremultiply-inv32b.c | 138 | ||||
-rw-r--r-- | unpremultiply-lut.c | 45 | ||||
-rw-r--r-- | unpremultiply-lutb.c | 120 | ||||
-rw-r--r-- | unpremultiply-sse2.S | 8 | ||||
-rw-r--r-- | unpremultiply.c | 17 |
8 files changed, 322 insertions, 131 deletions
@@ -1,2 +1,3 @@ *.o *~ +unpremultiply diff --git a/unpremultiply-div.c b/unpremultiply-div.c index 514aa28..543149c 100644 --- a/unpremultiply-div.c +++ b/unpremultiply-div.c @@ -24,33 +24,21 @@ unpremultiply_with_div( uint32_t const * restrict src, size_t n) { - uint32_t prev_in = 0; - uint32_t prev_out = 0; size_t i; - + for (i=0; i<n; i++) { uint32_t rgba = src[i]; - uint32_t a = (rgba & AMASK) >> ASHIFT; - if (a == 255) { - dst[i] = rgba; - continue; - } - if (prev_in == rgba) { - dst[i] = prev_out; - continue; - } - if (a) { + if (rgba & AMASK) { + uint32_t a = (rgba >> ASHIFT) & 0xFF; uint32_t r = (rgba >> RSHIFT) & 0xFF; uint32_t g = (rgba >> GSHIFT) & 0xFF; uint32_t b = (rgba >> BSHIFT) & 0xFF; r = r*255 / a; g = g*255 / a; b = b*255 / a; - prev_in = rgba; - prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); + dst[i] = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a << ASHIFT); } else { - prev_in = prev_out = 0; + dst[i] = 0; } - dst[i] = prev_out; } } diff --git a/unpremultiply-inv32.c b/unpremultiply-inv32.c index e9505de..fbb5572 100644 --- a/unpremultiply-inv32.c +++ b/unpremultiply-inv32.c @@ -1,7 +1,5 @@ -/* An unpremultiplier using reciprocal multiplication. It specialises - * constant runs and solid runs of pixels with low overhead loops and - * uses only a 1KB table of reciprocals. */ -/* gcc -c -W -Wall -O3 -funroll-all-loops -fomit-frame-pointer -std=c99 unpremultiply-inv32.c */ +/* Basic unpremultiplier using reciprocal multiplication from a 1KB + * table of reciprocals. */ #include <stdint.h> #include <stddef.h> @@ -19,6 +17,9 @@ #define GMASK (255U << GSHIFT) #define BMASK (255U << BSHIFT) +/* Set to 1 if the input can have superluminant pixels. */ +#define DO_CLAMP_INPUT 0 + /* Shift x left by y bits. Supports negative y for right shifts. */ #define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y)) @@ -40,86 +41,29 @@ static uint32_t const reciprocal_table[256] = { R3(0), R3(64), R3(128), R3(192) }; -/* Transfer num_pixels unpremultiplied pixels from src[] to dst[]. - * This version uses a short probe period of a few pixels to identify - * runs of constant or solid pixels. When a run is identified it - * falls into a special case loop for the duration of the run. */ void unpremultiply_with_inv32( uint32_t * restrict dst, uint32_t const * restrict src, size_t num_pixels) { -#define PROBE_LENGTH 2 - - for (size_t i = 0; i < num_pixels; ) { - /* We want to identify long runs of constant input pixels and - * cache the unpremultiplied. */ - uint32_t const_in, const_out; - - /* Diff is the or of all bitwise differences from const_in - * during the probe period. If it is zero after the probe - * period then every input pixel was identical in the - * probe. */ - unsigned diff = 0; - - /* Accumulator for all alphas of the probe period pixels, - * biased to make the sum zero if the */ - unsigned accu = -PROBE_LENGTH*255; - - unsigned j; - - /* The first iteration of the probe period initialises - * const_in. */ - { - uint32_t rgba = const_in = src[i]; - uint32_t a = (rgba >> ASHIFT) & 255; - accu += a; - uint32_t r = (rgba >> RSHIFT) & 255; - uint32_t g = (rgba >> GSHIFT) & 255; - uint32_t b = (rgba >> BSHIFT) & 255; - uint32_t recip = reciprocal_table[a]; - r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); - g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); - b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); - dst[i] = const_out = - (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); - } - - for (j = 1; j < PROBE_LENGTH; j++) { - if (i + j >= num_pixels) - return; - uint32_t rgba = src[i+j]; - diff |= rgba ^ const_in; - uint32_t a = (rgba >> ASHIFT) & 255; - accu += a; - uint32_t r = (rgba >> RSHIFT) & 255; - uint32_t g = (rgba >> GSHIFT) & 255; - uint32_t b = (rgba >> BSHIFT) & 255; - uint32_t recip = reciprocal_table[a]; - r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); - g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); - b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); - dst[i+j] = - (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); - } - i += PROBE_LENGTH; - - /* Fall into special cases if we have special - * circumstances. */ - if (0 != (accu & diff)) continue; - - if (0 == accu) { /* a run of solid pixels. */ - uint32_t in; - while (AMASK == ((in = src[i]) & AMASK)) { - dst[i++] = in; - if (i >= num_pixels) return; - } - } else if (0 == diff) { /* a run of constant pixels. */ - while (src[i] == const_in) { - dst[i++] = const_out; - if (i >= num_pixels) return; - } - } + size_t i; + for (i = 0; i < num_pixels; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba >> ASHIFT) & 255; + uint32_t r = (rgba >> RSHIFT) & 255; + uint32_t g = (rgba >> GSHIFT) & 255; + uint32_t b = (rgba >> BSHIFT) & 255; + uint32_t recip = reciprocal_table[a]; +#if DO_CLAMP_INPUT + r = r < a ? r : a; + g = g < a ? g : a; + b = b < a ? b : a; +#endif + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i] = + (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); } } diff --git a/unpremultiply-inv32b.c b/unpremultiply-inv32b.c new file mode 100644 index 0000000..4735a78 --- /dev/null +++ b/unpremultiply-inv32b.c @@ -0,0 +1,138 @@ +/* An unpremultiplier using reciprocal multiplication. It specialises + * constant runs and solid runs of pixels with low overhead loops and + * uses only a 1KB table of reciprocals. */ +/* gcc -c -W -Wall -O3 -funroll-all-loops -fomit-frame-pointer -std=c99 unpremultiply-inv32.c */ +#include <stdint.h> +#include <stddef.h> + +/* Pixel format config for a 32 bit pixel with 8 bit components. Only + * the location of alpha matters. */ +#ifndef ASHIFT +# define ASHIFT 24 +#endif +#define RSHIFT ((24 + ASHIFT) % 32) +#define GSHIFT ((16 + ASHIFT) % 32) +#define BSHIFT (( 8 + ASHIFT) % 32) + +#define AMASK (255U << ASHIFT) +#define RMASK (255U << RSHIFT) +#define GMASK (255U << GSHIFT) +#define BMASK (255U << BSHIFT) + +/* Set to 1 if the input can have superluminant pixels. */ +#define DO_CLAMP_INPUT 0 + +/* Shift x left by y bits. Supports negative y for right shifts. */ +#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y)) + +#define ceil_div(a,b) ((a) + (b)-1) / (b) + +/* The reciprocal_table[i] entries are defined by + * + * 0 when i = 0 + * 255 / i when i > 0 + * + * represented in fixed point format with RECIPROCAL_BITS of + * precision and errors rounded up. */ +#define RECIPROCAL_BITS 16 +static uint32_t const reciprocal_table[256] = { +# define R(i) ((i) ? ceil_div(255*(1<<RECIPROCAL_BITS), (i)) : 0) +# define R1(i) R(i), R(i+1), R(i+2), R(i+3) +# define R2(i) R1(i), R1(i+4), R1(i+8), R1(i+12) +# define R3(i) R2(i), R2(i+16), R2(i+32), R2(i+48) + R3(0), R3(64), R3(128), R3(192) +}; + +/* Transfer num_pixels unpremultiplied pixels from src[] to dst[]. + * This version uses a short probe period of a few pixels to identify + * runs of constant or solid pixels. When a run is identified it + * falls into a special case loop for the duration of the run. */ +void +unpremultiply_with_inv32b( + uint32_t * restrict dst, + uint32_t const * restrict src, + size_t num_pixels) +{ + size_t i = 0; + while (i < num_pixels) { + /* We want to identify long runs of constant input pixels and + * cache the unpremultiplied. */ + uint32_t const_in, const_out; + + /* Diff is the or of all bitwise differences from const_in + * during the probe period. If it is zero after the probe + * period then every input pixel was identical in the + * probe. */ + unsigned diff = 0; + + /* Accumulator for all alphas of the probe period pixels, + * biased to make the sum zero if the */ + unsigned accu = -2*255; + + { + uint32_t rgba, a, r, g, b, recip; + rgba = const_in = src[i]; + a = (rgba >> ASHIFT) & 255; + accu += a; + r = (rgba >> RSHIFT) & 255; + g = (rgba >> GSHIFT) & 255; + b = (rgba >> BSHIFT) & 255; + recip = reciprocal_table[a]; +#if DO_CLAMP_INPUT + r = r < a ? r : a; + g = g < a ? g : a; + b = b < a ? b : a; +#endif + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i] = const_out = + (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + } + + if (i + 1 == num_pixels) + return; + + { + uint32_t rgba, a, r, g, b, recip; + rgba = src[i+1]; + a = (rgba >> ASHIFT) & 255; + accu += a; + r = (rgba >> RSHIFT) & 255; + g = (rgba >> GSHIFT) & 255; + b = (rgba >> BSHIFT) & 255; + recip = reciprocal_table[a]; +#if DO_CLAMP_INPUT + r = r < a ? r : a; + g = g < a ? g : a; + b = b < a ? b : a; +#endif + diff = rgba ^ const_in; + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i+1] = + (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + } + + i += 2; + + /* Fall into special cases if we have special + * circumstances. */ + if (0 != (accu & diff)) + continue; + + if (0 == accu) { /* a run of solid pixels. */ + uint32_t in; + while (AMASK == ((in = src[i]) & AMASK)) { + dst[i++] = in; + if (i == num_pixels) return; + } + } else if (0 == diff) { /* a run of constant pixels. */ + while (src[i] == const_in) { + dst[i++] = const_out; + if (i == num_pixels) return; + } + } + } +} diff --git a/unpremultiply-lut.c b/unpremultiply-lut.c index 048eefa..4954f51 100644 --- a/unpremultiply-lut.c +++ b/unpremultiply-lut.c @@ -1,10 +1,7 @@ -/* An unpremultiplier using a 64KB division table. It specialises - * constant runs and solid runs of pixels, but not quite as well as - * unpremultiply-inv32 (but it could given some more effort.) It has - * the advantage over the others that it can clamp the result into - * range without any cost so that the output doesn't overflow should - * there be superluminant pixels in the input. It's also reasonably - * fast if you have enough L1. */ +/* Basic unpremultiplier using a 64KB division table. It has the + * advantage over the others that it can clamp the result into range + * without any cost, so it is safe to pass it superluminant input + * pixels. */ #include <stddef.h> #include <stdint.h> @@ -17,10 +14,10 @@ #define GSHIFT ((16 + ASHIFT) % 32) #define BSHIFT (( 8 + ASHIFT) % 32) -#define AMASK (255 << ASHIFT) -#define RMASK (255 << RSHIFT) -#define GMASK (255 << GSHIFT) -#define BMASK (255 << BSHIFT) +#define AMASK (255U << ASHIFT) +#define RMASK (255U << RSHIFT) +#define GMASK (255U << GSHIFT) +#define BMASK (255U << BSHIFT) /* The function to tabulate in the lookup table. Could be anything * really. */ @@ -53,27 +50,15 @@ void unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) { size_t i; - uint32_t prev_in = 0; - uint32_t prev_out = 0; for (i=0; i<n; i++) { uint32_t rgba = src[i]; uint32_t a = (rgba >> ASHIFT) & 0xFF; - if (a == 255) { - dst[i] = rgba; - } - else if (prev_in == rgba) { - dst[i] = prev_out; - } - else { - uint32_t r = (rgba >> RSHIFT) & 0xFF; - uint32_t g = (rgba >> GSHIFT) & 0xFF; - uint32_t b = (rgba >> BSHIFT) & 0xFF; - r = division_table[a][r]; - g = division_table[a][g]; - b = division_table[a][b]; - prev_in = rgba; - prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); - dst[i] = prev_out; - } + uint32_t r = (rgba >> RSHIFT) & 0xFF; + uint32_t g = (rgba >> GSHIFT) & 0xFF; + uint32_t b = (rgba >> BSHIFT) & 0xFF; + r = division_table[a][r]; + g = division_table[a][g]; + b = division_table[a][b]; + dst[i] = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a << ASHIFT); } } diff --git a/unpremultiply-lutb.c b/unpremultiply-lutb.c new file mode 100644 index 0000000..bdbe039 --- /dev/null +++ b/unpremultiply-lutb.c @@ -0,0 +1,120 @@ +/* An unpremultiplier using a 64KB division table. It specialises + * constant runs and solid runs of pixels, but not quite as well as + * unpremultiply-inv32 (but it could given some more effort.) It has + * the advantage over the others that it can clamp the result into + * range without any cost so that the output doesn't overflow should + * there be superluminant pixels in the input. It's also reasonably + * fast if you have enough L1. */ +#include <stddef.h> +#include <stdint.h> + +/* Pixel format config for a 32 bit pixel with 8 bit components. Only + * the location of alpha matters. */ +#ifndef ASHIFT +# define ASHIFT 24 +#endif +#define RSHIFT ((24 + ASHIFT) % 32) +#define GSHIFT ((16 + ASHIFT) % 32) +#define BSHIFT (( 8 + ASHIFT) % 32) + +#define AMASK (255U << ASHIFT) +#define RMASK (255U << RSHIFT) +#define GMASK (255U << GSHIFT) +#define BMASK (255U << BSHIFT) + +/* The function to tabulate in the lookup table. Could be anything + * really. */ +#define div_func(a,b) (255*(a) / (b)) + +#define clamp(x) ((x) > 255 ? 255 : (x)) + +/* The entries division_table[a][b] are defined by + * + * 0 when b = 0 + * min(255, ceil(255*a / b)) when b > 0 + * + */ +static uint8_t const division_table[256][256] = { + +#define R(a,b) ((b) > 0 ? clamp(div_func((a),(b))) : 0) +#define R1(a,i) R(a, i), R(a+1, i), R(a+2, i), R(a+3, i) +#define R2(a,i) R1(a, i), R1(a+4, i), R1(a+8, i), R1(a+12, i) +#define R3(a,i) R2(a, i), R2(a+16, i), R2(a+32, i), R2(a+48, i) + +#define S(b) { R3(0, b), R3(64, b), R3(128, b), R3(192, b) } +#define S1(b) S(b), S(b+1), S(b+2), S(b+3) +#define S2(b) S1(b), S1(b+4), S1(b+8), S1(b+12) +#define S3(b) S2(b), S2(b+16), S2(b+32), S2(b+48) + + S3(0), S3(64), S3(128), S3(192) +}; + +void +unpremultiply_with_lutb( + uint32_t * restrict dst, + uint32_t const * restrict src, + size_t num_pixels) +{ + size_t i = 0; + while (i < num_pixels) { + uint32_t const_in, const_out; + uint32_t accu = -2*255; + uint32_t diff; + + { + uint32_t rgba, r, g, b, a; + rgba = const_in = src[i]; + a = (rgba >> ASHIFT) & 0xFF; + accu = a; + r = (rgba >> RSHIFT) & 0xFF; + g = (rgba >> GSHIFT) & 0xFF; + b = (rgba >> BSHIFT) & 0xFF; + r = division_table[a][r]; + g = division_table[a][g]; + b = division_table[a][b]; + dst[i] = const_out = + (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); + } + + if (i+1 >= num_pixels) + return; + + { + uint32_t rgba, r, g, b, a; + rgba = src[i+1]; + a = (rgba >> ASHIFT) & 0xFF; + accu += a; + r = (rgba >> RSHIFT) & 0xFF; + g = (rgba >> GSHIFT) & 0xFF; + b = (rgba >> BSHIFT) & 0xFF; + diff = const_in ^ rgba; + r = division_table[a][r]; + g = division_table[a][g]; + b = division_table[a][b]; + dst[i+1] = + (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); + } + + accu -= 2*255; + i += 2; + + if (0 != (diff & accu)) + continue; + + if (0 == accu) { + uint32_t in; + while (AMASK == ((in = src[i]) & AMASK)) { + dst[i++] = in; + if (i == num_pixels) + return; + } + } + else if (0 == diff) { + while (src[i] == const_in) { + dst[i++] = const_out; + if (i == num_pixels) + return; + } + } + } +} diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S index 1ffbf5f..9596876 100644 --- a/unpremultiply-sse2.S +++ b/unpremultiply-sse2.S @@ -84,7 +84,7 @@ unpremultiply_single_pixels: mov ebx, eax mov ebp, eax ; Initialise result pixel register. and ebp, 0xFF000000 ; Mask off non-alpha from result pix. - jz .next +; jz .next shr ebx, 24 ; Load alpha. mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. @@ -263,7 +263,7 @@ unpremultiply_with_sse2: ; If we don't have enough pixels for at least a few iterations ; of blocked unpremultiplication then do the pixels one at a time. cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks. - jae .do_blocked +; jae .do_blocked mov rcx, rdx ; Pixel count. call unpremultiply_single_pixels jmp .out @@ -289,8 +289,8 @@ unpremultiply_with_sse2: ; the image is fairly small then use movdqa writes. cmp rdi, rsi ; Use movdqa for aliased src, dst. jz .1 - cmp rdx, 8192 ; ... or if the src and dest are small. - jc .1 +; cmp rdx, 128 ; ... or if the src and dest are small. +; jc .1 unpremultiply_pixel_blocks movntdq jmp .do_leftovers .1: diff --git a/unpremultiply.c b/unpremultiply.c index 201a59c..322367b 100644 --- a/unpremultiply.c +++ b/unpremultiply.c @@ -1,6 +1,7 @@ #define RUN_ME /* nasm -g -f elf64 unpremultiply-sse2.S -CFLAGS="-W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g" +CFLAGS="-W -Wall -Wextra -std=c99 -O2 -g" +CFLAGS="$CFLAGS -O3 -fomit-frame-pointer -funroll-all-loops" gcc $CFLAGS -c *.c gcc $CFLAGS -o `basename $0 .c` *.o exit $? @@ -31,7 +32,9 @@ exit $? /* The methods we have available. */ void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n); void unpremultiply_with_inv32(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_inv32b(uint32_t *dst, uint32_t const *src, size_t n); void unpremultiply_with_lut(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_lutb(uint32_t *dst, uint32_t const *src, size_t n); void unpremultiply_with_div(uint32_t *dst, uint32_t const *src, size_t n); /* @@ -208,7 +211,9 @@ main(int argc, char **argv) } else if (0 == strcmp(argv[i], "div") || 0 == strcmp(argv[i], "lut") || + 0 == strcmp(argv[i], "lutb") || 0 == strcmp(argv[i], "inv32") || + 0 == strcmp(argv[i], "inv32b") || 0 == strcmp(argv[i], "sse2") || 0 == strcmp(argv[i], "copy") || 0 == strcmp(argv[i], "read") || @@ -240,11 +245,21 @@ main(int argc, char **argv) unpremultiply_with_lut(dst, src, n); } } + else if (0 == strcmp(method, "lutb")) { + while (nloops-- > 0) { + unpremultiply_with_lutb(dst, src, n); + } + } else if (0 == strcmp(method, "inv32")) { while (nloops-- > 0) { unpremultiply_with_inv32(dst, src, n); } } + else if (0 == strcmp(method, "inv32b")) { + while (nloops-- > 0) { + unpremultiply_with_inv32b(dst, src, n); + } + } else if (0 == strcmp(method, "sse2")) { while (nloops-- > 0) { unpremultiply_with_sse2(dst, src, n); |