diff options
author | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-03 04:21:08 +0200 |
---|---|---|
committer | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-03 04:21:08 +0200 |
commit | 109b7cdbd6e617b9f818bbc110c1f6841487eb27 (patch) | |
tree | f824aa868611194d584530aebbd06cff6c1003b3 |
Import sources from oneshots/
-rw-r--r-- | unpremultiply-sse2-float.S | 105 | ||||
-rw-r--r-- | unpremultiply-sse2-old.S | 165 | ||||
-rw-r--r-- | unpremultiply-sse2.S | 110 | ||||
-rw-r--r-- | unpremultiply.c | 549 |
4 files changed, 929 insertions, 0 deletions
diff --git a/unpremultiply-sse2-float.S b/unpremultiply-sse2-float.S new file mode 100644 index 0000000..b8c9182 --- /dev/null +++ b/unpremultiply-sse2-float.S @@ -0,0 +1,105 @@ + section .text + +%macro function 1 + global %1 +%1: +%endmacro + +%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d)) + +; Unpremultiply a pixel in-place with uint32 components in xmm register %1. +; Invariant: +; xmm0: 0 +; xmm6: 255.0f +; xmm7: (?,?,1.0f,?) +; Scratch: xmm5 +%macro unpremultiply1 1 + cvtdq2ps %1, %1 ; uint32 components -> float + rcpss xmm7, %1 ; xmm7: (?,?,1.0,1/a) + mulss xmm7, xmm6 ; xmm7: (?,?,1.0,255/a), xmm6: 255.0 + shufps xmm5, xmm7, SELECT(0,1,0,0) ; xmm5: (255/a,1.0,?,?) + shufps xmm5, xmm7, SELECT(0,0,3,2); xmm5: (255/a,255/a,255/a,1.0) + mulps %1, xmm5 ; %1: (255*r/a,.., 255*b/a, a) + cvtps2dq %1, %1 ; float components -> uint32 +%endmacro + +; Unpremultiply two pixels in-place with uint16 components in xmm register %1. +; Invariant: as above. +; Scratch: xmm4-5 +%macro unpremultiply2 1 + movdqa xmm4, %1 + punpckhwd xmm4, xmm0 + punpcklwd %1, xmm0 + unpremultiply1 xmm4 + unpremultiply1 %1 + packssdw %1, xmm4 +%endmacro + +; Unpremultiply four pixels in-place with uint8 components in xmm register %1. +; Invariant: as above. +; Scratch: xmm3-5 +%macro unpremultiply4 1 + movdqa xmm3, %1 + punpckhbw xmm3, xmm0 + punpcklbw %1, xmm0 + unpremultiply2 xmm3 + unpremultiply2 %1 + packuswb %1, xmm3 +%endmacro + +; Input: +; %1: movdqa or movdqu, depending on the alignment of rsi and rdi. +; r8: number of times N > 0 to loop. +; rsi: uint32_t[4*N]: source pixels. +; rdi: uint32_t[4*N]: destination pixels. +; Invariant: as above. +; Scratch: rsi,rdi,r8-9,xmm2-5 +%macro unpremultiply_loop_with 1 + xor r9,r9 ; index register. + align 16 +%%loop: + prefetchnta [rsi + r9*8 + 16*64] + %1 xmm2, [rsi+r9*8] + unpremultiply4 xmm2 + movntdq [rdi+r9*8], xmm2 + + add r9, 2 + dec r8 + jnz %%loop +%endmacro + +;; void unpremultiply_with_sse2_float(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) +;; +function unpremultiply_with_sse2_float + mov r8, rdx + shr r8, 2 ; TODO: left over pixels. + test r8, r8 + jnz .setup_invariants + ret + +.setup_invariants: + pxor xmm0, xmm0 ; constant zero for unpacking. + + mov rax, 1 + movd xmm7, eax + cvtdq2ps xmm7, xmm7 + shufps xmm7, xmm7, SELECT(1,1,0,1) ; xmm7: (0,0,1.0f,0) + + mov rax, 255 + movd xmm6, eax + cvtdq2ps xmm6, xmm6 + shufps xmm6, xmm6, SELECT(1,1,1,0) ; xmm6: 255f + + ; Decide on whether to use movdqu or movdqa based on source + ; alignment. We always use movntdq to write the dest. + test rsi, 15 + jz .aligned_case + jmp .unaligned_case + +.aligned_case: + unpremultiply_loop_with movdqa + ret + +.unaligned_case: + unpremultiply_loop_with movdqu + ret diff --git a/unpremultiply-sse2-old.S b/unpremultiply-sse2-old.S new file mode 100644 index 0000000..d8c6209 --- /dev/null +++ b/unpremultiply-sse2-old.S @@ -0,0 +1,165 @@ + section .text + +%macro function 1 + global %1 +%1: +%endmacro + +%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d)) + +reciprocal_table: + dd 0 +%assign i 1 +%rep 255 + dd (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point. +%assign i i+1 +%endrep + +; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12. +; The pixels must not be supersaturated. +; Output: xmm1: u8[], Four unpremultiplied pixels. +; Invariant: +; xmm0: 0 +; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) +; Scratch: xmm2, rax, rbx, rcx, rdx +%macro unpremultiply_xmm1_old 0 + ; Expand the 8 bit components into 16 bit ones in + ; two registers. + movdqa xmm2, xmm1 + punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a) + punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a) + + ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. + pextrw eax, xmm1, 0 ; Extract low half reg alphas. + pextrw ecx, xmm2, 0 + pextrw ebx, xmm1, 4 ; Extract high half reg alphas. + pextrw edx, xmm2, 4 + mov eax, [reciprocal_table + 4*eax] ; Fetch 255/alpha + mov ecx, [reciprocal_table + 4*ecx] ; as 8.8 fp numbers. + mov ebx, [reciprocal_table + 4*ebx] + mov edx, [reciprocal_table + 4*edx] + pinsrw xmm3, eax, 1 ; Inject into coefficient regs. + pinsrw xmm4, ecx, 1 + pshuflw xmm3, xmm3, SELECT(1,1,1,0) ; Replicate non-alpha coefs. + pshuflw xmm4, xmm4, SELECT(1,1,1,0) + pinsrw xmm3, ebx, 5 + pinsrw xmm4, edx, 5 + pshufhw xmm3, xmm3, SELECT(1,1,1,0) + pshufhw xmm4, xmm4, SELECT(1,1,1,0) + pmullw xmm1, xmm3 ; Multiply components by coefs. + pmullw xmm2, xmm4 ; to produce 8.8 fp components. + psrlw xmm1, 8 ; Take floor of components. + psrlw xmm2, 8 + + ; Pack the four resulting pixels from 16 to 8 bit components. + packuswb xmm1, xmm2 +%endmacro + +; Reciprocal table with 64 bit entries of a 4x16 vector +; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format. +reciprocal_table_Q: + dq 0 +%assign i 1 +%rep 255 + dw 256 ; unity + dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point. + dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point. + dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point. +%assign i i+1 +%endrep + +; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12. +; The pixels must not be supersaturated. +; Output: xmm1: u8[], Four unpremultiplied pixels. +; Invariant: +; xmm0: 0 +; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) +; Scratch: xmm2, rax, rbx, rcx, rdx +%macro unpremultiply_xmm1 0 + ; Expand the 8 bit components into 16 bit ones in + ; two registers. + movdqa xmm2, xmm1 + punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a) + punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a) + + ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. + pextrw ebx, xmm1, 4 ; Extract pixel alphas into registers. + pextrw edx, xmm2, 4 + pextrw eax, xmm1, 0 + pextrw ecx, xmm2, 0 + movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers + movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs. + movq xmm3, [reciprocal_table_Q + 8*eax] + movq xmm4, [reciprocal_table_Q + 8*ecx] + pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper. + pshufd xmm6, xmm6, SELECT(1,0,3,2) + por xmm3, xmm5 ; Combine 64 bit upper and lower + por xmm4, xmm6 ; into 128 bit multipliers. + pmullw xmm1, xmm3 ; Multiply components by coefs. + pmullw xmm2, xmm4 ; to produce 8.8 fp components. + psrlw xmm1, 8 ; Take floor of components. + psrlw xmm2, 8 + + ; Pack the four resulting pixels from 16 to 8 bit components. + packuswb xmm1, xmm2 +%endmacro + +; Input: +; %1: movdqa or movdqu, depending on the alignment of rsi and rdi. +; r8: number of times N > 0 to loop. +; rsi: uint32_t[4*N]: source pixels. +; rdi: uint32_t[4*N]: destination pixels. +; Invariant: +; xmm0: 0 +; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) +; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9 +%macro unpremultiply_loop_with 1 + xor r9,r9 ; index register. + align 16 +%%loop: + prefetchnta [rsi + r9*8 + 16*64] + %1 xmm1, [rsi+r9*8] + unpremultiply_xmm1 + movntdq [rdi+r9*8], xmm1 + + add r9, 2 + dec r8 + jnz %%loop +%endmacro + +;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) +;; +function unpremultiply_with_sse2 + mov r8, rdx + shr r8, 2 ; TODO: left over pixels. + test r8, r8 + jnz .setup_invariants + ret + +.setup_invariants: + push rbx + pxor xmm0, xmm0 ; constant zero for unpacking. + + ; Setup component multiplier registers xmm3/xmm4 with + ; 8.8 fixed point coefficients. The alpha component always + ; has a coefficient of one. + mov rax, 256*256+256 ; unity in 8.8 fp + movd xmm3, eax + pshufd xmm3, xmm3, SELECT(0,0,0,0) + movdqa xmm4, xmm3 + + ; Decide on whether to use movdqu or movdqa based on source + ; alignment. We always use mvntdq to write the dest. + test rsi, 15 + jz .aligned_case + jmp .unaligned_case + +.aligned_case: + unpremultiply_loop_with movdqa + pop rbx + ret + +.unaligned_case: + unpremultiply_loop_with movdqu + pop rbx + ret diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S new file mode 100644 index 0000000..1b699cc --- /dev/null +++ b/unpremultiply-sse2.S @@ -0,0 +1,110 @@ + section .text + +%define ASHIFT 24 +;%define ASHIFT 0 + +%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d)) + +; Reciprocal table with 64 bit entries of a 4x16 vector +; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format. +reciprocal_table_Q: + dq 0 +%assign i 1 +%rep 255 +%assign recip ((255*256 + i-1) / i) +%if ASHIFT == 0 + dw 256, recip, recip, recip +%elif ASHIFT==24 + dw recip, recip, recip, 256 +%endif +%assign i i+1 +%endrep + +; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12. +; The pixels must not be supersaturated. +; Output: xmm1: u8[], Four unpremultiplied pixels. +; Invariant: +; xmm0: 0 +; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) +; Scratch: xmm2, rax, rbx, rcx, rdx +%macro unpremultiply_xmm1 0 + ; Expand the 8 bit components into 16 bit ones in + ; two registers. + movdqa xmm2, xmm1 + punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a) + punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a) + + ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. + pextrw ebx, xmm1, 4+ASHIFT/8 ; Extract pixel alphas into registers. + pextrw edx, xmm2, 4+ASHIFT/8 + pextrw eax, xmm1, 0+ASHIFT/8 + pextrw ecx, xmm2, 0+ASHIFT/8 + movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers + movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs. + movq xmm3, [reciprocal_table_Q + 8*eax] + movq xmm4, [reciprocal_table_Q + 8*ecx] + pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper. + pshufd xmm6, xmm6, SELECT(1,0,3,2) + por xmm3, xmm5 ; Combine 64 bit upper and lower + por xmm4, xmm6 ; into 128 bit multipliers. + pmullw xmm1, xmm3 ; Multiply components by coefs. + pmullw xmm2, xmm4 ; to produce 8.8 fp components. + psrlw xmm1, 8 ; Take floor of components. + psrlw xmm2, 8 + + ; Pack the four resulting pixels from 16 to 8 bit components. + packuswb xmm1, xmm2 +%endmacro + +; Input: +; %1: movdqa or movdqu, depending on the alignment of rsi and rdi. +; r8: number of times N > 0 to loop. +; rsi: uint32_t[4*N]: source pixels. +; rdi: uint32_t[4*N]: destination pixels. +; Invariant: +; xmm0: 0 +; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) +; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9 +%macro unpremultiply_loop_with 1 + xor r9,r9 ; index register. + align 16 +%%loop: + prefetchnta [rsi + r9*8 + 16*64] + %1 xmm1, [rsi+r9*8] + unpremultiply_xmm1 + movntdq [rdi+r9*8], xmm1 + + add r9, 2 + dec r8 + jnz %%loop +%endmacro + +;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) +;; +global unpremultiply_with_sse2 +unpremultiply_with_sse2: + mov r8, rdx + shr r8, 2 ; TODO: left over pixels. + test r8, r8 + jnz .setup_invariants + ret + +.setup_invariants: + push rbx + pxor xmm0, xmm0 ; constant zero for unpacking. + + ; Decide on whether to use movdqu or movdqa based on source + ; alignment. We always use mvntdq to write the dest. + test rsi, 15 + jz .aligned_case + jmp .unaligned_case + +.aligned_case: + unpremultiply_loop_with movdqa + pop rbx + ret + +.unaligned_case: + unpremultiply_loop_with movdqu + pop rbx + ret diff --git a/unpremultiply.c b/unpremultiply.c new file mode 100644 index 0000000..99932fb --- /dev/null +++ b/unpremultiply.c @@ -0,0 +1,549 @@ +#define RUN_ME /* +nasm -g -f elf64 unpremultiply-sse2.S +nasm -g -f elf64 unpremultiply-sse2-float.S +gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o unpremultiply-sse2-float.o $0 +exit $? +*/ +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if 1 +# define ASHIFT 24 +# define RSHIFT 16 +# define GSHIFT 8 +# define BSHIFT 0 +#else +# define RSHIFT 24 +# define GSHIFT 16 +# define BSHIFT 8 +# define ASHIFT 0 +#endif + +#define AMASK (255 << ASHIFT) +#define RMASK (255 << RSHIFT) +#define GMASK (255 << GSHIFT) +#define BMASK (255 << BSHIFT) + +void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n); + +static void __attribute__((noinline)) +unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + uint32_t prev_in = 0; + uint32_t prev_out = 0; + size_t i; + + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba & AMASK) >> ASHIFT; + if (a == 255) { + dst[i] = rgba; + continue; + } + if (prev_in == rgba) { + dst[i] = prev_out; + continue; + } + if (a) { + uint32_t r = (rgba >> RSHIFT) & 0xFF; + uint32_t g = (rgba >> GSHIFT) & 0xFF; + uint32_t b = (rgba >> BSHIFT) & 0xFF; + r = r*255 / a; + g = g*255 / a; + b = b*255 / a; + assert(r < 256); + assert(g < 256); + assert(b < 256); + assert(a < 256); + prev_in = rgba; + prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); + } else { + prev_in = prev_out = 0; + } + dst[i] = prev_out; + } +} + +static uint8_t division_table[65536]; + +static void __attribute__((noinline)) +unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + size_t i; + uint32_t prev_in = 0; + uint32_t prev_out = 0; + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba >> ASHIFT) & 0xFF; + if (a == 255) { + dst[i] = rgba; + } + else if (prev_in == rgba) { + dst[i] = prev_out; + } + else { + uint32_t r = (rgba >> RSHIFT) & 0xFF; + uint32_t g = (rgba >> GSHIFT) & 0xFF; + uint32_t b = (rgba >> BSHIFT) & 0xFF; + r = division_table[a*256 + r]; + g = division_table[a*256 + g]; + b = division_table[a*256 + b]; + + prev_in = rgba; + prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT); + dst[i] = prev_out; + } + } +} + +#define RECIPROCAL_BITS 8 + +static uint32_t reciprocal_table_A[256]; +static uint64_t reciprocal_table_B[256]; + +#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y)) + +static void __attribute__((noinline)) +unpremultiply_with_inv32(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + size_t i; + uint32_t prev_in = 0; + uint32_t prev_out = 0; + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba >> ASHIFT) & 255; + if (a == 255) { + dst[i] = rgba; + } + else if (prev_in == rgba) { + dst[i] = prev_out; + } + else { + prev_in = rgba; + uint32_t r = (rgba >> RSHIFT) & 255; + uint32_t g = (rgba >> GSHIFT) & 255; + uint32_t b = (rgba >> BSHIFT) & 255; + uint32_t recip = reciprocal_table_A[a]; + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + dst[i] = prev_out; + } + } +} + +#define INNER_UNROLL (2) + +static void __attribute__((noinline)) +unpremultiply_with_inv32_bis(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + for (size_t i = 0; i < n; ) { + uint32_t prev_in, prev_out; + /* 0 iff all inputs are the same */ + unsigned delta = 0; + /* adds to 0 if all inputs are opaque */ + unsigned opaque_count = - INNER_UNROLL*255; + + { + uint32_t rgba = prev_in = src[i]; + uint32_t a = (rgba >> ASHIFT) & 255; + opaque_count += a; + uint32_t r = (rgba >> RSHIFT) & 255; + uint32_t g = (rgba >> GSHIFT) & 255; + uint32_t b = (rgba >> BSHIFT) & 255; + uint32_t recip = reciprocal_table_A[a]; + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i] = prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + } + + /* UNROLL, bitch! */ + for (unsigned j = 1; j < INNER_UNROLL; j++) { + if (i+j>=n) return; + uint32_t rgba = src[i+j]; + delta |= rgba ^ prev_in; + uint32_t a = (rgba >> ASHIFT) & 255; + opaque_count += a; + uint32_t r = (rgba >> RSHIFT) & 255; + uint32_t g = (rgba >> GSHIFT) & 255; + uint32_t b = (rgba >> BSHIFT) & 255; + uint32_t recip = reciprocal_table_A[a]; + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i+j] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + } + + i += INNER_UNROLL; + + /* switch to special case after k of them + * + minimize cost when unapplicable + */ + + if (0 != (delta & opaque_count)) continue; + + if (0 == opaque_count) { + uint32_t in; + while ((255U << ASHIFT) == ((in = src[i]) & (255U << ASHIFT))) { + dst[i++] = in; + if (i >= n) return; + } + } else if (0 == delta) { + while (src[i] == prev_in) { + dst[i++] = prev_out; + if (i >= n) return; + } + } + } +} +#undef INNER_UNROLL + +static void __attribute__((noinline)) +unpremultiply_with_inv32_nocache( + uint32_t * restrict dst, + uint32_t const * restrict src, + size_t n) +{ + size_t i; + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba >> ASHIFT) & 255; + uint32_t r = (rgba >> RSHIFT) & 255; + uint32_t g = (rgba >> GSHIFT) & 255; + uint32_t b = (rgba >> BSHIFT) & 255; + uint32_t recip = reciprocal_table_A[a]; + r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS); + g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS); + b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS); + dst[i] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK); + } +} + +static void __attribute__((noinline)) +unpremultiply_with_inv64(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + size_t i; + uint32_t prev_in = 0; + uint32_t prev_out = 0; + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba >> ASHIFT) & 255; + if (a == 255) { + dst[i] = rgba; + } + else if (prev_in == rgba) { + dst[i] = prev_out; + } + else { + prev_in = rgba; + uint64_t r = rgba & RMASK; + uint64_t g = rgba & GMASK; + uint64_t b = rgba & BMASK; + uint64_t recip = reciprocal_table_B[a]; + r = r * recip; + g = g * recip; + b = b * recip; + prev_out = (rgba & AMASK) | + (((r & ((uint64_t)RMASK << 32)) | + (g & ((uint64_t)GMASK << 32)) | + (b & ((uint64_t)BMASK << 32))) >> 32); + dst[i] = prev_out; + } + } +} + +static void __attribute__((noinline)) +unpremultiply_with_inv64_nocache( + uint32_t * restrict dst, + uint32_t const * restrict src, + size_t n) +{ + size_t i; + for (i=0; i<n; i++) { + uint32_t rgba = src[i]; + uint32_t a = (rgba & AMASK) >> ASHIFT; + uint64_t r = rgba & RMASK; + uint64_t g = rgba & GMASK; + uint64_t b = rgba & BMASK; + uint64_t recip = reciprocal_table_B[a]; + r = r * recip; + g = g * recip; + b = b * recip; + dst[i] = (rgba & AMASK) | + (((r & ((uint64_t)RMASK << 32)) | + (g & ((uint64_t)GMASK << 32)) | + (b & ((uint64_t)BMASK << 32))) >> 32); + } +} + +static void __attribute__((noinline)) +unpremultiply_with_memcpy(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + memcpy(dst, src, 4*n); +} + +static void __attribute__((noinline)) +unpremultiply_with_memset(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + (void)src; + memset(dst, 0, 4*n); +} + +volatile uint32_t read_sum; + +static void __attribute__((noinline)) +unpremultiply_with_read(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) +{ + size_t i; + uint32_t sum = 0; + (void)src; + (void)dst; + for (i=0; i<n; i++) sum += src[i]; + read_sum = sum; +} + +static void +make_division_table() +{ + unsigned a; + for (a=1; a<256; a++) { + unsigned x; + for (x=0; x<256; x++) { + unsigned y = x*255/a; + y = y < 255 ? y : 255; + division_table[a*256 + x] = y; + } + } + for (a=0; a<256; a++) { + division_table[a] = 0; + } +} + +static void +make_reciprocal_table_A() +{ + unsigned a; + reciprocal_table_A[0] = 0; + for (a=1; a<256; a++) { + uint32_t r = 255U*(1<<RECIPROCAL_BITS) / a; + while ((a*r) >> RECIPROCAL_BITS < 255) r++; + reciprocal_table_A[a] = r; + } +} + +static void +make_reciprocal_table_B() +{ + unsigned a; + reciprocal_table_B[0] = 0; + for (a=1; a<256; a++) { + uint64_t r = 255*(1ULL<<32)/a; + while ((a*r) >> 32 < 255) r++; + reciprocal_table_B[a] = r; + } +} + +static void +saturate(uint32_t *buf, size_t n) +{ + size_t i; + for (i=0; i<n; i++) { + uint32_t rgba = buf[i]; + uint32_t a = (rgba >> ASHIFT) & 0xFF; + uint32_t r = (rgba >> RSHIFT) & 0xFF; + uint32_t g = (rgba >> GSHIFT) & 0xFF; + uint32_t b = (rgba >> BSHIFT) & 0xFF; + r = r < a ? r : a; + g = g < a ? g : a; + b = b < a ? b : a; + buf[i] = (a << ASHIFT) | + (r << RSHIFT) | + (g << GSHIFT) | + (b << BSHIFT); + } +} + +static void +fill_random(uint32_t *buf, size_t n) +{ + uint32_t x = 123456789; + size_t i; + for (i=0; i<n; i++) { + x = x*33 + 1; + buf[i] = x; + } +} + +static void +fill_gradient(uint32_t *buf, size_t n) +{ + size_t i; + for (i=0; i<n; i++) { + buf[i] = i*(0x01020301); + } +} + +static void +fill_solid(uint32_t *buf, size_t n) +{ + size_t i; + for (i=0; i<n; i++) { + buf[i] = buf[i] | AMASK; + } +} + +static void +fill_empty(uint32_t *buf, size_t n) +{ + memset(buf, 0, 4*n); +} + +int +main(int argc, char **argv) +{ + long nloops = argc > 1 ? atol(argv[1]) : 1; + size_t n = 2*1024*1024; + uint32_t *dst = calloc(n, 4); + uint32_t *src = calloc(n, 4); + char const *method = "lut"; + int verify = 0; + int i; + + make_division_table(); + make_reciprocal_table_A(); + make_reciprocal_table_B(); + + for (i=2; i<argc; i++) { + if (0 == strcmp(argv[i], "verify")) { + verify = 1; + } + else if (0 == strcmp(argv[i], "random")) { + fill_random (src, n); + } + else if (0 == strcmp(argv[i], "solid")) { + fill_solid (src, n); + } + else if (0 == strcmp(argv[i], "clear")) { + fill_empty (src, n); + } + else if (0 == strcmp(argv[i], "gradient")) { + fill_gradient (src, n); + } + else if (0 == strcmp(argv[i], "div") || + 0 == strcmp(argv[i], "lut") || + 0 == strcmp(argv[i], "inv32") || + 0 == strcmp(argv[i], "inv32-bis") || + 0 == strcmp(argv[i], "inv64") || + 0 == strcmp(argv[i], "inv32-nocache") || + 0 == strcmp(argv[i], "inv64-nocache") || + 0 == strcmp(argv[i], "sse2") || + 0 == strcmp(argv[i], "sse2-float") || + 0 == strcmp(argv[i], "copy") || + 0 == strcmp(argv[i], "read") || + 0 == strcmp(argv[i], "write")) + { + method = argv[i]; + } + else { + fprintf(stderr, "unknown arg %s\n", argv[i]); + return 1; + } + } + saturate(src, n); + + if (0 == strcmp(method, "div")) { + while (nloops-- > 0) { + unpremultiply_with_div(dst, src, n); + } + } + else if (0 == strcmp(method, "lut")) { + while (nloops-- > 0) { + unpremultiply_with_lut(dst, src, n); + } + } + else if (0 == strcmp(method, "inv32")) { + while (nloops-- > 0) { + unpremultiply_with_inv32(dst, src, n); + } + } + else if (0 == strcmp(method, "inv32-bis")) { + while (nloops-- > 0) { + unpremultiply_with_inv32_bis(dst, src, n); + } + } + else if (0 == strcmp(method, "inv64")) { + while (nloops-- > 0) { + unpremultiply_with_inv64(dst, src, n); + } + } + else if (0 == strcmp(method, "inv32-nocache")) { + while (nloops-- > 0) { + unpremultiply_with_inv32_nocache(dst, src, n); + } + } + else if (0 == strcmp(method, "inv64-nocache")) { + while (nloops-- > 0) { + unpremultiply_with_inv64_nocache(dst, src, n); + } + } + else if (0 == strcmp(method, "copy")) { + while (nloops-- > 0) { + unpremultiply_with_memcpy(dst, src, n); + } + } + else if (0 == strcmp(method, "write")) { + while (nloops-- > 0) { + unpremultiply_with_memset(dst, src, n); + } + } + else if (0 == strcmp(method, "read")) { + while (nloops-- > 0) { + unpremultiply_with_read(dst, src, n); + } + } + else if (0 == strcmp(method, "sse2")) { + while (nloops-- > 0) { + unpremultiply_with_sse2(dst, src, n); + } + } + else if (0 == strcmp(method, "sse2-float")) { + while (nloops-- > 0) { + unpremultiply_with_sse2_float(dst, src, n); + } + } + else { + fprintf(stderr, "unknown method %s\n", method); + return 1; + } + + if (verify) { + uint32_t *ref = malloc(n*4); + size_t i; + int maxdiff = 0; + unpremultiply_with_div(ref, src, n); + for (i=0; i<n; i++) { + uint32_t x = dst[i]; + uint32_t y = ref[i]; + int j; + for (j=0; j<4; j++){ + int diff = (x & 255) - (y & 255); + if (diff < 0) diff = -diff; + if (diff > maxdiff) { + printf("src:%08x dst:%08x ref:%08x diff:%d, i:%ld.%d\n", + src[i], dst[i], ref[i], diff, i,j); + maxdiff = diff; + } + x >>= 8; + y >>= 8; + } + } + } + + return 0; +} |