author	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-03 04:21:08 +0200
committer	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-03 04:21:08 +0200
commit	109b7cdbd6e617b9f818bbc110c1f6841487eb27 (patch)
tree	f824aa868611194d584530aebbd06cff6c1003b3
Import sources from oneshots/
-rw-r--r--	unpremultiply-sse2-float.S	105
-rw-r--r--	unpremultiply-sse2-old.S	165
-rw-r--r--	unpremultiply-sse2.S	110
-rw-r--r--	unpremultiply.c	549
4 files changed, 929 insertions, 0 deletions
diff --git a/unpremultiply-sse2-float.S b/unpremultiply-sse2-float.S
new file mode 100644
index 0000000..b8c9182
--- /dev/null
+++ b/unpremultiply-sse2-float.S
@@ -0,0 +1,105 @@
+ section .text
+
+%macro function 1
+ global %1
+%1:
+%endmacro
+
+%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
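+; Illustrative note: SELECT packs four 2-bit lane indices (highest lane
+; first) into a shufps/pshufd immediate. For instance SELECT(0,0,3,2) =
+; 0*64 + 0*16 + 3*4 + 2 = 0x0e; with shufps it moves dst lane 2 into
+; result lane 0, dst lane 3 into lane 1, and src lane 0 into lanes 2-3.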
+
+; Unpremultiply a pixel in-place with uint32 components in xmm register %1.
+; Invariant:
+; xmm0: 0
+; xmm6: 255.0f
+; xmm7: (?,?,1.0f,?)
+; Scratch: xmm5
+%macro unpremultiply1 1
+ cvtdq2ps %1, %1 ; uint32 components -> float
+ rcpss xmm7, %1 ; xmm7: (?,?,1.0,1/a)
+ mulss xmm7, xmm6 ; xmm7: (?,?,1.0,255/a), xmm6: 255.0
+ shufps xmm5, xmm7, SELECT(0,1,0,0) ; xmm5: (255/a,1.0,?,?)
+ shufps xmm5, xmm7, SELECT(0,0,3,2); xmm5: (255/a,255/a,255/a,1.0)
+ mulps %1, xmm5 ; %1: (255*r/a,.., 255*b/a, a)
+ cvtps2dq %1, %1 ; float components -> uint32
+%endmacro
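+
+; Rough worked example of the macro above: for a pixel with a=128 and
+; r=64, rcpss yields approximately 1/128, mulss scales that to roughly
+; 255/128 ~= 1.99, and mulps gives 64*1.99 ~= 127.5 before cvtps2dq
+; rounds back to an integer. rcpss is only accurate to roughly 12 bits,
+; so the rounded result can differ by one from the exact 255*c/a.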
+
+; Unpremultiply two pixels in-place with uint16 components in xmm register %1.
+; Invariant: as above.
+; Scratch: xmm4-5
+%macro unpremultiply2 1
+ movdqa xmm4, %1
+ punpckhwd xmm4, xmm0
+ punpcklwd %1, xmm0
+ unpremultiply1 xmm4
+ unpremultiply1 %1
+ packssdw %1, xmm4
+%endmacro
+
+; Unpremultiply four pixels in-place with uint8 components in xmm register %1.
+; Invariant: as above.
+; Scratch: xmm3-5
+%macro unpremultiply4 1
+ movdqa xmm3, %1
+ punpckhbw xmm3, xmm0
+ punpcklbw %1, xmm0
+ unpremultiply2 xmm3
+ unpremultiply2 %1
+ packuswb %1, xmm3
+%endmacro
+
+; Input:
+; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
+; r8: number of times N > 0 to loop.
+; rsi: uint32_t[4*N]: source pixels.
+; rdi: uint32_t[4*N]: destination pixels.
+; Invariant: as above.
+; Scratch: rsi,rdi,r8-9,xmm2-5
+%macro unpremultiply_loop_with 1
+ xor r9,r9 ; index register.
+ align 16
+%%loop:
+ prefetchnta [rsi + r9*8 + 16*64]
+ %1 xmm2, [rsi+r9*8]
+ unpremultiply4 xmm2
+ movntdq [rdi+r9*8], xmm2
+
+ add r9, 2
+ dec r8
+ jnz %%loop
+%endmacro
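+
+; In the loop macro above, prefetchnta pulls the source roughly 1 KiB
+; (16*64 bytes) ahead through the non-temporal path, and movntdq streams
+; the result to the destination without polluting the cache; each
+; iteration handles one 16-byte chunk of four pixels.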
+
+;; void unpremultiply_with_sse2_float(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;;
+function unpremultiply_with_sse2_float
+ mov r8, rdx
+ shr r8, 2 ; TODO: handle leftover pixels.
+ test r8, r8
+ jnz .setup_invariants
+ ret
+
+.setup_invariants:
+ pxor xmm0, xmm0 ; constant zero for unpacking.
+
+ mov rax, 1
+ movd xmm7, eax
+ cvtdq2ps xmm7, xmm7
+ shufps xmm7, xmm7, SELECT(1,1,0,1) ; xmm7: (0,0,1.0f,0)
+
+ mov rax, 255
+ movd xmm6, eax
+ cvtdq2ps xmm6, xmm6
+ shufps xmm6, xmm6, SELECT(1,1,1,0) ; xmm6: 255f
+
+ ; Decide on whether to use movdqu or movdqa based on source
+ ; alignment. We always use movntdq to write the dest.
+ test rsi, 15
+ jz .aligned_case
+ jmp .unaligned_case
+
+.aligned_case:
+ unpremultiply_loop_with movdqa
+ ret
+
+.unaligned_case:
+ unpremultiply_loop_with movdqu
+ ret
diff --git a/unpremultiply-sse2-old.S b/unpremultiply-sse2-old.S
new file mode 100644
index 0000000..d8c6209
--- /dev/null
+++ b/unpremultiply-sse2-old.S
@@ -0,0 +1,165 @@
+ section .text
+
+%macro function 1
+ global %1
+%1:
+%endmacro
+
+%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
+
+reciprocal_table:
+ dd 0
+%assign i 1
+%rep 255
+ dd (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+%assign i i+1
+%endrep
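+
+; Worked example of a table entry: for i = 7 the entry is
+; (255*256 + 6)/7 = 9326 = ceil(255*256/7). Multiplying a 16-bit
+; component c by it and shifting right by 8 gives floor(c*9326/256),
+; a close approximation of c*255/7.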
+
+; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
+; The pixels must not be supersaturated.
+; Output: xmm1: u8[], Four unpremultiplied pixels.
+; Invariant:
+; xmm0: 0
+; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
+; Scratch: xmm2, rax, rbx, rcx, rdx
+%macro unpremultiply_xmm1_old 0
+ ; Expand the 8 bit components into 16 bit ones in
+ ; two registers.
+ movdqa xmm2, xmm1
+ punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
+ punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
+
+ ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
+ pextrw eax, xmm1, 0 ; Extract low half reg alphas.
+ pextrw ecx, xmm2, 0
+ pextrw ebx, xmm1, 4 ; Extract high half reg alphas.
+ pextrw edx, xmm2, 4
+ mov eax, [reciprocal_table + 4*eax] ; Fetch 255/alpha
+ mov ecx, [reciprocal_table + 4*ecx] ; as 8.8 fp numbers.
+ mov ebx, [reciprocal_table + 4*ebx]
+ mov edx, [reciprocal_table + 4*edx]
+ pinsrw xmm3, eax, 1 ; Inject into coefficient regs.
+ pinsrw xmm4, ecx, 1
+ pshuflw xmm3, xmm3, SELECT(1,1,1,0) ; Replicate non-alpha coefs.
+ pshuflw xmm4, xmm4, SELECT(1,1,1,0)
+ pinsrw xmm3, ebx, 5
+ pinsrw xmm4, edx, 5
+ pshufhw xmm3, xmm3, SELECT(1,1,1,0)
+ pshufhw xmm4, xmm4, SELECT(1,1,1,0)
+ pmullw xmm1, xmm3 ; Multiply components by coefs.
+ pmullw xmm2, xmm4 ; to produce 8.8 fp components.
+ psrlw xmm1, 8 ; Take floor of components.
+ psrlw xmm2, 8
+
+ ; Pack the four resulting pixels from 16 to 8 bit components.
+ packuswb xmm1, xmm2
+%endmacro
+
+; Reciprocal table with 64 bit entries of a 4x16 vector
+; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
+reciprocal_table_Q:
+ dq 0
+%assign i 1
+%rep 255
+ dw 256 ; unity
+ dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+ dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+ dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+%assign i i+1
+%endrep
+
+; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
+; The pixels must not be supersaturated.
+; Output: xmm1: u8[], Four unpremultiplied pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm2-6, rax, rbx, rcx, rdx
+%macro unpremultiply_xmm1 0
+ ; Expand the 8 bit components into 16 bit ones in
+ ; two registers.
+ movdqa xmm2, xmm1
+ punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
+ punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
+
+ ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
+ pextrw ebx, xmm1, 4 ; Extract pixel alphas into registers.
+ pextrw edx, xmm2, 4
+ pextrw eax, xmm1, 0
+ pextrw ecx, xmm2, 0
+ movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
+ movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
+ movq xmm3, [reciprocal_table_Q + 8*eax]
+ movq xmm4, [reciprocal_table_Q + 8*ecx]
+ pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper.
+ pshufd xmm6, xmm6, SELECT(1,0,3,2)
+ por xmm3, xmm5 ; Combine 64 bit upper and lower
+ por xmm4, xmm6 ; into 128 bit multipliers.
+ pmullw xmm1, xmm3 ; Multiply components by coefs.
+ pmullw xmm2, xmm4 ; to produce 8.8 fp components.
+ psrlw xmm1, 8 ; Take floor of components.
+ psrlw xmm2, 8
+
+ ; Pack the four resulting pixels from 16 to 8 bit components.
+ packuswb xmm1, xmm2
+%endmacro
+
+; Input:
+; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
+; r8: number of times N > 0 to loop.
+; rsi: uint32_t[4*N]: source pixels.
+; rdi: uint32_t[4*N]: destination pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm1-6, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
+%macro unpremultiply_loop_with 1
+ xor r9,r9 ; index register.
+ align 16
+%%loop:
+ prefetchnta [rsi + r9*8 + 16*64]
+ %1 xmm1, [rsi+r9*8]
+ unpremultiply_xmm1
+ movntdq [rdi+r9*8], xmm1
+
+ add r9, 2
+ dec r8
+ jnz %%loop
+%endmacro
+
+;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;;
+function unpremultiply_with_sse2
+ mov r8, rdx
+ shr r8, 2 ; TODO: handle leftover pixels.
+ test r8, r8
+ jnz .setup_invariants
+ ret
+
+.setup_invariants:
+ push rbx
+ pxor xmm0, xmm0 ; constant zero for unpacking.
+
+ ; Setup component multiplier registers xmm3/xmm4 with
+ ; 8.8 fixed point coefficients. The alpha component always
+ ; has a coefficient of one.
+ mov rax, 256*256+256 ; unity in 8.8 fp
+ movd xmm3, eax
+ pshufd xmm3, xmm3, SELECT(0,0,0,0)
+ movdqa xmm4, xmm3
+
+ ; Decide on whether to use movdqu or movdqa based on source
+ ; alignment. We always use movntdq to write the dest.
+ test rsi, 15
+ jz .aligned_case
+ jmp .unaligned_case
+
+.aligned_case:
+ unpremultiply_loop_with movdqa
+ pop rbx
+ ret
+
+.unaligned_case:
+ unpremultiply_loop_with movdqu
+ pop rbx
+ ret
diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S
new file mode 100644
index 0000000..1b699cc
--- /dev/null
+++ b/unpremultiply-sse2.S
@@ -0,0 +1,110 @@
+ section .text
+
+%define ASHIFT 24
+;%define ASHIFT 0
+
+%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
+
+; Reciprocal table with 64 bit entries of a 4x16 vector
+; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
+reciprocal_table_Q:
+ dq 0
+%assign i 1
+%rep 255
+%assign recip ((255*256 + i-1) / i)
+%if ASHIFT == 0
+ dw 256, recip, recip, recip
+%elif ASHIFT==24
+ dw recip, recip, recip, 256
+%endif
+%assign i i+1
+%endrep
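+
+; Illustrative layout: with ASHIFT=24 the entry for i=2 has the four
+; words, in memory order, 32640, 32640, 32640, 256, i.e. ceil(255*256/2)
+; for the colour channels and 256 (unity in 8.8) in the alpha slot, so
+; the pmullw/psrlw 8 pair leaves alpha unchanged.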
+
+; Input: xmm1: u8[], Four pixels with the alpha byte at offset ASHIFT/8
+; within each pixel. The pixels must not be supersaturated.
+; Output: xmm1: u8[], Four unpremultiplied pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm2-6, rax, rbx, rcx, rdx
+%macro unpremultiply_xmm1 0
+ ; Expand the 8 bit components into 16 bit ones in
+ ; two registers.
+ movdqa xmm2, xmm1
+ punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
+ punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
+
+ ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
+ pextrw ebx, xmm1, 4+ASHIFT/8 ; Extract pixel alphas into registers.
+ pextrw edx, xmm2, 4+ASHIFT/8
+ pextrw eax, xmm1, 0+ASHIFT/8
+ pextrw ecx, xmm2, 0+ASHIFT/8
+ movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
+ movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
+ movq xmm3, [reciprocal_table_Q + 8*eax]
+ movq xmm4, [reciprocal_table_Q + 8*ecx]
+ pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper.
+ pshufd xmm6, xmm6, SELECT(1,0,3,2)
+ por xmm3, xmm5 ; Combine 64 bit upper and lower
+ por xmm4, xmm6 ; into 128 bit multipliers.
+ pmullw xmm1, xmm3 ; Multiply components by coefs.
+ pmullw xmm2, xmm4 ; to produce 8.8 fp components.
+ psrlw xmm1, 8 ; Take floor of components.
+ psrlw xmm2, 8
+
+ ; Pack the four resulting pixels from 16 to 8 bit components.
+ packuswb xmm1, xmm2
+%endmacro
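+
+; Rough end-to-end example: a component r=64 with alpha a=128 uses the
+; coefficient ceil(255*256/128) = 510; pmullw gives 64*510 = 32640 and
+; psrlw 8 gives 127, matching the exact 255*64/128 rounded down.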
+
+; Input:
+; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
+; r8: number of times N > 0 to loop.
+; rsi: uint32_t[4*N]: source pixels.
+; rdi: uint32_t[4*N]: destination pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm1-6, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
+%macro unpremultiply_loop_with 1
+ xor r9,r9 ; index register.
+ align 16
+%%loop:
+ prefetchnta [rsi + r9*8 + 16*64]
+ %1 xmm1, [rsi+r9*8]
+ unpremultiply_xmm1
+ movntdq [rdi+r9*8], xmm1
+
+ add r9, 2
+ dec r8
+ jnz %%loop
+%endmacro
+
+;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;;
+global unpremultiply_with_sse2
+unpremultiply_with_sse2:
+ mov r8, rdx
+ shr r8, 2 ; TODO: handle leftover pixels.
+ test r8, r8
+ jnz .setup_invariants
+ ret
+
+.setup_invariants:
+ push rbx
+ pxor xmm0, xmm0 ; constant zero for unpacking.
+
+ ; Decide on whether to use movdqu or movdqa based on source
+ ; alignment. We always use movntdq to write the dest.
+ test rsi, 15
+ jz .aligned_case
+ jmp .unaligned_case
+
+.aligned_case:
+ unpremultiply_loop_with movdqa
+ pop rbx
+ ret
+
+.unaligned_case:
+ unpremultiply_loop_with movdqu
+ pop rbx
+ ret
diff --git a/unpremultiply.c b/unpremultiply.c
new file mode 100644
index 0000000..99932fb
--- /dev/null
+++ b/unpremultiply.c
@@ -0,0 +1,549 @@
+#define RUN_ME /*
+nasm -g -f elf64 unpremultiply-sse2.S
+nasm -g -f elf64 unpremultiply-sse2-float.S
+gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o unpremultiply-sse2-float.o $0
+exit $?
+*/
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if 1
+# define ASHIFT 24
+# define RSHIFT 16
+# define GSHIFT 8
+# define BSHIFT 0
+#else
+# define RSHIFT 24
+# define GSHIFT 16
+# define BSHIFT 8
+# define ASHIFT 0
+#endif
+
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n);
+
+static void __attribute__((noinline))
+unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ size_t i;
+
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba & AMASK) >> ASHIFT;
+ if (a == 255) {
+ dst[i] = rgba;
+ continue;
+ }
+ if (prev_in == rgba) {
+ dst[i] = prev_out;
+ continue;
+ }
+ if (a) {
+ uint32_t r = (rgba >> RSHIFT) & 0xFF;
+ uint32_t g = (rgba >> GSHIFT) & 0xFF;
+ uint32_t b = (rgba >> BSHIFT) & 0xFF;
+ r = r*255 / a;
+ g = g*255 / a;
+ b = b*255 / a;
+ assert(r < 256);
+ assert(g < 256);
+ assert(b < 256);
+ assert(a < 256);
+ prev_in = rgba;
+ prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+ } else {
+ prev_in = prev_out = 0;
+ }
+ dst[i] = prev_out;
+ }
+}
+
+static uint8_t division_table[65536];
+
+static void __attribute__((noinline))
+unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 0xFF;
+ if (a == 255) {
+ dst[i] = rgba;
+ }
+ else if (prev_in == rgba) {
+ dst[i] = prev_out;
+ }
+ else {
+ uint32_t r = (rgba >> RSHIFT) & 0xFF;
+ uint32_t g = (rgba >> GSHIFT) & 0xFF;
+ uint32_t b = (rgba >> BSHIFT) & 0xFF;
+ r = division_table[a*256 + r];
+ g = division_table[a*256 + g];
+ b = division_table[a*256 + b];
+
+ prev_in = rgba;
+ prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+ dst[i] = prev_out;
+ }
+ }
+}
+
+#define RECIPROCAL_BITS 8
+
+static uint32_t reciprocal_table_A[256];
+static uint64_t reciprocal_table_B[256];
+
+#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
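+/* For example, with RSHIFT=16 and RECIPROCAL_BITS=8 the expression
+ * SHIFT(r*recip, RSHIFT - RECIPROCAL_BITS) shifts the 8.8 product left
+ * by 8 so its integer part lands in bits 16..23, ready to be masked
+ * with RMASK; for BSHIFT=0 the count is -8 and the product is shifted
+ * right instead. */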
+
+static void __attribute__((noinline))
+unpremultiply_with_inv32(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ if (a == 255) {
+ dst[i] = rgba;
+ }
+ else if (prev_in == rgba) {
+ dst[i] = prev_out;
+ }
+ else {
+ prev_in = rgba;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ dst[i] = prev_out;
+ }
+ }
+}
+
+#define INNER_UNROLL (2)
+
+static void __attribute__((noinline))
+unpremultiply_with_inv32_bis(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ for (size_t i = 0; i < n; ) {
+ uint32_t prev_in, prev_out;
+ /* 0 iff all inputs are the same */
+ unsigned delta = 0;
+ /* adds to 0 if all inputs are opaque */
+ unsigned opaque_count = - INNER_UNROLL*255;
+
+ {
+ uint32_t rgba = prev_in = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ opaque_count += a;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ dst[i] = prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ }
+
+ /* UNROLL, bitch! */
+ for (unsigned j = 1; j < INNER_UNROLL; j++) {
+ if (i+j>=n) return;
+ uint32_t rgba = src[i+j];
+ delta |= rgba ^ prev_in;
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ opaque_count += a;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ dst[i+j] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ }
+
+ i += INNER_UNROLL;
+
+ /* Switch to a special-case run loop when the unrolled batch was
+ * all-opaque or all-identical; keep the test cheap when neither
+ * case applies.
+ */
+
+ if (0 != (delta & opaque_count)) continue;
+
+ if (0 == opaque_count) {
+ uint32_t in;
+ while ((255U << ASHIFT) == ((in = src[i]) & (255U << ASHIFT))) {
+ dst[i++] = in;
+ if (i >= n) return;
+ }
+ } else if (0 == delta) {
+ while (src[i] == prev_in) {
+ dst[i++] = prev_out;
+ if (i >= n) return;
+ }
+ }
+ }
+}
+#undef INNER_UNROLL
+
+static void __attribute__((noinline))
+unpremultiply_with_inv32_nocache(
+ uint32_t * restrict dst,
+ uint32_t const * restrict src,
+ size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ dst[i] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ }
+}
+
+static void __attribute__((noinline))
+unpremultiply_with_inv64(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ if (a == 255) {
+ dst[i] = rgba;
+ }
+ else if (prev_in == rgba) {
+ dst[i] = prev_out;
+ }
+ else {
+ prev_in = rgba;
+ uint64_t r = rgba & RMASK;
+ uint64_t g = rgba & GMASK;
+ uint64_t b = rgba & BMASK;
+ uint64_t recip = reciprocal_table_B[a];
+ r = r * recip;
+ g = g * recip;
+ b = b * recip;
+ prev_out = (rgba & AMASK) |
+ (((r & ((uint64_t)RMASK << 32)) |
+ (g & ((uint64_t)GMASK << 32)) |
+ (b & ((uint64_t)BMASK << 32))) >> 32);
+ dst[i] = prev_out;
+ }
+ }
+}
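+
+/* Worked example of the 32.32 trick above: for a=128 the reciprocal is
+ * 255*2^32/128 ~= 1.992*2^32. A red value of 64 kept in place as 64<<16
+ * multiplies to roughly 127.5*2^48; masking with (uint64_t)RMASK << 32
+ * keeps bits 48..55, i.e. 127, and the final >>32 drops it back into
+ * the red channel as 127<<16, matching the divide-based reference. */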
+
+static void __attribute__((noinline))
+unpremultiply_with_inv64_nocache(
+ uint32_t * restrict dst,
+ uint32_t const * restrict src,
+ size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba & AMASK) >> ASHIFT;
+ uint64_t r = rgba & RMASK;
+ uint64_t g = rgba & GMASK;
+ uint64_t b = rgba & BMASK;
+ uint64_t recip = reciprocal_table_B[a];
+ r = r * recip;
+ g = g * recip;
+ b = b * recip;
+ dst[i] = (rgba & AMASK) |
+ (((r & ((uint64_t)RMASK << 32)) |
+ (g & ((uint64_t)GMASK << 32)) |
+ (b & ((uint64_t)BMASK << 32))) >> 32);
+ }
+}
+
+static void __attribute__((noinline))
+unpremultiply_with_memcpy(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ memcpy(dst, src, 4*n);
+}
+
+static void __attribute__((noinline))
+unpremultiply_with_memset(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ (void)src;
+ memset(dst, 0, 4*n);
+}
+
+volatile uint32_t read_sum;
+
+static void __attribute__((noinline))
+unpremultiply_with_read(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t sum = 0;
+ (void)src;
+ (void)dst;
+ for (i=0; i<n; i++) sum += src[i];
+ read_sum = sum;
+}
+
+static void
+make_division_table()
+{
+ unsigned a;
+ for (a=1; a<256; a++) {
+ unsigned x;
+ for (x=0; x<256; x++) {
+ unsigned y = x*255/a;
+ y = y < 255 ? y : 255;
+ division_table[a*256 + x] = y;
+ }
+ }
+ for (a=0; a<256; a++) {
+ division_table[a] = 0;
+ }
+}
+
+static void
+make_reciprocal_table_A()
+{
+ unsigned a;
+ reciprocal_table_A[0] = 0;
+ for (a=1; a<256; a++) {
+ uint32_t r = 255U*(1<<RECIPROCAL_BITS) / a;
+ while ((a*r) >> RECIPROCAL_BITS < 255) r++;
+ reciprocal_table_A[a] = r;
+ }
+}
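+
+/* The adjustment loop above nudges the truncated reciprocal up until a
+ * fully saturated component maps back to exactly 255. E.g. for a=7,
+ * 255*256/7 truncates to 9325 and (7*9325)>>8 = 254, so r is bumped to
+ * 9326, for which (7*9326)>>8 = 255. */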
+
+static void
+make_reciprocal_table_B()
+{
+ unsigned a;
+ reciprocal_table_B[0] = 0;
+ for (a=1; a<256; a++) {
+ uint64_t r = 255*(1ULL<<32)/a;
+ while ((a*r) >> 32 < 255) r++;
+ reciprocal_table_B[a] = r;
+ }
+}
+
+static void
+saturate(uint32_t *buf, size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = buf[i];
+ uint32_t a = (rgba >> ASHIFT) & 0xFF;
+ uint32_t r = (rgba >> RSHIFT) & 0xFF;
+ uint32_t g = (rgba >> GSHIFT) & 0xFF;
+ uint32_t b = (rgba >> BSHIFT) & 0xFF;
+ r = r < a ? r : a;
+ g = g < a ? g : a;
+ b = b < a ? b : a;
+ buf[i] = (a << ASHIFT) |
+ (r << RSHIFT) |
+ (g << GSHIFT) |
+ (b << BSHIFT);
+ }
+}
+
+static void
+fill_random(uint32_t *buf, size_t n)
+{
+ uint32_t x = 123456789;
+ size_t i;
+ for (i=0; i<n; i++) {
+ x = x*33 + 1;
+ buf[i] = x;
+ }
+}
+
+static void
+fill_gradient(uint32_t *buf, size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ buf[i] = i*(0x01020301);
+ }
+}
+
+static void
+fill_solid(uint32_t *buf, size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ buf[i] = buf[i] | AMASK;
+ }
+}
+
+static void
+fill_empty(uint32_t *buf, size_t n)
+{
+ memset(buf, 0, 4*n);
+}
+
+int
+main(int argc, char **argv)
+{
+ long nloops = argc > 1 ? atol(argv[1]) : 1;
+ size_t n = 2*1024*1024;
+ uint32_t *dst = calloc(n, 4);
+ uint32_t *src = calloc(n, 4);
+ char const *method = "lut";
+ int verify = 0;
+ int i;
+
+ make_division_table();
+ make_reciprocal_table_A();
+ make_reciprocal_table_B();
+
+ for (i=2; i<argc; i++) {
+ if (0 == strcmp(argv[i], "verify")) {
+ verify = 1;
+ }
+ else if (0 == strcmp(argv[i], "random")) {
+ fill_random (src, n);
+ }
+ else if (0 == strcmp(argv[i], "solid")) {
+ fill_solid (src, n);
+ }
+ else if (0 == strcmp(argv[i], "clear")) {
+ fill_empty (src, n);
+ }
+ else if (0 == strcmp(argv[i], "gradient")) {
+ fill_gradient (src, n);
+ }
+ else if (0 == strcmp(argv[i], "div") ||
+ 0 == strcmp(argv[i], "lut") ||
+ 0 == strcmp(argv[i], "inv32") ||
+ 0 == strcmp(argv[i], "inv32-bis") ||
+ 0 == strcmp(argv[i], "inv64") ||
+ 0 == strcmp(argv[i], "inv32-nocache") ||
+ 0 == strcmp(argv[i], "inv64-nocache") ||
+ 0 == strcmp(argv[i], "sse2") ||
+ 0 == strcmp(argv[i], "sse2-float") ||
+ 0 == strcmp(argv[i], "copy") ||
+ 0 == strcmp(argv[i], "read") ||
+ 0 == strcmp(argv[i], "write"))
+ {
+ method = argv[i];
+ }
+ else {
+ fprintf(stderr, "unknown arg %s\n", argv[i]);
+ return 1;
+ }
+ }
+ saturate(src, n);
+
+ if (0 == strcmp(method, "div")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_div(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "lut")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_lut(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv32")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv32(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv32-bis")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv32_bis(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv64")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv64(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv32-nocache")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv32_nocache(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv64-nocache")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv64_nocache(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "copy")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_memcpy(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "write")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_memset(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "read")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_read(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "sse2")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_sse2(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "sse2-float")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_sse2_float(dst, src, n);
+ }
+ }
+ else {
+ fprintf(stderr, "unknown method %s\n", method);
+ return 1;
+ }
+
+ if (verify) {
+ uint32_t *ref = malloc(n*4);
+ size_t i;
+ int maxdiff = 0;
+ unpremultiply_with_div(ref, src, n);
+ for (i=0; i<n; i++) {
+ uint32_t x = dst[i];
+ uint32_t y = ref[i];
+ int j;
+ for (j=0; j<4; j++){
+ int diff = (x & 255) - (y & 255);
+ if (diff < 0) diff = -diff;
+ if (diff > maxdiff) {
+ printf("src:%08x dst:%08x ref:%08x diff:%d, i:%ld.%d\n",
+ src[i], dst[i], ref[i], diff, i,j);
+ maxdiff = diff;
+ }
+ x >>= 8;
+ y >>= 8;
+ }
+ }
+ }
+
+ return 0;
+}