author	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-03 04:21:08 +0200
committer	M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-03 04:21:08 +0200
commit	109b7cdbd6e617b9f818bbc110c1f6841487eb27 (patch)
tree	f824aa868611194d584530aebbd06cff6c1003b3
Import sources from oneshots/
-rw-r--r--	unpremultiply-sse2-float.S	105
-rw-r--r--	unpremultiply-sse2-old.S	165
-rw-r--r--	unpremultiply-sse2.S	110
-rw-r--r--	unpremultiply.c	549
4 files changed, 929 insertions, 0 deletions
diff --git a/unpremultiply-sse2-float.S b/unpremultiply-sse2-float.S
new file mode 100644
index 0000000..b8c9182
--- /dev/null
+++ b/unpremultiply-sse2-float.S
@@ -0,0 +1,105 @@
+ section .text
+
+%macro function 1
+ global %1
+%1:
+%endmacro
+
+%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
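+; Illustrative note: SELECT packs four 2-bit lane indices (highest lane
+; first) into a shufps/pshufd immediate. For instance SELECT(0,0,3,2) =
+; 0*64 + 0*16 + 3*4 + 2 = 0x0e; with shufps it moves dst lane 2 into
+; result lane 0, dst lane 3 into lane 1, and src lane 0 into lanes 2-3.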
+
+; Unpremultiply a pixel in-place with uint32 components in xmm register %1.
+; Invariant:
+; xmm0: 0
+; xmm6: 255.0f
+; xmm7: (?,?,1.0f,?)
+; Scratch: xmm5
+%macro unpremultiply1 1
+ cvtdq2ps %1, %1 ; uint32 components -> float
+ rcpss xmm7, %1 ; xmm7: (?,?,1.0,1/a)
+ mulss xmm7, xmm6 ; xmm7: (?,?,1.0,255/a), xmm6: 255.0
+ shufps xmm5, xmm7, SELECT(0,1,0,0) ; xmm5: (255/a,1.0,?,?)
+ shufps xmm5, xmm7, SELECT(0,0,3,2); xmm5: (255/a,255/a,255/a,1.0)
+ mulps %1, xmm5 ; %1: (255*r/a,.., 255*b/a, a)
+ cvtps2dq %1, %1 ; float components -> uint32
+%endmacro
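+
+; Rough worked example of the macro above: for a pixel with a=128 and
+; r=64, rcpss yields approximately 1/128, mulss scales that to roughly
+; 255/128 ~= 1.99, and mulps gives 64*1.99 ~= 127.5 before cvtps2dq
+; rounds back to an integer. rcpss is only accurate to roughly 12 bits,
+; so the rounded result can differ by one from the exact 255*c/a.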
+
+; Unpremultiply two pixels in-place with uint16 components in xmm register %1.
+; Invariant: as above.
+; Scratch: xmm4-5
+%macro unpremultiply2 1
+ movdqa xmm4, %1
+ punpckhwd xmm4, xmm0
+ punpcklwd %1, xmm0
+ unpremultiply1 xmm4
+ unpremultiply1 %1
+ packssdw %1, xmm4
+%endmacro
+
+; Unpremultiply four pixels in-place with uint8 components in xmm register %1.
+; Invariant: as above.
+; Scratch: xmm3-5
+%macro unpremultiply4 1
+ movdqa xmm3, %1
+ punpckhbw xmm3, xmm0
+ punpcklbw %1, xmm0
+ unpremultiply2 xmm3
+ unpremultiply2 %1
+ packuswb %1, xmm3
+%endmacro
+
+; Input:
+; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
+; r8: number of times N > 0 to loop.
+; rsi: uint32_t[4*N]: source pixels.
+; rdi: uint32_t[4*N]: destination pixels.
+; Invariant: as above.
+; Scratch: rsi,rdi,r8-9,xmm2-5
+%macro unpremultiply_loop_with 1
+ xor r9,r9 ; index register.
+ align 16
+%%loop:
+ prefetchnta [rsi + r9*8 + 16*64]
+ %1 xmm2, [rsi+r9*8]
+ unpremultiply4 xmm2
+ movntdq [rdi+r9*8], xmm2
+
+ add r9, 2
+ dec r8
+ jnz %%loop
+%endmacro
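+
+; In the loop macro above, prefetchnta pulls the source roughly 1 KiB
+; (16*64 bytes) ahead through the non-temporal path, and movntdq streams
+; the result to the destination without polluting the cache; each
+; iteration handles one 16-byte chunk of four pixels.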
+
+;; void unpremultiply_with_sse2_float(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;;
+function unpremultiply_with_sse2_float
+ mov r8, rdx
+ shr r8, 2 ; TODO: handle leftover pixels.
+ test r8, r8
+ jnz .setup_invariants
+ ret
+
+.setup_invariants:
+ pxor xmm0, xmm0 ; constant zero for unpacking.
+
+ mov rax, 1
+ movd xmm7, eax
+ cvtdq2ps xmm7, xmm7
+ shufps xmm7, xmm7, SELECT(1,1,0,1) ; xmm7: (0,0,1.0f,0)
+
+ mov rax, 255
+ movd xmm6, eax
+ cvtdq2ps xmm6, xmm6
+ shufps xmm6, xmm6, SELECT(1,1,1,0) ; xmm6: 255f
+
+ ; Decide on whether to use movdqu or movdqa based on source
+ ; alignment. We always use movntdq to write the dest.
+ test rsi, 15
+ jz .aligned_case
+ jmp .unaligned_case
+
+.aligned_case:
+ unpremultiply_loop_with movdqa
+ ret
+
+.unaligned_case:
+ unpremultiply_loop_with movdqu
+ ret
diff --git a/unpremultiply-sse2-old.S b/unpremultiply-sse2-old.S
new file mode 100644
index 0000000..d8c6209
--- /dev/null
+++ b/unpremultiply-sse2-old.S
@@ -0,0 +1,165 @@
+ section .text
+
+%macro function 1
+ global %1
+%1:
+%endmacro
+
+%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
+
+reciprocal_table:
+ dd 0
+%assign i 1
+%rep 255
+ dd (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+%assign i i+1
+%endrep
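+
+; Worked example of a table entry: for i = 7 the entry is
+; (255*256 + 6)/7 = 9326 = ceil(255*256/7). Multiplying a 16-bit
+; component c by it and shifting right by 8 gives floor(c*9326/256),
+; a close approximation of c*255/7.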
+
+; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
+; The pixels must not be supersaturated.
+; Output: xmm1: u8[], Four unpremultiplied pixels.
+; Invariant:
+; xmm0: 0
+; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
+; Scratch: xmm2, rax, rbx, rcx, rdx
+%macro unpremultiply_xmm1_old 0
+ ; Expand the 8 bit components into 16 bit ones in
+ ; two registers.
+ movdqa xmm2, xmm1
+ punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
+ punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
+
+ ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
+ pextrw eax, xmm1, 0 ; Extract low half reg alphas.
+ pextrw ecx, xmm2, 0
+ pextrw ebx, xmm1, 4 ; Extract high half reg alphas.
+ pextrw edx, xmm2, 4
+ mov eax, [reciprocal_table + 4*eax] ; Fetch 255/alpha
+ mov ecx, [reciprocal_table + 4*ecx] ; as 8.8 fp numbers.
+ mov ebx, [reciprocal_table + 4*ebx]
+ mov edx, [reciprocal_table + 4*edx]
+ pinsrw xmm3, eax, 1 ; Inject into coefficient regs.
+ pinsrw xmm4, ecx, 1
+ pshuflw xmm3, xmm3, SELECT(1,1,1,0) ; Replicate non-alpha coefs.
+ pshuflw xmm4, xmm4, SELECT(1,1,1,0)
+ pinsrw xmm3, ebx, 5
+ pinsrw xmm4, edx, 5
+ pshufhw xmm3, xmm3, SELECT(1,1,1,0)
+ pshufhw xmm4, xmm4, SELECT(1,1,1,0)
+ pmullw xmm1, xmm3 ; Multiply components by coefs.
+ pmullw xmm2, xmm4 ; to produce 8.8 fp components.
+ psrlw xmm1, 8 ; Take floor of components.
+ psrlw xmm2, 8
+
+ ; Pack the four resulting pixels from 16 to 8 bit components.
+ packuswb xmm1, xmm2
+%endmacro
+
+; Reciprocal table with 64 bit entries of a 4x16 vector
+; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
+reciprocal_table_Q:
+ dq 0
+%assign i 1
+%rep 255
+ dw 256 ; unity
+ dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+ dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+ dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
+%assign i i+1
+%endrep
+
+; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
+; The pixels must not be supersaturated.
+; Output: xmm1: u8[], Four unpremultiplied pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm2-6, rax, rbx, rcx, rdx
+%macro unpremultiply_xmm1 0
+ ; Expand the 8 bit components into 16 bit ones in
+ ; two registers.
+ movdqa xmm2, xmm1
+ punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
+ punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
+
+ ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
+ pextrw ebx, xmm1, 4 ; Extract pixel alphas into registers.
+ pextrw edx, xmm2, 4
+ pextrw eax, xmm1, 0
+ pextrw ecx, xmm2, 0
+ movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
+ movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
+ movq xmm3, [reciprocal_table_Q + 8*eax]
+ movq xmm4, [reciprocal_table_Q + 8*ecx]
+ pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper.
+ pshufd xmm6, xmm6, SELECT(1,0,3,2)
+ por xmm3, xmm5 ; Combine 64 bit upper and lower
+ por xmm4, xmm6 ; into 128 bit multipliers.
+ pmullw xmm1, xmm3 ; Multiply components by coefs.
+ pmullw xmm2, xmm4 ; to produce 8.8 fp components.
+ psrlw xmm1, 8 ; Take floor of components.
+ psrlw xmm2, 8
+
+ ; Pack the four resulting pixels from 16 to 8 bit components.
+ packuswb xmm1, xmm2
+%endmacro
+
+; Input:
+; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
+; r8: number of times N > 0 to loop.
+; rsi: uint32_t[4*N]: source pixels.
+; rdi: uint32_t[4*N]: destination pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm1-6, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
+%macro unpremultiply_loop_with 1
+ xor r9,r9 ; index register.
+ align 16
+%%loop:
+ prefetchnta [rsi + r9*8 + 16*64]
+ %1 xmm1, [rsi+r9*8]
+ unpremultiply_xmm1
+ movntdq [rdi+r9*8], xmm1
+
+ add r9, 2
+ dec r8
+ jnz %%loop
+%endmacro
+
+;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;;
+function unpremultiply_with_sse2
+ mov r8, rdx
+ shr r8, 2 ; TODO: handle leftover pixels.
+ test r8, r8
+ jnz .setup_invariants
+ ret
+
+.setup_invariants:
+ push rbx
+ pxor xmm0, xmm0 ; constant zero for unpacking.
+
+ ; Setup component multiplier registers xmm3/xmm4 with
+ ; 8.8 fixed point coefficients. The alpha component always
+ ; has a coefficient of one.
+ mov rax, 256*256+256 ; unity in 8.8 fp
+ movd xmm3, eax
+ pshufd xmm3, xmm3, SELECT(0,0,0,0)
+ movdqa xmm4, xmm3
+
+ ; Decide on whether to use movdqu or movdqa based on source
+ ; alignment. We always use movntdq to write the dest.
+ test rsi, 15
+ jz .aligned_case
+ jmp .unaligned_case
+
+.aligned_case:
+ unpremultiply_loop_with movdqa
+ pop rbx
+ ret
+
+.unaligned_case:
+ unpremultiply_loop_with movdqu
+ pop rbx
+ ret
diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S
new file mode 100644
index 0000000..1b699cc
--- /dev/null
+++ b/unpremultiply-sse2.S
@@ -0,0 +1,110 @@
+ section .text
+
+%define ASHIFT 24
+;%define ASHIFT 0
+
+%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
+
+; Reciprocal table with 64 bit entries of a 4x16 vector
+; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
+reciprocal_table_Q:
+ dq 0
+%assign i 1
+%rep 255
+%assign recip ((255*256 + i-1) / i)
+%if ASHIFT == 0
+ dw 256, recip, recip, recip
+%elif ASHIFT==24
+ dw recip, recip, recip, 256
+%endif
+%assign i i+1
+%endrep
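+
+; Illustrative layout: with ASHIFT=24 the entry for i=2 has the four
+; words, in memory order, 32640, 32640, 32640, 256, i.e. ceil(255*256/2)
+; for the colour channels and 256 (unity in 8.8) in the alpha slot, so
+; the pmullw/psrlw 8 pair leaves alpha unchanged.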
+
+; Input: xmm1: u8[], Four pixels with the alpha byte at offset ASHIFT/8
+; within each pixel. The pixels must not be supersaturated.
+; Output: xmm1: u8[], Four unpremultiplied pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm2-6, rax, rbx, rcx, rdx
+%macro unpremultiply_xmm1 0
+ ; Expand the 8 bit components into 16 bit ones in
+ ; two registers.
+ movdqa xmm2, xmm1
+ punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
+ punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
+
+ ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
+ pextrw ebx, xmm1, 4+ASHIFT/8 ; Extract pixel alphas into registers.
+ pextrw edx, xmm2, 4+ASHIFT/8
+ pextrw eax, xmm1, 0+ASHIFT/8
+ pextrw ecx, xmm2, 0+ASHIFT/8
+ movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
+ movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
+ movq xmm3, [reciprocal_table_Q + 8*eax]
+ movq xmm4, [reciprocal_table_Q + 8*ecx]
+ pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper.
+ pshufd xmm6, xmm6, SELECT(1,0,3,2)
+ por xmm3, xmm5 ; Combine 64 bit upper and lower
+ por xmm4, xmm6 ; into 128 bit multipliers.
+ pmullw xmm1, xmm3 ; Multiply components by coefs.
+ pmullw xmm2, xmm4 ; to produce 8.8 fp components.
+ psrlw xmm1, 8 ; Take floor of components.
+ psrlw xmm2, 8
+
+ ; Pack the four resulting pixels from 16 to 8 bit components.
+ packuswb xmm1, xmm2
+%endmacro
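+
+; Rough end-to-end example: a component r=64 with alpha a=128 uses the
+; coefficient ceil(255*256/128) = 510; pmullw gives 64*510 = 32640 and
+; psrlw 8 gives 127, matching the exact 255*64/128 rounded down.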
+
+; Input:
+; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
+; r8: number of times N > 0 to loop.
+; rsi: uint32_t[4*N]: source pixels.
+; rdi: uint32_t[4*N]: destination pixels.
+; Invariant:
+; xmm0: 0
+; Scratch: xmm1-6, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
+%macro unpremultiply_loop_with 1
+ xor r9,r9 ; index register.
+ align 16
+%%loop:
+ prefetchnta [rsi + r9*8 + 16*64]
+ %1 xmm1, [rsi+r9*8]
+ unpremultiply_xmm1
+ movntdq [rdi+r9*8], xmm1
+
+ add r9, 2
+ dec r8
+ jnz %%loop
+%endmacro
+
+;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;;
+global unpremultiply_with_sse2
+unpremultiply_with_sse2:
+ mov r8, rdx
+ shr r8, 2 ; TODO: handle leftover pixels.
+ test r8, r8
+ jnz .setup_invariants
+ ret
+
+.setup_invariants:
+ push rbx
+ pxor xmm0, xmm0 ; constant zero for unpacking.
+
+ ; Decide on whether to use movdqu or movdqa based on source
+ ; alignment. We always use movntdq to write the dest.
+ test rsi, 15
+ jz .aligned_case
+ jmp .unaligned_case
+
+.aligned_case:
+ unpremultiply_loop_with movdqa
+ pop rbx
+ ret
+
+.unaligned_case:
+ unpremultiply_loop_with movdqu
+ pop rbx
+ ret
diff --git a/unpremultiply.c b/unpremultiply.c
new file mode 100644
index 0000000..99932fb
--- /dev/null
+++ b/unpremultiply.c
@@ -0,0 +1,549 @@
+#define RUN_ME /*
+nasm -g -f elf64 unpremultiply-sse2.S
+nasm -g -f elf64 unpremultiply-sse2-float.S
+gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o unpremultiply-sse2-float.o $0
+exit $?
+*/
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if 1
+# define ASHIFT 24
+# define RSHIFT 16
+# define GSHIFT 8
+# define BSHIFT 0
+#else
+# define RSHIFT 24
+# define GSHIFT 16
+# define BSHIFT 8
+# define ASHIFT 0
+#endif
+
+#define AMASK (255U << ASHIFT)
+#define RMASK (255U << RSHIFT)
+#define GMASK (255U << GSHIFT)
+#define BMASK (255U << BSHIFT)
+
+void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n);
+
+static void __attribute__((noinline))
+unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ size_t i;
+
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba & AMASK) >> ASHIFT;
+ if (a == 255) {
+ dst[i] = rgba;
+ continue;
+ }
+ if (prev_in == rgba) {
+ dst[i] = prev_out;
+ continue;
+ }
+ if (a) {
+ uint32_t r = (rgba >> RSHIFT) & 0xFF;
+ uint32_t g = (rgba >> GSHIFT) & 0xFF;
+ uint32_t b = (rgba >> BSHIFT) & 0xFF;
+ r = r*255 / a;
+ g = g*255 / a;
+ b = b*255 / a;
+ assert(r < 256);
+ assert(g < 256);
+ assert(b < 256);
+ assert(a < 256);
+ prev_in = rgba;
+ prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+ } else {
+ prev_in = prev_out = 0;
+ }
+ dst[i] = prev_out;
+ }
+}
+
+static uint8_t division_table[65536];
+
+static void __attribute__((noinline))
+unpremultiply_with_lut(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 0xFF;
+ if (a == 255) {
+ dst[i] = rgba;
+ }
+ else if (prev_in == rgba) {
+ dst[i] = prev_out;
+ }
+ else {
+ uint32_t r = (rgba >> RSHIFT) & 0xFF;
+ uint32_t g = (rgba >> GSHIFT) & 0xFF;
+ uint32_t b = (rgba >> BSHIFT) & 0xFF;
+ r = division_table[a*256 + r];
+ g = division_table[a*256 + g];
+ b = division_table[a*256 + b];
+
+ prev_in = rgba;
+ prev_out = (r<<RSHIFT) | (g<<GSHIFT) | (b<<BSHIFT) | (a<<ASHIFT);
+ dst[i] = prev_out;
+ }
+ }
+}
+
+#define RECIPROCAL_BITS 8
+
+static uint32_t reciprocal_table_A[256];
+static uint64_t reciprocal_table_B[256];
+
+#define SHIFT(x, y) ((y) < 0 ? (x) >> (-(y)) : (x) << (y))
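+/* For example, with RSHIFT=16 and RECIPROCAL_BITS=8 the expression
+ * SHIFT(r*recip, RSHIFT - RECIPROCAL_BITS) shifts the 8.8 product left
+ * by 8 so its integer part lands in bits 16..23, ready to be masked
+ * with RMASK; for BSHIFT=0 the count is -8 and the product is shifted
+ * right instead. */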
+
+static void __attribute__((noinline))
+unpremultiply_with_inv32(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ if (a == 255) {
+ dst[i] = rgba;
+ }
+ else if (prev_in == rgba) {
+ dst[i] = prev_out;
+ }
+ else {
+ prev_in = rgba;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ dst[i] = prev_out;
+ }
+ }
+}
+
+#define INNER_UNROLL (2)
+
+static void __attribute__((noinline))
+unpremultiply_with_inv32_bis(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ for (size_t i = 0; i < n; ) {
+ uint32_t prev_in, prev_out;
+ /* 0 iff all inputs are the same */
+ unsigned delta = 0;
+ /* adds to 0 if all inputs are opaque */
+ unsigned opaque_count = - INNER_UNROLL*255;
+
+ {
+ uint32_t rgba = prev_in = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ opaque_count += a;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ dst[i] = prev_out = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ }
+
+ /* UNROLL, bitch! */
+ for (unsigned j = 1; j < INNER_UNROLL; j++) {
+ if (i+j>=n) return;
+ uint32_t rgba = src[i+j];
+ delta |= rgba ^ prev_in;
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ opaque_count += a;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ dst[i+j] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ }
+
+ i += INNER_UNROLL;
+
+ /* Switch to a special-case run loop when the unrolled batch was
+ * all-opaque or all-identical; keep the test cheap when neither
+ * case applies.
+ */
+
+ if (0 != (delta & opaque_count)) continue;
+
+ if (0 == opaque_count) {
+ uint32_t in;
+ while ((255U << ASHIFT) == ((in = src[i]) & (255U << ASHIFT))) {
+ dst[i++] = in;
+ if (i >= n) return;
+ }
+ } else if (0 == delta) {
+ while (src[i] == prev_in) {
+ dst[i++] = prev_out;
+ if (i >= n) return;
+ }
+ }
+ }
+}
+#undef INNER_UNROLL
+
+static void __attribute__((noinline))
+unpremultiply_with_inv32_nocache(
+ uint32_t * restrict dst,
+ uint32_t const * restrict src,
+ size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ uint32_t r = (rgba >> RSHIFT) & 255;
+ uint32_t g = (rgba >> GSHIFT) & 255;
+ uint32_t b = (rgba >> BSHIFT) & 255;
+ uint32_t recip = reciprocal_table_A[a];
+ r = SHIFT(r * recip, RSHIFT - RECIPROCAL_BITS);
+ g = SHIFT(g * recip, GSHIFT - RECIPROCAL_BITS);
+ b = SHIFT(b * recip, BSHIFT - RECIPROCAL_BITS);
+ dst[i] = (r & RMASK) | (g & GMASK) | (b & BMASK) | (rgba & AMASK);
+ }
+}
+
+static void __attribute__((noinline))
+unpremultiply_with_inv64(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t prev_in = 0;
+ uint32_t prev_out = 0;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba >> ASHIFT) & 255;
+ if (a == 255) {
+ dst[i] = rgba;
+ }
+ else if (prev_in == rgba) {
+ dst[i] = prev_out;
+ }
+ else {
+ prev_in = rgba;
+ uint64_t r = rgba & RMASK;
+ uint64_t g = rgba & GMASK;
+ uint64_t b = rgba & BMASK;
+ uint64_t recip = reciprocal_table_B[a];
+ r = r * recip;
+ g = g * recip;
+ b = b * recip;
+ prev_out = (rgba & AMASK) |
+ (((r & ((uint64_t)RMASK << 32)) |
+ (g & ((uint64_t)GMASK << 32)) |
+ (b & ((uint64_t)BMASK << 32))) >> 32);
+ dst[i] = prev_out;
+ }
+ }
+}
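+
+/* Worked example of the 32.32 trick above: for a=128 the reciprocal is
+ * 255*2^32/128 ~= 1.992*2^32. A red value of 64 kept in place as 64<<16
+ * multiplies to roughly 127.5*2^48; masking with (uint64_t)RMASK << 32
+ * keeps bits 48..55, i.e. 127, and the final >>32 drops it back into
+ * the red channel as 127<<16, matching the divide-based reference. */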
+
+static void __attribute__((noinline))
+unpremultiply_with_inv64_nocache(
+ uint32_t * restrict dst,
+ uint32_t const * restrict src,
+ size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = src[i];
+ uint32_t a = (rgba & AMASK) >> ASHIFT;
+ uint64_t r = rgba & RMASK;
+ uint64_t g = rgba & GMASK;
+ uint64_t b = rgba & BMASK;
+ uint64_t recip = reciprocal_table_B[a];
+ r = r * recip;
+ g = g * recip;
+ b = b * recip;
+ dst[i] = (rgba & AMASK) |
+ (((r & ((uint64_t)RMASK << 32)) |
+ (g & ((uint64_t)GMASK << 32)) |
+ (b & ((uint64_t)BMASK << 32))) >> 32);
+ }
+}
+
+static void __attribute__((noinline))
+unpremultiply_with_memcpy(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ memcpy(dst, src, 4*n);
+}
+
+static void __attribute__((noinline))
+unpremultiply_with_memset(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ (void)src;
+ memset(dst, 0, 4*n);
+}
+
+volatile uint32_t read_sum;
+
+static void __attribute__((noinline))
+unpremultiply_with_read(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
+{
+ size_t i;
+ uint32_t sum = 0;
+ (void)src;
+ (void)dst;
+ for (i=0; i<n; i++) sum += src[i];
+ read_sum = sum;
+}
+
+static void
+make_division_table()
+{
+ unsigned a;
+ for (a=1; a<256; a++) {
+ unsigned x;
+ for (x=0; x<256; x++) {
+ unsigned y = x*255/a;
+ y = y < 255 ? y : 255;
+ division_table[a*256 + x] = y;
+ }
+ }
+ for (a=0; a<256; a++) {
+ division_table[a] = 0;
+ }
+}
+
+static void
+make_reciprocal_table_A()
+{
+ unsigned a;
+ reciprocal_table_A[0] = 0;
+ for (a=1; a<256; a++) {
+ uint32_t r = 255U*(1<<RECIPROCAL_BITS) / a;
+ while ((a*r) >> RECIPROCAL_BITS < 255) r++;
+ reciprocal_table_A[a] = r;
+ }
+}
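+
+/* The adjustment loop above nudges the truncated reciprocal up until a
+ * fully saturated component maps back to exactly 255. E.g. for a=7,
+ * 255*256/7 truncates to 9325 and (7*9325)>>8 = 254, so r is bumped to
+ * 9326, for which (7*9326)>>8 = 255. */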
+
+static void
+make_reciprocal_table_B()
+{
+ unsigned a;
+ reciprocal_table_B[0] = 0;
+ for (a=1; a<256; a++) {
+ uint64_t r = 255*(1ULL<<32)/a;
+ while ((a*r) >> 32 < 255) r++;
+ reciprocal_table_B[a] = r;
+ }
+}
+
+static void
+saturate(uint32_t *buf, size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ uint32_t rgba = buf[i];
+ uint32_t a = (rgba >> ASHIFT) & 0xFF;
+ uint32_t r = (rgba >> RSHIFT) & 0xFF;
+ uint32_t g = (rgba >> GSHIFT) & 0xFF;
+ uint32_t b = (rgba >> BSHIFT) & 0xFF;
+ r = r < a ? r : a;
+ g = g < a ? g : a;
+ b = b < a ? b : a;
+ buf[i] = (a << ASHIFT) |
+ (r << RSHIFT) |
+ (g << GSHIFT) |
+ (b << BSHIFT);
+ }
+}
+
+static void
+fill_random(uint32_t *buf, size_t n)
+{
+ uint32_t x = 123456789;
+ size_t i;
+ for (i=0; i<n; i++) {
+ x = x*33 + 1;
+ buf[i] = x;
+ }
+}
+
+static void
+fill_gradient(uint32_t *buf, size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ buf[i] = i*(0x01020301);
+ }
+}
+
+static void
+fill_solid(uint32_t *buf, size_t n)
+{
+ size_t i;
+ for (i=0; i<n; i++) {
+ buf[i] = buf[i] | AMASK;
+ }
+}
+
+static void
+fill_empty(uint32_t *buf, size_t n)
+{
+ memset(buf, 0, 4*n);
+}
+
+int
+main(int argc, char **argv)
+{
+ long nloops = argc > 1 ? atol(argv[1]) : 1;
+ size_t n = 2*1024*1024;
+ uint32_t *dst = calloc(n, 4);
+ uint32_t *src = calloc(n, 4);
+ char const *method = "lut";
+ int verify = 0;
+ int i;
+
+ make_division_table();
+ make_reciprocal_table_A();
+ make_reciprocal_table_B();
+
+ for (i=2; i<argc; i++) {
+ if (0 == strcmp(argv[i], "verify")) {
+ verify = 1;
+ }
+ else if (0 == strcmp(argv[i], "random")) {
+ fill_random (src, n);
+ }
+ else if (0 == strcmp(argv[i], "solid")) {
+ fill_solid (src, n);
+ }
+ else if (0 == strcmp(argv[i], "clear")) {
+ fill_empty (src, n);
+ }
+ else if (0 == strcmp(argv[i], "gradient")) {
+ fill_gradient (src, n);
+ }
+ else if (0 == strcmp(argv[i], "div") ||
+ 0 == strcmp(argv[i], "lut") ||
+ 0 == strcmp(argv[i], "inv32") ||
+ 0 == strcmp(argv[i], "inv32-bis") ||
+ 0 == strcmp(argv[i], "inv64") ||
+ 0 == strcmp(argv[i], "inv32-nocache") ||
+ 0 == strcmp(argv[i], "inv64-nocache") ||
+ 0 == strcmp(argv[i], "sse2") ||
+ 0 == strcmp(argv[i], "sse2-float") ||
+ 0 == strcmp(argv[i], "copy") ||
+ 0 == strcmp(argv[i], "read") ||
+ 0 == strcmp(argv[i], "write"))
+ {
+ method = argv[i];
+ }
+ else {
+ fprintf(stderr, "unknown arg %s\n", argv[i]);
+ return 1;
+ }
+ }
+ saturate(src, n);
+
+ if (0 == strcmp(method, "div")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_div(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "lut")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_lut(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv32")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv32(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv32-bis")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv32_bis(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv64")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv64(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv32-nocache")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv32_nocache(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "inv64-nocache")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_inv64_nocache(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "copy")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_memcpy(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "write")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_memset(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "read")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_read(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "sse2")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_sse2(dst, src, n);
+ }
+ }
+ else if (0 == strcmp(method, "sse2-float")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_sse2_float(dst, src, n);
+ }
+ }
+ else {
+ fprintf(stderr, "unknown method %s\n", method);
+ return 1;
+ }
+
+ if (verify) {
+ uint32_t *ref = malloc(n*4);
+ size_t i;
+ int maxdiff = 0;
+ unpremultiply_with_div(ref, src, n);
+ for (i=0; i<n; i++) {
+ uint32_t x = dst[i];
+ uint32_t y = ref[i];
+ int j;
+ for (j=0; j<4; j++){
+ int diff = (x & 255) - (y & 255);
+ if (diff < 0) diff = -diff;
+ if (diff > maxdiff) {
+ printf("src:%08x dst:%08x ref:%08x diff:%d, i:%ld.%d\n",
+ src[i], dst[i], ref[i], diff, i,j);
+ maxdiff = diff;
+ }
+ x >>= 8;
+ y >>= 8;
+ }
+ }
+ }
+
+ return 0;
+}