diff options
author | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-10 22:32:01 +0200 |
---|---|---|
committer | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-10 22:32:01 +0200 |
commit | fa34cbbd7ace9918979a8519f6b99b5909c84f75 (patch) | |
tree | 3ba65dff90e67443da649be286e33581e18aeb99 | |
parent | 3eb27ac1329e5c6335593d40bc9b77ae27884b38 (diff) |
Add new tighter SSE2 unpremultiply.
The same issues as with memcpy are now affecting SSE2 enabled unpremultiplication:
the choice of movntdq vs. movdqa vs. movdqu for writing to the destination buffer
is crucial, but depends on how the result is used and what size it is. If it
fits in L2 and is going to be used quickly, then it makes sense to use movdqa/dqu
to write to the destination. If it's not going to be used or doesn't fit then
it's far better to use movntdq.
-rw-r--r-- | unpremultiply-sse2-test.S | 93 | ||||
-rw-r--r-- | unpremultiply.c | 16 |
2 files changed, 107 insertions, 2 deletions
diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S new file mode 100644 index 0000000..a0dccf4 --- /dev/null +++ b/unpremultiply-sse2-test.S @@ -0,0 +1,93 @@ + section .text + +%define ASHIFT 24 +;%define ASHIFT 0 + + align 16 +; Reciprocal table with 64 bit entries of a 4x16 vectors +; of the form +; +; (255/i, 255/i, 255/i, 1.0) for ASHIFT=0 +; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24 +; +; in 8.8 fixed point format. +reciprocal_table_Q: + dq 0 +%assign i 1 +%rep 255 +%assign recip (255*256 / i) +%if ASHIFT == 0 + dw 256, recip, recip, recip +%elif ASHIFT==24 + dw recip, recip, recip, 256 +%endif +%assign i i+1 +%endrep + +;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) +;; +global unpremultiply_with_sse2_test +unpremultiply_with_sse2_test: + mov r8, rdx + shr r8, 2 ; TODO: left over pixels. + test r8, r8 + jnz .else + ret +.else: + push rbx + xor r9,r9 + align 16 +.loop: + ; Load four pixels into xmm1. The prefetchnta here + ; hides the difference between movdqa vs. movdqu on + ; aligned input. + prefetchnta [rsi + r9*8 + 16*64] + movdqu xmm1, [rsi+r9*8] + + ; Expand the 8 bit components into 16 bit ones in + ; two registers. + movdqa xmm2, xmm1 + punpckhbw xmm2, xmm2 + punpcklbw xmm1, xmm1 + + ; Load alphas into GPRs. + movzx eax, byte [rsi + r9*8 + ASHIFT/8 + 0] + movzx ebx, byte [rsi + r9*8 + ASHIFT/8 + 4] + movzx ecx, byte [rsi + r9*8 + ASHIFT/8 + 8] + movzx edx, byte [rsi + r9*8 + ASHIFT/8 + 12] + + ; Fetch component multipliers for each pixel based on the alphas + ; into the xmm3/xmm4 registers. + movq xmm3, [reciprocal_table_Q + 8*eax] + movq xmm4, [reciprocal_table_Q + 8*ecx] + movhpd xmm3, [reciprocal_table_Q + 8*ebx] + movhpd xmm4, [reciprocal_table_Q + 8*edx] + + ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. 
+ ; Treating the components as 0.16 bit fixed point, the pmulhuw + ; leaves the integer part of x*255/a in the result for the colour + ; components and alphas themselves for the alpha components. + pmulhuw xmm1, xmm3 + pmulhuw xmm2, xmm4 + + ; Pack the four resulting pixels from 16 to 8 bit components. + packuswb xmm1, xmm2 + + ; Write the result. + ; - When the destination is expected (say due to aliasing) + ; to be in at least the L2 cache then the write should + ; be done using movdqa or movdqu. + ; + ; - Otherwise the destination won't be or fit in any cache level + ; in which case we should use movntdq. + +; movdqa [rdi+r9*8], xmm1 + movntdq [rdi+r9*8], xmm1 + + add r9, 2 + dec r8 + jnz .loop + +.out: + pop rbx + ret diff --git a/unpremultiply.c b/unpremultiply.c index 99932fb..8e6a25d 100644 --- a/unpremultiply.c +++ b/unpremultiply.c @@ -1,7 +1,8 @@ #define RUN_ME /* nasm -g -f elf64 unpremultiply-sse2.S +nasm -g -f elf64 unpremultiply-sse2-test.S nasm -g -f elf64 unpremultiply-sse2-float.S -gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o unpremultiply-sse2-float.o $0 +gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2*.o $0 exit $? */ #include <assert.h> @@ -28,6 +29,7 @@ exit $? #define BMASK (255 << BSHIFT) void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_sse2_test(uint32_t *dst, uint32_t const *src, size_t n); void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n); static void __attribute__((noinline)) @@ -408,7 +410,7 @@ int main(int argc, char **argv) { long nloops = argc > 1 ? 
atol(argv[1]) : 1; - size_t n = 2*1024*1024; + size_t n = 2*1024*1024*0 + (256*1024/4)/2; uint32_t *dst = calloc(n, 4); uint32_t *src = calloc(n, 4); char const *method = "lut"; @@ -435,6 +437,9 @@ main(int argc, char **argv) else if (0 == strcmp(argv[i], "gradient")) { fill_gradient (src, n); } + else if (0 == strcmp(argv[i], "aliased")) { + dst = src; + } else if (0 == strcmp(argv[i], "div") || 0 == strcmp(argv[i], "lut") || 0 == strcmp(argv[i], "inv32") || @@ -443,6 +448,7 @@ main(int argc, char **argv) 0 == strcmp(argv[i], "inv32-nocache") || 0 == strcmp(argv[i], "inv64-nocache") || 0 == strcmp(argv[i], "sse2") || + 0 == strcmp(argv[i], "sse2-test") || 0 == strcmp(argv[i], "sse2-float") || 0 == strcmp(argv[i], "copy") || 0 == strcmp(argv[i], "read") || @@ -457,6 +463,7 @@ main(int argc, char **argv) } saturate(src, n); + if (0 == strcmp(method, "div")) { while (nloops-- > 0) { unpremultiply_with_div(dst, src, n); @@ -512,6 +519,11 @@ main(int argc, char **argv) unpremultiply_with_sse2(dst, src, n); } } + else if (0 == strcmp(method, "sse2-test")) { + while (nloops-- > 0) { + unpremultiply_with_sse2_test(dst, src, n); + } + } else if (0 == strcmp(method, "sse2-float")) { while (nloops-- > 0) { unpremultiply_with_sse2_float(dst, src, n); |