author    Joonas Pihlaja <jpihlaja@cc.helsinki.fi>    2009-01-10 22:32:01 +0200
committer Joonas Pihlaja <jpihlaja@cc.helsinki.fi>    2009-01-10 22:32:01 +0200
commit    fa34cbbd7ace9918979a8519f6b99b5909c84f75 (patch)
tree      3ba65dff90e67443da649be286e33581e18aeb99
parent    3eb27ac1329e5c6335593d40bc9b77ae27884b38 (diff)
Add new tighter SSE2 unpremultiply.
The same issues as with memcpy now affect the SSE2-enabled unpremultiplication: the choice of movntdq vs. movdqa vs. movdqu for writing to the destination buffer is crucial, but the right choice depends on how the result is used and how large it is. If the destination fits in L2 and is going to be used soon, it makes sense to write it with movdqa/movdqu. If it isn't going to be used soon, or doesn't fit in the cache, then movntdq is far better.
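For illustration, a minimal C sketch of that store choice using SSE2 intrinsics; _mm_storeu_si128/_mm_store_si128 compile to movdqu/movdqa and _mm_stream_si128 to movntdq. This is a hypothetical example, not code from this repository: the copy_pixels name and dst_is_cache_hot flag are made up, the destination is assumed 16-byte aligned on the streaming path, and leftover (n % 4) pixels are ignored as in the assembly's TODO.

/* Hypothetical sketch of the cached vs. non-temporal store choice;
 * not code from this repository. */
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>
#include <stddef.h>

void
copy_pixels(uint32_t *dst, uint32_t const *src, size_t n, int dst_is_cache_hot)
{
    size_t i;
    for (i = 0; i + 4 <= n; i += 4) {
        __m128i px = _mm_loadu_si128((__m128i const *)(src + i));
        if (dst_is_cache_hot) {
            /* Destination fits in L2 and will be used soon:
             * a normal cached store (movdqu; movdqa if aligned). */
            _mm_storeu_si128((__m128i *)(dst + i), px);
        } else {
            /* Destination won't be read soon or doesn't fit in any
             * cache level: a non-temporal store (movntdq) bypasses it. */
            _mm_stream_si128((__m128i *)(dst + i), px);
        }
    }
    _mm_sfence();  /* make the non-temporal stores globally visible */
}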
-rw-r--r--  unpremultiply-sse2-test.S  93
-rw-r--r--  unpremultiply.c            16
2 files changed, 107 insertions, 2 deletions
diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S
new file mode 100644
index 0000000..a0dccf4
--- /dev/null
+++ b/unpremultiply-sse2-test.S
@@ -0,0 +1,93 @@
+ section .text
+
+%define ASHIFT 24
+;%define ASHIFT 0
+
+ align 16
+; Reciprocal table with 64 bit entries, each a vector of
+; four 16 bit words of the form
+;
+; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0
+; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24
+;
+; in 8.8 fixed point format.
+reciprocal_table_Q:
+ dq 0
+%assign i 1
+%rep 255
+%assign recip (255*256 / i)
+%if ASHIFT == 0
+ dw 256, recip, recip, recip
+%elif ASHIFT==24
+ dw recip, recip, recip, 256
+%endif
+%assign i i+1
+%endrep
+
+;; void unpremultiply_with_sse2_test(uint32_t *dst/rdi, uint32_t const *src/rsi, size_t n/rdx)
+;;
+global unpremultiply_with_sse2_test
+unpremultiply_with_sse2_test:
+ mov r8, rdx
+ shr r8, 2 ; TODO: left over pixels.
+ test r8, r8
+ jnz .else
+ ret
+.else:
+ push rbx
+ xor r9,r9
+ align 16
+.loop:
+ ; Load four pixels into xmm1. The prefetchnta here
+ ; hides the difference between movdqa vs. movdqu on
+ ; aligned input.
+ prefetchnta [rsi + r9*8 + 16*64]
+ movdqu xmm1, [rsi+r9*8]
+
+ ; Expand the 8 bit components into 16 bit ones in
+ ; two registers.
+ movdqa xmm2, xmm1
+ punpckhbw xmm2, xmm2
+ punpcklbw xmm1, xmm1
+
+ ; Load alphas into GPRs.
+ movzx eax, byte [rsi + r9*8 + ASHIFT/8 + 0]
+ movzx ebx, byte [rsi + r9*8 + ASHIFT/8 + 4]
+ movzx ecx, byte [rsi + r9*8 + ASHIFT/8 + 8]
+ movzx edx, byte [rsi + r9*8 + ASHIFT/8 + 12]
+
+ ; Fetch component multipliers for each pixel based on the alphas
+ ; into the xmm3/xmm4 registers.
+ movq xmm3, [reciprocal_table_Q + 8*eax]
+ movq xmm4, [reciprocal_table_Q + 8*ecx]
+ movhpd xmm3, [reciprocal_table_Q + 8*ebx]
+ movhpd xmm4, [reciprocal_table_Q + 8*edx]
+
+ ; Do the unpremultiply in-place on the pixels in xmm1, xmm2.
+ ; Treating the components as 0.16 fixed point, the pmulhuw
+ ; leaves the integer part of x*255/a in the colour components
+ ; and leaves the alpha components unchanged.
+ pmulhuw xmm1, xmm3
+ pmulhuw xmm2, xmm4
+
+ ; Pack the four resulting pixels from 16 to 8 bit components.
+ packuswb xmm1, xmm2
+
+ ; Write the result.
+ ; - When the destination is expected (say due to aliasing)
+ ;   to be in at least the L2 cache, the write should be
+ ;   done with movdqa or movdqu.
+ ;
+ ; - Otherwise the destination won't be in (or won't fit in)
+ ;   any cache level, in which case we should use movntdq.
+
+; movdqa [rdi+r9*8], xmm1
+ movntdq [rdi+r9*8], xmm1
+
+ add r9, 2
+ dec r8
+ jnz .loop
+
+.out:
+ pop rbx
+ ret
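For reference, a scalar C sketch of the reciprocal-table trick used in the assembly above: 8.8 fixed-point multipliers of 255*256/a, with packuswb-style saturation on the result. This is a hypothetical illustration, not code from this repository; the SSE2 path multiplies the byte-duplicated components, so its rounding differs slightly. It assumes ASHIFT=24, i.e. pixels laid out as 0xAARRGGBB.

/* Hypothetical scalar equivalent of the reciprocal-table unpremultiply;
 * illustration only, not code from this repository. */
#include <stdint.h>
#include <stddef.h>

static uint16_t recip_8_8[256];  /* 255*256/a in 8.8 fixed point; entry 0 stays 0 */

void
init_recip_table(void)
{
    for (int a = 1; a < 256; a++)
        recip_8_8[a] = (uint16_t)(255*256 / a);
}

static uint32_t
unmul(uint32_t x, uint32_t f)
{
    uint32_t v = (x * f) >> 8;   /* integer part of x*255/a */
    return v > 255 ? 255 : v;    /* saturate like packuswb */
}

void
unpremultiply_scalar(uint32_t *dst, uint32_t const *src, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        uint32_t s = src[i];
        uint32_t a = s >> 24;
        uint32_t f = recip_8_8[a];
        dst[i] = (a << 24)
               | (unmul((s >> 16) & 0xff, f) << 16)
               | (unmul((s >>  8) & 0xff, f) <<  8)
               |  unmul( s        & 0xff, f);
    }
}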
diff --git a/unpremultiply.c b/unpremultiply.c
index 99932fb..8e6a25d 100644
--- a/unpremultiply.c
+++ b/unpremultiply.c
@@ -1,7 +1,8 @@
#define RUN_ME /*
nasm -g -f elf64 unpremultiply-sse2.S
+nasm -g -f elf64 unpremultiply-sse2-test.S
nasm -g -f elf64 unpremultiply-sse2-float.S
-gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o unpremultiply-sse2-float.o $0
+gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2*.o $0
exit $?
*/
#include <assert.h>
@@ -28,6 +29,7 @@ exit $?
#define BMASK (255 << BSHIFT)
void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
+void unpremultiply_with_sse2_test(uint32_t *dst, uint32_t const *src, size_t n);
void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n);
static void __attribute__((noinline))
@@ -408,7 +410,7 @@ int
main(int argc, char **argv)
{
long nloops = argc > 1 ? atol(argv[1]) : 1;
- size_t n = 2*1024*1024;
+ size_t n = 2*1024*1024*0 + (256*1024/4)/2;
uint32_t *dst = calloc(n, 4);
uint32_t *src = calloc(n, 4);
char const *method = "lut";
@@ -435,6 +437,9 @@ main(int argc, char **argv)
else if (0 == strcmp(argv[i], "gradient")) {
fill_gradient (src, n);
}
+ else if (0 == strcmp(argv[i], "aliased")) {
+ dst = src;
+ }
else if (0 == strcmp(argv[i], "div") ||
0 == strcmp(argv[i], "lut") ||
0 == strcmp(argv[i], "inv32") ||
@@ -443,6 +448,7 @@ main(int argc, char **argv)
0 == strcmp(argv[i], "inv32-nocache") ||
0 == strcmp(argv[i], "inv64-nocache") ||
0 == strcmp(argv[i], "sse2") ||
+ 0 == strcmp(argv[i], "sse2-test") ||
0 == strcmp(argv[i], "sse2-float") ||
0 == strcmp(argv[i], "copy") ||
0 == strcmp(argv[i], "read") ||
@@ -457,6 +463,7 @@ main(int argc, char **argv)
}
saturate(src, n);
+
if (0 == strcmp(method, "div")) {
while (nloops-- > 0) {
unpremultiply_with_div(dst, src, n);
@@ -512,6 +519,11 @@ main(int argc, char **argv)
unpremultiply_with_sse2(dst, src, n);
}
}
+ else if (0 == strcmp(method, "sse2-test")) {
+ while (nloops-- > 0) {
+ unpremultiply_with_sse2_test(dst, src, n);
+ }
+ }
else if (0 == strcmp(method, "sse2-float")) {
while (nloops-- > 0) {
unpremultiply_with_sse2_float(dst, src, n);