diff options
author | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-10 22:32:01 +0200 |
---|---|---|
committer | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-10 22:32:01 +0200 |
commit | fa34cbbd7ace9918979a8519f6b99b5909c84f75 (patch) | |
tree | 3ba65dff90e67443da649be286e33581e18aeb99 | |
parent | 3eb27ac1329e5c6335593d40bc9b77ae27884b38 (diff) |
Add new tighter SSE2 unpremultiply.
The same issues as with memcpy are now affecting SSE2 enabled unpremultiplication:
the choice of movntdq vs. movdqa vs. movdqu for writing to the destination buffer
is crucial, but depends on how the result is used and what size it is. If it
fits in L2 and is going to be used quickly, then it makes sense to use movdqa/dqu
to write to the destination. If it's not going to be used or doesn't fit then
it's far better to use movntdq.
-rw-r--r-- | unpremultiply-sse2-test.S | 93 | ||||
-rw-r--r-- | unpremultiply.c | 16 |
2 files changed, 107 insertions, 2 deletions
diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S new file mode 100644 index 0000000..a0dccf4 --- /dev/null +++ b/unpremultiply-sse2-test.S @@ -0,0 +1,93 @@ + section .text + +%define ASHIFT 24 +;%define ASHIFT 0 + + align 16 +; Reciprocal table with 64 bit entries of a 4x16 vectors +; of the form +; +; (255/i, 255/i, 255/i, 1.0) for ASHIFT=0 +; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24 +; +; in 8.8 fixed point format. +reciprocal_table_Q: + dq 0 +%assign i 1 +%rep 255 +%assign recip (255*256 / i) +%if ASHIFT == 0 + dw 256, recip, recip, recip +%elif ASHIFT==24 + dw recip, recip, recip, 256 +%endif +%assign i i+1 +%endrep + +;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) +;; +global unpremultiply_with_sse2_test +unpremultiply_with_sse2_test: + mov r8, rdx + shr r8, 2 ; TODO: left over pixels. + test r8, r8 + jnz .else + ret +.else: + push rbx + xor r9,r9 + align 16 +.loop: + ; Load four pixels into xmm1. The prefetchnta here + ; hides the difference between movdqa vs. movdqu on + ; aligned input. + prefetchnta [rsi + r9*8 + 16*64] + movdqu xmm1, [rsi+r9*8] + + ; Expand the 8 bit components into 16 bit ones in + ; two registers. + movdqa xmm2, xmm1 + punpckhbw xmm2, xmm2 + punpcklbw xmm1, xmm1 + + ; Load alphas into GPRs. + movzx eax, byte [rsi + r9*8 + ASHIFT/8 + 0] + movzx ebx, byte [rsi + r9*8 + ASHIFT/8 + 4] + movzx ecx, byte [rsi + r9*8 + ASHIFT/8 + 8] + movzx edx, byte [rsi + r9*8 + ASHIFT/8 + 12] + + ; Fetch component multipliers for each pixel based on the alphas + ; into the xmm3/xmm4 registers. + movq xmm3, [reciprocal_table_Q + 8*eax] + movq xmm4, [reciprocal_table_Q + 8*ecx] + movhpd xmm3, [reciprocal_table_Q + 8*ebx] + movhpd xmm4, [reciprocal_table_Q + 8*edx] + + ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. 
+ ; Treating the components as 0.16 bit fixed point, the pmulhuw + ; leaves the integer part of x*255/a in the result for the colour + ; components and alphas themselves for the alpha components. + pmulhuw xmm1, xmm3 + pmulhuw xmm2, xmm4 + + ; Pack the four resulting pixels from 16 to 8 bit components. + packuswb xmm1, xmm2 + + ; Write the result. + ; - When the destination is expected (say due to aliasing) + ; to be in at least the L2 cache then the write should + ; be done using movdqa or movdqu. + ; + ; - Otherwise the destination won't be or fit in any cache level + ; in which case we should use movntdq. + +; movdqa [rdi+r9*8], xmm1 + movntdq [rdi+r9*8], xmm1 + + add r9, 2 + dec r8 + jnz .loop + +.out: + pop rbx + ret diff --git a/unpremultiply.c b/unpremultiply.c index 99932fb..8e6a25d 100644 --- a/unpremultiply.c +++ b/unpremultiply.c @@ -1,7 +1,8 @@ #define RUN_ME /* nasm -g -f elf64 unpremultiply-sse2.S +nasm -g -f elf64 unpremultiply-sse2-test.S nasm -g -f elf64 unpremultiply-sse2-float.S -gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o unpremultiply-sse2-float.o $0 +gcc -W -Wall -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2*.o $0 exit $? */ #include <assert.h> @@ -28,6 +29,7 @@ exit $? #define BMASK (255 << BSHIFT) void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n); +void unpremultiply_with_sse2_test(uint32_t *dst, uint32_t const *src, size_t n); void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n); static void __attribute__((noinline)) @@ -408,7 +410,7 @@ int main(int argc, char **argv) { long nloops = argc > 1 ? 
atol(argv[1]) : 1; - size_t n = 2*1024*1024; + size_t n = 2*1024*1024*0 + (256*1024/4)/2; uint32_t *dst = calloc(n, 4); uint32_t *src = calloc(n, 4); char const *method = "lut"; @@ -435,6 +437,9 @@ main(int argc, char **argv) else if (0 == strcmp(argv[i], "gradient")) { fill_gradient (src, n); } + else if (0 == strcmp(argv[i], "aliased")) { + dst = src; + } else if (0 == strcmp(argv[i], "div") || 0 == strcmp(argv[i], "lut") || 0 == strcmp(argv[i], "inv32") || @@ -443,6 +448,7 @@ main(int argc, char **argv) 0 == strcmp(argv[i], "inv32-nocache") || 0 == strcmp(argv[i], "inv64-nocache") || 0 == strcmp(argv[i], "sse2") || + 0 == strcmp(argv[i], "sse2-test") || 0 == strcmp(argv[i], "sse2-float") || 0 == strcmp(argv[i], "copy") || 0 == strcmp(argv[i], "read") || @@ -457,6 +463,7 @@ main(int argc, char **argv) } saturate(src, n); + if (0 == strcmp(method, "div")) { while (nloops-- > 0) { unpremultiply_with_div(dst, src, n); @@ -512,6 +519,11 @@ main(int argc, char **argv) unpremultiply_with_sse2(dst, src, n); } } + else if (0 == strcmp(method, "sse2-test")) { + while (nloops-- > 0) { + unpremultiply_with_sse2_test(dst, src, n); + } + } else if (0 == strcmp(method, "sse2-float")) { while (nloops-- > 0) { unpremultiply_with_sse2_float(dst, src, n); |