From bca43fc5c350dece7cac8a12a237e5bf37a666ed Mon Sep 17 00:00:00 2001
From: M Joonas Pihlaja
Date: Tue, 13 Jan 2009 11:10:44 +0200
Subject: [sse2-test] sync it.

---
 unpremultiply-sse2-test.S | 292 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 247 insertions(+), 45 deletions(-)

diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S
index a0dccf4..4d45f7a 100644
--- a/unpremultiply-sse2-test.S
+++ b/unpremultiply-sse2-test.S
@@ -1,21 +1,30 @@
 section .text
 
+;;;
+;;; Unpremultiply routine for SSE2/AMD64.
+;;;
+; We're only using rax-rbp in this file so that
+; conversion to 32 bit SSE2 would be easier by
+; updating the register names and the
+; argument extraction to the calling convention.
+
+; Location of alpha in a 32 bit pixel.
 %define ASHIFT 24
 ;%define ASHIFT 0
 
+;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
+;; of the form
+;;
+;;   (1.0, 255/i, 255/i, 255/i)   for ASHIFT=0
+;;   (255/i, 255/i, 255/i, 1.0)   for ASHIFT=24
+;;
+;; in 8.8 bit fixed point format.
 align 16
-; Reciprocal table with 64 bit entries of a 4x16 vectors
-; of the form
-;
-; (255/i, 255/i, 255/i, 1.0)   for ASHIFT=0
-; (255/i, 255/i, 255/i, 1.0)   for ASHIFT=24
-;
-; in 8.8 fixed point format.
 reciprocal_table_Q:
         dq 0
 %assign i 1
-%rep 255
-%assign recip (255*256 / i)
+%rep 255
+%assign recip 255*256 / i
 %if ASHIFT == 0
         dw 256, recip, recip, recip
 %elif ASHIFT==24
@@ -24,25 +33,153 @@ reciprocal_table_Q:
 %assign i i+1
 %endrep
 
-;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;; Reciprocal table with 32 bit entries of ceil(255/i) in
+;; 16.16 bit fixed point.
+reciprocal_table_D:
+        dd 0
+%assign i 1
+%rep 255
+%assign recip (255*65536 + i-1) / i
+        dd recip
+%assign i i+1
+%endrep
+
+unpremultiply_single_pixels:
+;; Slower version for the odd pixels at the beginning and
+;; end.
 ;;
-global unpremultiply_with_sse2_test
-unpremultiply_with_sse2_test:
-        mov r8, rdx
-        shr r8, 2               ; TODO: left over pixels.
-        test r8, r8
-        jnz .else
-        ret
-.else:
-        push rbx
-        xor r9,r9
-        align 16
+;; In:
+;;   uint32_t *dst/rdi:  Destination pixels.
+;;   uint32_t *src/rsi:  Source pixels.
+;;   num_pixels/rcx:     # pixels to unpremultiply.
+;;
+;; Out:
+;;   rdi: dst + 4*num_pixels; advanced past dst.
+;;   rsi: src + 4*num_pixels; advanced past src.
+;;
+;; Saved: rdx
+;; Scratched: rax-rcx, rbp
+        ; Advance src/dst pointers to the end and setup iteration
+        ; from -num_pixels up to 0.
+        lea rsi, [rsi + rcx*4]
+        lea rdi, [rdi + rcx*4]
+        neg rcx
+        jz .out                 ; No pixels at all? -> .out
+
+        push rdx                ; Preserve rdx for our caller.
 .loop:
+        ; Load the next source pixel.
+        mov eax, [rsi + rcx*4]
+
+%if ASHIFT == 24
+        ; Extract alpha and look up the reciprocal.
+        mov ebx, eax
+        mov ebp, eax            ; Initialise result pixel register.
+        shr ebx, 24             ; Load alpha.
+        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+        and ebp, 0xFF000000     ; Mask off non-alpha from result pix.
+
+        ; Do the component from bits 0..7.
+        mov edx, eax
+        and edx, 255            ; Extract the next component.
+        shr eax, 8              ; Shift it out.
+        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
+        shr edx, 16             ; Truncate and move to bits 0..7.
+        or ebp, edx             ; Merge into result pixel.
+
+        ; Do the component from bits 8..15.
+        mov edx, eax
+        and edx, 255            ; Extract the next component.
+        shr eax, 8              ; Shift it out.
+        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
+        and edx, 0x00FF0000     ; Truncate fraction.
+        shr edx, 8              ; Move to bits 8..15.
+        or ebp, edx             ; Merge into result pixel.
+
+        ; Do the component from bits 16..23.
+        and eax, 255            ; Mask off alpha.
+        imul eax, ebx           ; Divide for a result in 8.16 fixed pt.
+        and eax, 0x00FF0000     ; Truncate fraction.
+        or ebp, eax             ; Merge into result pixel.
+
+%elif ASHIFT == 0
+        ; Extract alpha and look up the reciprocal.
+        mov ebx, eax
+        shr eax, 8              ; Shift out alpha.
+        and ebx, 255            ; Mask off non-alpha.
+        mov ebp, ebx            ; Initialise result pixel.
+        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+
+        ; Do the component from bits 8..15.
+        mov edx, eax
+        shr eax, 8
+        and edx, 255
+        imul edx, ebx
+        and edx, 0x00FF0000
+        shr edx, 8
+        or ebp, edx
+
+        ; Do the component from bits 16..23.
+        mov edx, eax
+        shr eax, 8
+        and edx, 255
+        imul edx, ebx
+        and edx, 0x00FF0000
+        or ebp, edx
+
+        ; Do the component from bits 24..31.
+        imul eax, ebx
+        and eax, 0x00FF0000
+        shl eax, 8
+        or ebp, eax
+%endif
+        ; Write the result pixel.
+        mov [rdi + rcx*4], ebp
+
+        inc rcx
+        jnz .loop
+
+        pop rdx                 ; Restore rdx for our caller.
+.out:
+        ret
+
+%macro unpremultiply_pixel_blocks 1
+;; Faster version that does it in blocks of four pixels at a time.
+;; The macro is parameterised on the instruction used to move
+;; an XMM register to memory.
+;;
+;; In:
+;;   uint32_t *dst/rdi:  Destination pixels.
+;;   uint32_t *src/rsi:  Source pixels.
+;;   num_pixels/rdx:     # pixels to unpremultiply.  Only
+;;                       4*floor(num_pixels/4) of them are done.
+;;
+;;   %1:  Instruction used to write an xmm reg to dst.
+;;
+;; Out:
+;;   rcx: num_pixels mod 4 = # leftover pixels.
+;;   rdi: rdi + 16*floor(num_pixels/4); advanced past dst.
+;;   rsi: rsi + 16*floor(num_pixels/4); advanced past src.
+;;
+;; Scratched: xmm1-xmm4, rax-rdx, rbp
+
+        ; Advance the src and dst pointers to the end.  The bias
+        ; of +-15 is used to have the loop condition trigger an exit
+        ; just before we access the last incomplete block.
+        shl rdx, 2              ; Size in bytes.
+        lea rsi, [rsi + rdx - 15]
+        lea rdi, [rdi + rdx - 15]
+        neg rdx
+        add rdx, 15             ; Offset to the last byte of the
+                                ; first block from the end.
+        jmp %%test_cc
+        align 16
+%%loop:
         ; Load four pixels into xmm1.  The prefetchnta here
-        ; hides the difference between movdqa vs. movdqu on
+        ; hides the difference between movdqa vs. movdqu for
         ; aligned input.
-        prefetchnta [rsi + r9*8 + 16*64]
-        movdqu xmm1, [rsi+r9*8]
+        prefetchnta [rsi + rdx + 64*8]  ; TODO: check the prefetch dist?
+        movdqu xmm1, [rsi + rdx]
 
         ; Expand the 8 bit components into 16 bit ones in
         ; two registers.
@@ -50,44 +187,109 @@ unpremultiply_with_sse2_test:
         punpckhbw xmm2, xmm2
         punpcklbw xmm1, xmm1
 
-        ; Load alphas into GPRs.
-        movzx eax, byte [rsi + r9*8 + ASHIFT/8 + 0]
-        movzx ebx, byte [rsi + r9*8 + ASHIFT/8 + 4]
-        movzx ecx, byte [rsi + r9*8 + ASHIFT/8 + 8]
-        movzx edx, byte [rsi + r9*8 + ASHIFT/8 + 12]
+        ; Load alphas into registers.
+        movzx eax, byte [rsi + rdx + ASHIFT/8 + 0]
+        movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4]
+        movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8]
+        movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12]
 
-        ; Fetch component multipliers for each pixel based on the alphas
+        ; Fetch multiplier vectors for each pixel based on the alphas
         ; into the xmm3/xmm4 registers.
         movq xmm3, [reciprocal_table_Q + 8*eax]
         movq xmm4, [reciprocal_table_Q + 8*ecx]
         movhpd xmm3, [reciprocal_table_Q + 8*ebx]
-        movhpd xmm4, [reciprocal_table_Q + 8*edx]
+        movhpd xmm4, [reciprocal_table_Q + 8*ebp]
 
         ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
         ; Treating the components as 0.16 bit fixed point, the pmulhuw
         ; leaves the integer part of x*255/a in the result for the colour
-        ; components and alphas themselves for the alpha components.
+        ; components x in (r,g,b) but leaves the alphas alone.
         pmulhuw xmm1, xmm3
         pmulhuw xmm2, xmm4
 
         ; Pack the four resulting pixels from 16 to 8 bit components.
+        ; Here we saturate the result in case the input was superluminant.
         packuswb xmm1, xmm2
 
         ; Write the result.
-        ;  - When the destination is expected (say due to aliasing)
-        ;    to be in at least the L2 cache then the write should
-        ;    be done using movdqa or movdqu.
-        ;
-        ;  - Otherwise the destination won't be or fit in any cache level
-        ;    in which case we should use movntdq.
-
-;       movdqa [rdi+r9*8], xmm1
-        movntdq [rdi+r9*8], xmm1
-
-        add r9, 2
-        dec r8
-        jnz .loop
+        %1 [rdi + rdx], xmm1
+
+        ; Increment to the next pixel.  When this add overflows to >= 0
+        ; then the next read of a block would venture past the end of
+        ; the buffer.
+        add rdx, 16
+%%test_cc:
+        jnc %%loop
+
+        ; Offset the pointers back to the last incomplete block.
+        lea rsi, [rsi + rdx]
+        lea rdi, [rdi + rdx]
+
+        ; Compute the # leftover pixels.
+        lea rcx, [rdx - 15]
+        neg rcx
+        and rcx, 15             ; # bytes leftover.
+        shr rcx, 2              ; # pixels leftover.
+%endmacro
+
+global unpremultiply_with_sse2_test
+
+unpremultiply_with_sse2_test:
+;;
+;; void unpremultiply_with_sse2(
+;;      uint32_t *dst/rdi,
+;;      uint32_t const *src/rsi,
+;;      ulong n/rdx);
+;;
+;; This is the main entry point callable from the outside.
+;; The calling convention used here is the ELF64 one.
+;;
+        ; Save callee-saved registers.
+        push rbp
+        push rbx
+
+        ; Save start dst for alignment tests later.
+        mov rcx, rdi
+
+        ; If we don't have enough pixels for at least a few iterations
+        ; of blocked unpremultiplication then do the pixels one at a time.
+        cmp rdx, 3+4*4+3        ; Max. pre/post align + 4 blocks.
+        jae .do_blocked
+        mov rcx, rdx            ; Pixel count.
+        call unpremultiply_single_pixels
+        jmp .out
+
+.do_blocked:
+        ; If the destination pointer isn't even aligned to uint32_t
+        ; then we can't align it to 0 mod 16 using single pixels.
+        test rcx, 3
+        jz .can_align_dst
+        unpremultiply_pixel_blocks movdqu
+        jmp .do_leftovers
+
+.can_align_dst:
+        ; Align the destination pointer to 0 mod 16 by
+        ; doing 0..3 single pixels.
+        neg rcx
+        and rcx, 15             ; # bytes to align to 16.
+        shr rcx, 2              ; # pixels to align to 16.
+        sub rdx, rcx
+        call unpremultiply_single_pixels
+
+        ; If the source and dest are exactly aliased or
+        ; the image is fairly small then use movdqa writes.
+        cmp rdi, rsi            ; Use movdqa for aliased src, dst.
+        jz .1
+        cmp rdx, 8192           ; ... or if the src and dest are small.
+        jc .1
+        unpremultiply_pixel_blocks movntdq
+        jmp .do_leftovers
+.1:
+        unpremultiply_pixel_blocks movdqa
+.do_leftovers:
+        call unpremultiply_single_pixels
 .out:
         pop rbx
+        pop rbp
         ret
-- 
cgit v1.2.3
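
As a reading aid for the fixed-point arithmetic above, here is a rough C model of what
the scalar path (unpremultiply_single_pixels with ASHIFT == 24, alpha in the top byte)
computes per pixel. It is only an illustrative sketch: the table and function names are
made up here, not part of the patch, and the table mirrors the ceil(255*65536/i) entries
of reciprocal_table_D.

    #include <stdint.h>

    /* 16.16 fixed-point reciprocals, ceil(255*65536/i); entry 0 is 0 so that
       alpha == 0 maps every colour component to 0, as reciprocal_table_D does. */
    static uint32_t recip16_16[256];

    static void init_recip(void)
    {
        recip16_16[0] = 0;
        for (uint32_t i = 1; i < 256; i++)
            recip16_16[i] = (255u * 65536u + i - 1) / i;
    }

    /* Unpremultiply one pixel with alpha in bits 24..31 (ASHIFT == 24).
       Each colour component x becomes the integer part of x*255/a, truncated
       to 8 bits, while the alpha byte passes through unchanged. */
    static uint32_t unpremultiply_pixel(uint32_t p)
    {
        uint32_t a = p >> 24;
        uint32_t m = recip16_16[a];                    /* ~255/a in 16.16 fixed pt. */
        uint32_t b = (((p      ) & 255) * m >> 16) & 255;
        uint32_t g = (((p >>  8) & 255) * m >> 16) & 255;
        uint32_t r = (((p >> 16) & 255) * m >> 16) & 255;
        return (p & 0xFF000000u) | (r << 16) | (g << 8) | b;
    }

The blocked SSE2 path performs the same division four pixels at a time: reciprocal_table_Q
stores the per-alpha multipliers as 8.8 fixed-point vectors with 1.0 in the alpha lane, and
pmulhuw keeps the high 16 bits of each 16x16 bit product, i.e. the integer part of x*255/a
for the colour lanes while the alpha lane is left alone, as the patch's own comments describe.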