diff options
author | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-13 12:34:41 +0200 |
---|---|---|
committer | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-13 12:34:41 +0200 |
commit | fa5f2c66156c0ba1b6cdc2df768226f149eb6d3f (patch) | |
tree | 563489b1480e14b8aa33f35b2d508df4bb5d6b35 | |
parent | 321f658793fc427d66b877f81036c40518419179 (diff) |
Removed crufty SSE2 versions.
-rw-r--r-- | unpremultiply-sse2-float.S | 105 | ||||
-rw-r--r-- | unpremultiply-sse2-test.S | 299 | ||||
-rw-r--r-- | unpremultiply-sse2.S | 337 | ||||
-rw-r--r-- | unpremultiply.c | 18 |
4 files changed, 264 insertions, 495 deletions
diff --git a/unpremultiply-sse2-float.S b/unpremultiply-sse2-float.S deleted file mode 100644 index b8c9182..0000000 --- a/unpremultiply-sse2-float.S +++ /dev/null @@ -1,105 +0,0 @@ - section .text - -%macro function 1 - global %1 -%1: -%endmacro - -%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d)) - -; Unpremultiply a pixel in-place with uint32 components in xmm register %1. -; Invariant: -; xmm0: 0 -; xmm6: 255.0f -; xmm7: (?,?,1.0f,?) -; Scratch: xmm5 -%macro unpremultiply1 1 - cvtdq2ps %1, %1 ; uint32 components -> float - rcpss xmm7, %1 ; xmm7: (?,?,1.0,1/a) - mulss xmm7, xmm6 ; xmm7: (?,?,1.0,255/a), xmm6: 255.0 - shufps xmm5, xmm7, SELECT(0,1,0,0) ; xmm5: (255/a,1.0,?,?) - shufps xmm5, xmm7, SELECT(0,0,3,2); xmm5: (255/a,255/a,255/a,1.0) - mulps %1, xmm5 ; %1: (255*r/a,.., 255*b/a, a) - cvtps2dq %1, %1 ; float components -> uint32 -%endmacro - -; Unpremultiply two pixels in-place with uint16 components in xmm register %1. -; Invariant: as above. -; Scratch: xmm4-5 -%macro unpremultiply2 1 - movdqa xmm4, %1 - punpckhwd xmm4, xmm0 - punpcklwd %1, xmm0 - unpremultiply1 xmm4 - unpremultiply1 %1 - packssdw %1, xmm4 -%endmacro - -; Unpremultiply four pixels in-place with uint8 components in xmm register %1. -; Invariant: as above. -; Scratch: xmm3-5 -%macro unpremultiply4 1 - movdqa xmm3, %1 - punpckhbw xmm3, xmm0 - punpcklbw %1, xmm0 - unpremultiply2 xmm3 - unpremultiply2 %1 - packuswb %1, xmm3 -%endmacro - -; Input: -; %1: movdqa or movdqu, depending on the alignment of rsi and rdi. -; r8: number of times N > 0 to loop. -; rsi: uint32_t[4*N]: source pixels. -; rdi: uint32_t[4*N]: destination pixels. -; Invariant: as above. -; Scratch: rsi,rdi,r8-9,xmm2-5 -%macro unpremultiply_loop_with 1 - xor r9,r9 ; index register. 
- align 16 -%%loop: - prefetchnta [rsi + r9*8 + 16*64] - %1 xmm2, [rsi+r9*8] - unpremultiply4 xmm2 - movntdq [rdi+r9*8], xmm2 - - add r9, 2 - dec r8 - jnz %%loop -%endmacro - -;; void unpremultiply_with_sse2_float(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) -;; -function unpremultiply_with_sse2_float - mov r8, rdx - shr r8, 2 ; TODO: left over pixels. - test r8, r8 - jnz .setup_invariants - ret - -.setup_invariants: - pxor xmm0, xmm0 ; constant zero for unpacking. - - mov rax, 1 - movd xmm7, eax - cvtdq2ps xmm7, xmm7 - shufps xmm7, xmm7, SELECT(1,1,0,1) ; xmm7: (0,0,1.0f,0) - - mov rax, 255 - movd xmm6, eax - cvtdq2ps xmm6, xmm6 - shufps xmm6, xmm6, SELECT(1,1,1,0) ; xmm6: 255f - - ; Decide on whether to use movdqu or movdqa based on source - ; alignment. We always use movntdq to write the dest. - test rsi, 15 - jz .aligned_case - jmp .unaligned_case - -.aligned_case: - unpremultiply_loop_with movdqa - ret - -.unaligned_case: - unpremultiply_loop_with movdqu - ret diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S deleted file mode 100644 index e8bef21..0000000 --- a/unpremultiply-sse2-test.S +++ /dev/null @@ -1,299 +0,0 @@ -;;; -;;; Unpremultiply routine for SSE2/AMD64. -;;; -;;; This file exports a function unpremultiply_with_sse2_test() that -;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels. -;;; - section .text - -; We're only using rax-rbp in this file so that -; conversion to 32 bit SSE2 would be easier by -; updating the register names and the -; argument extraction to the calling convention. - -; Location of alpha in a 32 bit pixel. Alpha measures opaqueness. -%define ASHIFT 24 -;%define ASHIFT 0 - -;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors -;; of the form -;; -;; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0 -;; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24 -;; -;; in 8.8 bit fixed point format. 
- align 16 -reciprocal_table_Q: - dq 0 -%assign i 1 -%rep 255 -%assign recip 255*256 / i -%if ASHIFT == 0 - dw 256, recip, recip, recip -%elif ASHIFT==24 - dw recip, recip, recip, 256 -%endif -%assign i i+1 -%endrep - -;; Reciprocal table with 32 bit entries of ceil(255/i) in -;; 16.16 bit fixed point. -reciprocal_table_D: - dd 0 -%assign i 1 -%rep 255 -%assign recip (255*65536 + i-1) / i - dd recip -%assign i i+1 -%endrep - -unpremultiply_single_pixels: -;; Slower version for the odd pixels at the ends. -;; -;; In: -;; uint32_t *dst/rdi: Destination pixels. -;; uint32_t *src/rsi: Source pixels. -;; num_pixels/rcx: # pixels to unpremultiply. -;; -;; Out: -;; rdi: dst + 4*num_pixels; advanced past dst. -;; rsi: src + 4*num_pixels; advanced past src. -;; -;; Saved: rdx -;; Scratched: rax-rcx, rbp - ; Advance src/dst pointers to the end and setup iteration - ; from -num_pixels up to 0. - lea rsi, [rsi + rcx*4] - lea rdi, [rdi + rcx*4] - neg rcx - jz .out ; No pixels at all? -> .out - - push rdx ; Save callee-save register. -.loop: - ; Load the next source pixel. - mov eax, [rsi + rcx*4] - -%if ASHIFT == 24 - ; Extract alpha and look up the reciprocal. - mov ebx, eax - mov ebp, eax ; Initialise result pixel register. - and ebp, 0xFF000000 ; Mask off non-alpha from result pix. - jz .next - shr ebx, 24 ; Load alpha. - mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. - - ; Do the component from bits 0..7. - mov edx, eax - and edx, 255 ; Extract the next component. - shr eax, 8 ; Shift it out. - imul edx, ebx ; Divide for a result in 8.16 fixed pt. - shr edx, 16 ; Truncate and move to bits 0..7. - or ebp, edx ; Merge into result pixel. - - ; Do the component from bits 8..15. - mov edx, eax - and edx, 255 ; Extract the next component. - shr eax, 8 ; Shift it out. - imul edx, ebx ; Divide for a result in 8.16 fixed pt. - and edx, 0x00FF0000 ; Truncate fraction. - shr edx, 8 ; Move to bits 8..15. - or ebp, edx ; Merge into result pixel. 
- - ; Do the component from bits 16..23. - and eax, 255 ; Mask off alpha. - imul eax, ebx ; Divide for a result in 8.16 fixed pt. - and eax, 0x00FF0000 ; Truncate fraction. - or ebp, eax ; Merge into result pixel. - -%elif ASHIFT == 0 - ; Extract alpha and loop up the reciprocal. - mov ebx, eax - shr eax, 8 ; Shift out alpha. - and ebp, 255 ; Mask off non-alpha. - mov ebx, ebp ; Initialise result pixel. - jz .next - mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. - - ; Do the component from bits 8..15. - mov edx, eax - shr eax, 8 - and edx, 255 - imul edx, ebx - and edx, 0x00FF0000 - shr edx, 8 - or ebp, edx - - ; Do the component from bits 16..23 - mov edx, eax - shr eax, 8 - and edx, 255 - imul edx, ebx - and edx, 0x00FF0000 - or ebp, edx - - ; Do the component from bits 24..31. - imul eax, ebx - and eax, 0x00FF0000 - shl eax, 8 - or ebp, eax -%endif -.next: - ; Write the result pixel. - mov [rdi + rcx*4], ebp - - inc rcx - jnz .loop - - pop rdx ; Restore callee-save reg. -.out: - ret - -%macro unpremultiply_pixel_blocks 1 -;; Faster version that does it in blocks of four pixels at a time. -;; The macro is parameterised on the instruction used to move -;; an XMM register to memory. -;; -;; In: -;; uint32_t *src/rdi: Destination pixels. -;; uint32_t *dst/rsi: Source pixels. -;; num_pixels/rdx: # pixels to unpremultiply. Only -;; floor(num_pixels/4) will be. -;; -;; %1: Instruction used to write an xmm reg to dst. -;; -;; Out: -;; rcx: num_pixels mod 4 = # leftover pixels. -;; rdi: rdi + 16*floor(num_pixels/4); advanced past dst. -;; rsi: rsi + 16*floor(num_pixels/4); advanced past src. -;; -;; Scratched: xmm1-xmm4, rax-rdx, rbx - ; Advance the src and dst pointers to the end. The bias - ; of +-15 is used to have the loop condition trigger an exit - ; just before we access the last incomplete block. - shl rdx, 2 ; Size in bytes. 
- lea rsi, [rsi + rdx - 15] - lea rdi, [rdi + rdx - 15] - neg rdx - add rdx, 15 ; Offset to the last byte of the - ; first block from the end. - jmp %%test_cc - align 16 -%%loop: - ; Load four pixels into xmm1. The prefetchnta here - ; hides the difference between movdqa vs. movdqu for - ; aligned input. - prefetchnta [rsi + rdx + 64*8] ; TODO: check the prefetch dist? - movdqu xmm1, [rsi + rdx] - - ; Expand the 8 bit components into 16 bit ones in - ; two registers. - movdqa xmm2, xmm1 - punpckhbw xmm2, xmm2 - punpcklbw xmm1, xmm1 - - ; Load alphas into registers. - movzx eax, byte [rsi + rdx + ASHIFT/8 + 0] - movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4] - movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8] - movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12] - - ; Fetch multplier vectors for each pixel based on the alphas - ; into the xmm3/xmm4 registers. - movq xmm3, [reciprocal_table_Q + 8*eax] - movq xmm4, [reciprocal_table_Q + 8*ecx] - movhpd xmm3, [reciprocal_table_Q + 8*ebx] - movhpd xmm4, [reciprocal_table_Q + 8*ebp] - - ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. - ; Treating the components as 0.16 bit fixed point, the pmulhuw - ; leaves the integer part of x*255/a in the result for the colour - ; components x in (r,g,b) but leaves the alphas alone. - pmulhuw xmm1, xmm3 - pmulhuw xmm2, xmm4 - - ; Pack the four resulting pixels from 16 to 8 bit components. - ; Here we saturate the result in case the input was superluminant. - packuswb xmm1, xmm2 - - ; Write the result. - %1 [rdi + rdx], xmm1 - - ; Increment to the next pixel. When this add overflows to >= 0 - ; then the next read of a block would venture past the end of - ; the buffer. - add rdx, 16 -%%test_cc: - jnc %%loop - - ; Offset the pointers back to the last incomplete block. - lea rsi, [rsi + rdx] - lea rdi, [rdi + rdx] - - ; Compute the # leftover pixels. - lea rcx, [rdx - 15] - neg rcx - and rcx, 15 ; # bytes leftover. - shr rcx, 2 ; # pixels leftover. 
-%endmacro - -global unpremultiply_with_sse2_test - -unpremultiply_with_sse2_test: -;; -;; void unpremultiply_with_sse2_test( -;; uint32_t *dst/rdi, -;; uint32_t const *src/rsi, -;; ulong n/rdx); -;; -;; This is the main entry point callable from the outside. -;; The calling convention used here is the ELF64 one. -;; - ; Save callee-saved registers. - push rbp - push rbx - - ; Save start of dst for alignment tests later. - mov rcx, rdi - - ; If we don't have enough pixels for at least a few iterations - ; of blocked unpremultiplication then do the pixels one at a time. - cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks. - jae .do_blocked - mov rcx, rdx ; Pixel count. - call unpremultiply_single_pixels - jmp .out - -.do_blocked: - ; If the destination pointer isn't even aligned to uint32_t - ; then we can't align it to 0 mod 16 using single pixels. - test rcx, 3 - jz .can_align_dst - unpremultiply_pixel_blocks movdqu - jmp .do_leftovers - -.can_align_dst: - ; Align the destination pointer to 0 mod 16 by - ; doing 0..3 single pixels. - neg rcx - and rcx, 15 ; # bytes to align to 16. - shr rcx, 2 ; # pixels to align to 16. - sub rdx, rcx - call unpremultiply_single_pixels - - ; If the source and dest are exactly aliased or - ; the image is fairly small then use movdqa writes. - cmp rdi, rsi ; Use movdqa for aliased src, dst. - jz .1 - cmp rdx, 8192 ; ... or if the src and dest are small. - jc .1 - unpremultiply_pixel_blocks movntdq - jmp .do_leftovers -.1: - unpremultiply_pixel_blocks movdqa - -.do_leftovers: - call unpremultiply_single_pixels -.out: - pop rbx - pop rbp - ret diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S index 1b699cc..e0650a3 100644 --- a/unpremultiply-sse2.S +++ b/unpremultiply-sse2.S @@ -1,17 +1,33 @@ +;;; +;;; Unpremultiply routine for SSE2/AMD64. +;;; +;;; This file exports a function unpremultiply_with_sse2() that +;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels. 
+;;; section .text +; We're only using rax-rbp in this file so that +; conversion to 32 bit SSE2 would be easier by +; updating the register names and the +; argument extraction to the calling convention. + +; Location of alpha in a 32 bit pixel. Alpha measures opaqueness. %define ASHIFT 24 ;%define ASHIFT 0 -%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d)) - -; Reciprocal table with 64 bit entries of a 4x16 vector -; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format. +;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors +;; of the form +;; +;; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0 +;; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24 +;; +;; in 8.8 bit fixed point format. + align 16 reciprocal_table_Q: dq 0 %assign i 1 -%rep 255 -%assign recip ((255*256 + i-1) / i) +%rep 255 +%assign recip 255*256 / i %if ASHIFT == 0 dw 256, recip, recip, recip %elif ASHIFT==24 @@ -20,91 +36,264 @@ reciprocal_table_Q: %assign i i+1 %endrep -; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12. -; The pixels must not be supersaturated. -; Output: xmm1: u8[], Four unpremultiplied pixels. -; Invariant: -; xmm0: 0 -; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) -; Scratch: xmm2, rax, rbx, rcx, rdx -%macro unpremultiply_xmm1 0 +;; Reciprocal table with 32 bit entries of ceil(255/i) in +;; 16.16 bit fixed point. +reciprocal_table_D: + dd 0 +%assign i 1 +%rep 255 +%assign recip (255*65536 + i-1) / i + dd recip +%assign i i+1 +%endrep + +unpremultiply_single_pixels: +;; Slower version for the odd pixels at the ends. +;; +;; In: +;; uint32_t *dst/rdi: Destination pixels. +;; uint32_t *src/rsi: Source pixels. +;; num_pixels/rcx: # pixels to unpremultiply. +;; +;; Out: +;; rdi: dst + 4*num_pixels; advanced past dst. +;; rsi: src + 4*num_pixels; advanced past src. +;; +;; Saved: rdx +;; Scratched: rax-rcx, rbp + ; Advance src/dst pointers to the end and setup iteration + ; from -num_pixels up to 0. 
+ lea rsi, [rsi + rcx*4] + lea rdi, [rdi + rcx*4] + neg rcx + jz .out ; No pixels at all? -> .out + + push rdx ; Save callee-save register. +.loop: + ; Load the next source pixel. + mov eax, [rsi + rcx*4] + +%if ASHIFT == 24 + ; Extract alpha and look up the reciprocal. + mov ebx, eax + mov ebp, eax ; Initialise result pixel register. + and ebp, 0xFF000000 ; Mask off non-alpha from result pix. + jz .next + shr ebx, 24 ; Load alpha. + mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. + + ; Do the component from bits 0..7. + mov edx, eax + and edx, 255 ; Extract the next component. + shr eax, 8 ; Shift it out. + imul edx, ebx ; Divide for a result in 8.16 fixed pt. + shr edx, 16 ; Truncate and move to bits 0..7. + or ebp, edx ; Merge into result pixel. + + ; Do the component from bits 8..15. + mov edx, eax + and edx, 255 ; Extract the next component. + shr eax, 8 ; Shift it out. + imul edx, ebx ; Divide for a result in 8.16 fixed pt. + and edx, 0x00FF0000 ; Truncate fraction. + shr edx, 8 ; Move to bits 8..15. + or ebp, edx ; Merge into result pixel. + + ; Do the component from bits 16..23. + and eax, 255 ; Mask off alpha. + imul eax, ebx ; Divide for a result in 8.16 fixed pt. + and eax, 0x00FF0000 ; Truncate fraction. + or ebp, eax ; Merge into result pixel. + +%elif ASHIFT == 0 + ; Extract alpha and look up the reciprocal. + mov ebx, eax + shr eax, 8 ; Shift out alpha. + and ebp, 255 ; Mask off non-alpha. + mov ebx, ebp ; Initialise result pixel. + jz .next + mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. + + ; Do the component from bits 8..15. + mov edx, eax + shr eax, 8 + and edx, 255 + imul edx, ebx + and edx, 0x00FF0000 + shr edx, 8 + or ebp, edx + + ; Do the component from bits 16..23 + mov edx, eax + shr eax, 8 + and edx, 255 + imul edx, ebx + and edx, 0x00FF0000 + or ebp, edx + + ; Do the component from bits 24..31. + imul eax, ebx + and eax, 0x00FF0000 + shl eax, 8 + or ebp, eax +%endif +.next: + ; Write the result pixel.
+ mov [rdi + rcx*4], ebp + + inc rcx + jnz .loop + + pop rdx ; Restore callee-save reg. +.out: + ret + +%macro unpremultiply_pixel_blocks 1 +;; Faster version that does it in blocks of four pixels at a time. +;; The macro is parameterised on the instruction used to move +;; an XMM register to memory. +;; +;; In: +;; uint32_t *dst/rdi: Destination pixels. +;; uint32_t *src/rsi: Source pixels. +;; num_pixels/rdx: # pixels to unpremultiply. Only +;; floor(num_pixels/4) will be unpremultiplied. +;; +;; %1: Instruction used to write an xmm reg to dst. +;; +;; Out: +;; rcx: num_pixels mod 4 = # leftover pixels. +;; rdi: rdi + 16*floor(num_pixels/4); advanced past dst. +;; rsi: rsi + 16*floor(num_pixels/4); advanced past src. +;; +;; Scratched: xmm1-xmm4, rax-rdx, rbx + ; Advance the src and dst pointers to the end. The bias + ; of +-15 is used to have the loop condition trigger an exit + ; just before we access the last incomplete block. + shl rdx, 2 ; Size in bytes. + lea rsi, [rsi + rdx - 15] + lea rdi, [rdi + rdx - 15] + neg rdx + add rdx, 15 ; Offset to the last byte of the + ; first block from the end. + jmp %%test_cc + align 16 +%%loop: + ; Load four pixels into xmm1. The prefetchnta here + ; hides the difference between movdqa vs. movdqu for + ; aligned input. + prefetchnta [rsi + rdx + 64*8] ; TODO: check the prefetch dist? + movdqu xmm1, [rsi + rdx] + ; Expand the 8 bit components into 16 bit ones in ; two registers. movdqa xmm2, xmm1 - punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a) - punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a) + punpckhbw xmm2, xmm2 + punpcklbw xmm1, xmm1 - ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. - pextrw ebx, xmm1, 4+ASHIFT/8 ; Extract pixel alphas into registers. - pextrw edx, xmm2, 4+ASHIFT/8 - pextrw eax, xmm1, 0+ASHIFT/8 - pextrw ecx, xmm2, 0+ASHIFT/8 - movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers - movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs. + ; Load alphas into registers.
+ movzx eax, byte [rsi + rdx + ASHIFT/8 + 0] + movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4] + movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8] + movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12] + + ; Fetch multiplier vectors for each pixel based on the alphas + ; into the xmm3/xmm4 registers. movq xmm3, [reciprocal_table_Q + 8*eax] movq xmm4, [reciprocal_table_Q + 8*ecx] - pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper. - pshufd xmm6, xmm6, SELECT(1,0,3,2) - por xmm3, xmm5 ; Combine 64 bit upper and lower - por xmm4, xmm6 ; into 128 bit multipliers. - pmullw xmm1, xmm3 ; Multiply components by coefs. - pmullw xmm2, xmm4 ; to produce 8.8 fp components. - psrlw xmm1, 8 ; Take floor of components. - psrlw xmm2, 8 + movhpd xmm3, [reciprocal_table_Q + 8*ebx] + movhpd xmm4, [reciprocal_table_Q + 8*ebp] + + ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. + ; Treating the components as 0.16 bit fixed point, the pmulhuw + ; leaves the integer part of x*255/a in the result for the colour + ; components x in (r,g,b) but leaves the alphas alone. + pmulhuw xmm1, xmm3 + pmulhuw xmm2, xmm4 ; Pack the four resulting pixels from 16 to 8 bit components. + ; Here we saturate the result in case the input was superluminant. packuswb xmm1, xmm2 -%endmacro ; Input: ; %1: movdqa or movdqu, depending on the alignment of rsi and rdi. ; r8: number of times N > 0 to loop. ; rsi: uint32_t[4*N]: source pixels. ; rdi: uint32_t[4*N]: destination pixels. ; Invariant: ; xmm0: 0 ; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) ; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9 %macro unpremultiply_loop_with 1 - xor r9,r9 ; index register. - align 16 -%%loop: - prefetchnta [rsi + r9*8 + 16*64] - %1 xmm1, [rsi+r9*8] - unpremultiply_xmm1 - movntdq [rdi+r9*8], xmm1 - - add r9, 2 - dec r8 - jnz %%loop + ; Write the result. + %1 [rdi + rdx], xmm1 + + ; Increment to the next pixel. When this add overflows to >= 0 + ; then the next read of a block would venture past the end of + ; the buffer.
+ add rdx, 16 +%%test_cc: + jnc %%loop + + ; Offset the pointers back to the last incomplete block. + lea rsi, [rsi + rdx] + lea rdi, [rdi + rdx] + + ; Compute the # leftover pixels. + lea rcx, [rdx - 15] + neg rcx + and rcx, 15 ; # bytes leftover. + shr rcx, 2 ; # pixels leftover. %endmacro -;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) -;; global unpremultiply_with_sse2 -unpremultiply_with_sse2: - mov r8, rdx - shr r8, 2 ; TODO: left over pixels. - test r8, r8 - jnz .setup_invariants - ret -.setup_invariants: +unpremultiply_with_sse2: +;; +;; void unpremultiply_with_sse2( +;; uint32_t *dst/rdi, +;; uint32_t const *src/rsi, +;; ulong n/rdx); +;; +;; This is the main entry point callable from the outside. +;; The calling convention used here is the ELF64 one. +;; + ; Save callee-saved registers. + push rbp push rbx - pxor xmm0, xmm0 ; constant zero for unpacking. - ; Decide on whether to use movdqu or movdqa based on source - ; alignment. We always use mvntdq to write the dest. - test rsi, 15 - jz .aligned_case - jmp .unaligned_case + ; Save start of dst for alignment tests later. + mov rcx, rdi -.aligned_case: - unpremultiply_loop_with movdqa - pop rbx - ret + ; If we don't have enough pixels for at least a few iterations + ; of blocked unpremultiplication then do the pixels one at a time. + cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks. + jae .do_blocked + mov rcx, rdx ; Pixel count. + call unpremultiply_single_pixels + jmp .out + +.do_blocked: + ; If the destination pointer isn't even aligned to uint32_t + ; then we can't align it to 0 mod 16 using single pixels. + test rcx, 3 + jz .can_align_dst + unpremultiply_pixel_blocks movdqu + jmp .do_leftovers + +.can_align_dst: + ; Align the destination pointer to 0 mod 16 by + ; doing 0..3 single pixels. + neg rcx + and rcx, 15 ; # bytes to align to 16. + shr rcx, 2 ; # pixels to align to 16. 
+ sub rdx, rcx + call unpremultiply_single_pixels + + ; If the source and dest are exactly aliased or + ; the image is fairly small then use movdqa writes. + cmp rdi, rsi ; Use movdqa for aliased src, dst. + jz .1 + cmp rdx, 8192 ; ... or if the src and dest are small. + jc .1 + unpremultiply_pixel_blocks movntdq + jmp .do_leftovers +.1: + unpremultiply_pixel_blocks movdqa -.unaligned_case: - unpremultiply_loop_with movdqu +.do_leftovers: + call unpremultiply_single_pixels +.out: pop rbx + pop rbp ret diff --git a/unpremultiply.c b/unpremultiply.c index 2f2b422..6d904d7 100644 --- a/unpremultiply.c +++ b/unpremultiply.c @@ -1,8 +1,6 @@ #define RUN_ME /* nasm -g -f elf64 unpremultiply-sse2.S -nasm -g -f elf64 unpremultiply-sse2-test.S -nasm -g -f elf64 unpremultiply-sse2-float.S -gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2*.o $0 +gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o $0 exit $? */ #include <assert.h> @@ -32,8 +30,6 @@ exit $? 
#define BMASK (255 << BSHIFT) void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n); -void unpremultiply_with_sse2_test(uint32_t *dst, uint32_t const *src, size_t n); -void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n); static void __attribute__((noinline)) unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) @@ -500,8 +496,6 @@ main(int argc, char **argv) 0 == strcmp(argv[i], "inv32-nocache") || 0 == strcmp(argv[i], "inv64-nocache") || 0 == strcmp(argv[i], "sse2") || - 0 == strcmp(argv[i], "sse2-test") || - 0 == strcmp(argv[i], "sse2-float") || 0 == strcmp(argv[i], "copy") || 0 == strcmp(argv[i], "read") || 0 == strcmp(argv[i], "write") || @@ -577,16 +571,6 @@ main(int argc, char **argv) unpremultiply_with_sse2(dst, src, n); } } - else if (0 == strcmp(method, "sse2-test")) { - while (nloops-- > 0) { - unpremultiply_with_sse2_test(dst, src, n); - } - } - else if (0 == strcmp(method, "sse2-float")) { - while (nloops-- > 0) { - unpremultiply_with_sse2_float(dst, src, n); - } - } else if (0 == strcmp(method, "noop")) { /* do nothing. */ } else { |