author    M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>  2009-01-13 12:34:41 +0200
committer M Joonas Pihlaja <jpihlaja@cc.helsinki.fi>  2009-01-13 12:34:41 +0200
commit    fa5f2c66156c0ba1b6cdc2df768226f149eb6d3f
tree      563489b1480e14b8aa33f35b2d508df4bb5d6b35
parent    321f658793fc427d66b877f81036c40518419179
Removed crufty SSE2 versions.
-rw-r--r--  unpremultiply-sse2-float.S | 105
-rw-r--r--  unpremultiply-sse2-test.S  | 299
-rw-r--r--  unpremultiply-sse2.S       | 337
-rw-r--r--  unpremultiply.c            |  18
4 files changed, 264 insertions(+), 495 deletions(-)
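
Background for the diffs below: all three assembly files touched by this
commit compute the same operation, converting premultiplied ARGB pixels
back to non-premultiplied ones by rescaling each colour component x to
x*255/a. A minimal C sketch of that operation for the ASHIFT=24 layout
(alpha in bits 24..31) used throughout; the function name is
illustrative, not the harness's own unpremultiply_with_div:

    #include <stddef.h>
    #include <stdint.h>

    /* Reference unpremultiply, alpha in bits 24..31.  Assumes the
     * input is not supersaturated (each component <= alpha), so the
     * quotients fit in 8 bits. */
    static void
    unpremultiply_ref(uint32_t *dst, const uint32_t *src, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            uint32_t p = src[i];
            uint32_t a = p >> 24;
            if (a == 0) {               /* fully transparent pixel */
                dst[i] = 0;
                continue;
            }
            uint32_t r = ((p >> 16) & 255) * 255 / a;
            uint32_t g = ((p >>  8) & 255) * 255 / a;
            uint32_t b = ( p        & 255) * 255 / a;
            dst[i] = (a << 24) | (r << 16) | (g << 8) | b;
        }
    }
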
diff --git a/unpremultiply-sse2-float.S b/unpremultiply-sse2-float.S
deleted file mode 100644
index b8c9182..0000000
--- a/unpremultiply-sse2-float.S
+++ /dev/null
@@ -1,105 +0,0 @@
- section .text
-
-%macro function 1
- global %1
-%1:
-%endmacro
-
-%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
-
-; Unpremultiply a pixel in-place with uint32 components in xmm register %1.
-; Invariant:
-; xmm0: 0
-; xmm6: 255.0f
-; xmm7: (?,?,1.0f,?)
-; Scratch: xmm5
-%macro unpremultiply1 1
- cvtdq2ps %1, %1 ; uint32 components -> float
- rcpss xmm7, %1 ; xmm7: (?,?,1.0,1/a)
- mulss xmm7, xmm6 ; xmm7: (?,?,1.0,255/a), xmm6: 255.0
- shufps xmm5, xmm7, SELECT(0,1,0,0) ; xmm5: (255/a,1.0,?,?)
- shufps xmm5, xmm7, SELECT(0,0,3,2); xmm5: (255/a,255/a,255/a,1.0)
-	mulps %1, xmm5	; %1: (255*r/a, 255*g/a, 255*b/a, a)
- cvtps2dq %1, %1 ; float components -> uint32
-%endmacro
-
-; Unpremultiply two pixels in-place with uint16 components in xmm register %1.
-; Invariant: as above.
-; Scratch: xmm4-5
-%macro unpremultiply2 1
- movdqa xmm4, %1
- punpckhwd xmm4, xmm0
- punpcklwd %1, xmm0
- unpremultiply1 xmm4
- unpremultiply1 %1
- packssdw %1, xmm4
-%endmacro
-
-; Unpremultiply four pixels in-place with uint8 components in xmm register %1.
-; Invariant: as above.
-; Scratch: xmm3-5
-%macro unpremultiply4 1
- movdqa xmm3, %1
- punpckhbw xmm3, xmm0
- punpcklbw %1, xmm0
- unpremultiply2 xmm3
- unpremultiply2 %1
- packuswb %1, xmm3
-%endmacro
-
-; Input:
-; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
-; r8: number of times N > 0 to loop.
-; rsi: uint32_t[4*N]: source pixels.
-; rdi: uint32_t[4*N]: destination pixels.
-; Invariant: as above.
-; Scratch: rsi,rdi,r8-9,xmm2-5
-%macro unpremultiply_loop_with 1
- xor r9,r9 ; index register.
- align 16
-%%loop:
- prefetchnta [rsi + r9*8 + 16*64]
- %1 xmm2, [rsi+r9*8]
- unpremultiply4 xmm2
- movntdq [rdi+r9*8], xmm2
-
- add r9, 2
- dec r8
- jnz %%loop
-%endmacro
-
-;; void unpremultiply_with_sse2_float(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
-;;
-function unpremultiply_with_sse2_float
- mov r8, rdx
- shr r8, 2 ; TODO: left over pixels.
- test r8, r8
- jnz .setup_invariants
- ret
-
-.setup_invariants:
- pxor xmm0, xmm0 ; constant zero for unpacking.
-
- mov rax, 1
- movd xmm7, eax
- cvtdq2ps xmm7, xmm7
- shufps xmm7, xmm7, SELECT(1,1,0,1) ; xmm7: (0,0,1.0f,0)
-
- mov rax, 255
- movd xmm6, eax
- cvtdq2ps xmm6, xmm6
- shufps xmm6, xmm6, SELECT(1,1,1,0) ; xmm6: 255f
-
- ; Decide on whether to use movdqu or movdqa based on source
- ; alignment. We always use movntdq to write the dest.
- test rsi, 15
- jz .aligned_case
- jmp .unaligned_case
-
-.aligned_case:
- unpremultiply_loop_with movdqa
- ret
-
-.unaligned_case:
- unpremultiply_loop_with movdqu
- ret
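
The float variant deleted above worked a pixel at a time in floating
point: convert the components to floats, form 255/a once with rcpss (a
reciprocal approximation good to about 12 bits) and a multiply by
255.0, broadcast it over the colour slots, multiply, and convert back.
A scalar C sketch of the same idea; note that cvtps2dq rounds to
nearest while the casts below truncate, so this only approximates the
assembly's rounding:

    /* Float unpremultiply of one ASHIFT=24 pixel.  scale stands in
     * for the rcpss+mulss result (roughly 255.0f/a). */
    static uint32_t
    unpremultiply_pixel_float(uint32_t p)
    {
        uint32_t a = p >> 24;
        if (a == 0)
            return 0;
        float scale = 255.0f / (float)a;
        uint32_t r = (uint32_t)(((p >> 16) & 255) * scale);
        uint32_t g = (uint32_t)(((p >>  8) & 255) * scale);
        uint32_t b = (uint32_t)(( p        & 255) * scale);
        return (a << 24) | (r << 16) | (g << 8) | b;
    }
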
diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S
deleted file mode 100644
index e8bef21..0000000
--- a/unpremultiply-sse2-test.S
+++ /dev/null
@@ -1,299 +0,0 @@
-;;;
-;;; Unpremultiply routine for SSE2/AMD64.
-;;;
-;;; This file exports a function unpremultiply_with_sse2_test() that
-;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels.
-;;;
- section .text
-
-; We use only rax-rbp in this file so that a port
-; to 32 bit x86 would need little more than
-; renaming the registers and adapting the
-; argument extraction to the calling convention.
-
-; Location of alpha in a 32 bit pixel. Alpha measures opaqueness.
-%define ASHIFT 24
-;%define ASHIFT 0
-
-;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
-;; of the form
-;;
-;; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0
-;; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24
-;;
-;; in 8.8 bit fixed point format.
- align 16
-reciprocal_table_Q:
- dq 0
-%assign i 1
-%rep 255
-%assign recip 255*256 / i
-%if ASHIFT == 0
- dw 256, recip, recip, recip
-%elif ASHIFT==24
- dw recip, recip, recip, 256
-%endif
-%assign i i+1
-%endrep
-
-;; Reciprocal table with 32 bit entries of ceil(255/i) in
-;; 16.16 bit fixed point.
-reciprocal_table_D:
- dd 0
-%assign i 1
-%rep 255
-%assign recip (255*65536 + i-1) / i
- dd recip
-%assign i i+1
-%endrep
-
-unpremultiply_single_pixels:
-;; Slower version for the odd pixels at the ends.
-;;
-;; In:
-;; uint32_t *dst/rdi: Destination pixels.
-;; uint32_t *src/rsi: Source pixels.
-;; num_pixels/rcx: # pixels to unpremultiply.
-;;
-;; Out:
-;; rdi: dst + 4*num_pixels; advanced past dst.
-;; rsi: src + 4*num_pixels; advanced past src.
-;;
-;; Saved: rdx
-;; Scratched: rax-rcx, rbp
-	; Advance src/dst pointers to the end and set up iteration
- ; from -num_pixels up to 0.
- lea rsi, [rsi + rcx*4]
- lea rdi, [rdi + rcx*4]
- neg rcx
- jz .out ; No pixels at all? -> .out
-
-	push rdx		; Preserve rdx for the caller.
-.loop:
- ; Load the next source pixel.
- mov eax, [rsi + rcx*4]
-
-%if ASHIFT == 24
- ; Extract alpha and look up the reciprocal.
- mov ebx, eax
- mov ebp, eax ; Initialise result pixel register.
- and ebp, 0xFF000000 ; Mask off non-alpha from result pix.
- jz .next
- shr ebx, 24 ; Load alpha.
- mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
-
- ; Do the component from bits 0..7.
- mov edx, eax
- and edx, 255 ; Extract the next component.
- shr eax, 8 ; Shift it out.
- imul edx, ebx ; Divide for a result in 8.16 fixed pt.
- shr edx, 16 ; Truncate and move to bits 0..7.
- or ebp, edx ; Merge into result pixel.
-
- ; Do the component from bits 8..15.
- mov edx, eax
- and edx, 255 ; Extract the next component.
- shr eax, 8 ; Shift it out.
- imul edx, ebx ; Divide for a result in 8.16 fixed pt.
- and edx, 0x00FF0000 ; Truncate fraction.
- shr edx, 8 ; Move to bits 8..15.
- or ebp, edx ; Merge into result pixel.
-
- ; Do the component from bits 16..23.
- and eax, 255 ; Mask off alpha.
- imul eax, ebx ; Divide for a result in 8.16 fixed pt.
- and eax, 0x00FF0000 ; Truncate fraction.
- or ebp, eax ; Merge into result pixel.
-
-%elif ASHIFT == 0
-	; Extract alpha and look up the reciprocal.
- mov ebx, eax
- shr eax, 8 ; Shift out alpha.
-	and ebx, 255		; Mask off non-alpha.
-	mov ebp, ebx		; Initialise result pixel.
- jz .next
- mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
-
- ; Do the component from bits 8..15.
- mov edx, eax
- shr eax, 8
- and edx, 255
- imul edx, ebx
- and edx, 0x00FF0000
- shr edx, 8
- or ebp, edx
-
- ; Do the component from bits 16..23
- mov edx, eax
- shr eax, 8
- and edx, 255
- imul edx, ebx
- and edx, 0x00FF0000
- or ebp, edx
-
- ; Do the component from bits 24..31.
- imul eax, ebx
- and eax, 0x00FF0000
- shl eax, 8
- or ebp, eax
-%endif
-.next:
- ; Write the result pixel.
- mov [rdi + rcx*4], ebp
-
- inc rcx
- jnz .loop
-
-	pop rdx			; Restore rdx for the caller.
-.out:
- ret
-
-%macro unpremultiply_pixel_blocks 1
-;; Faster version that does it in blocks of four pixels at a time.
-;; The macro is parameterised on the instruction used to move
-;; an XMM register to memory.
-;;
-;; In:
-;; uint32_t *dst/rdi: Destination pixels.
-;; uint32_t *src/rsi: Source pixels.
-;; num_pixels/rdx: # pixels to unpremultiply. Only
-;; floor(num_pixels/4) of them are done here.
-;;
-;; %1: Instruction used to write an xmm reg to dst.
-;;
-;; Out:
-;; rcx: num_pixels mod 4 = # leftover pixels.
-;; rdi: rdi + 16*floor(num_pixels/4); advanced past dst.
-;; rsi: rsi + 16*floor(num_pixels/4); advanced past src.
-;;
-;; Scratched: xmm1-xmm4, rax-rdx, rbp
- ; Advance the src and dst pointers to the end. The bias
- ; of +-15 is used to have the loop condition trigger an exit
- ; just before we access the last incomplete block.
- shl rdx, 2 ; Size in bytes.
- lea rsi, [rsi + rdx - 15]
- lea rdi, [rdi + rdx - 15]
- neg rdx
- add rdx, 15 ; Offset to the last byte of the
- ; first block from the end.
- jmp %%test_cc
- align 16
-%%loop:
- ; Load four pixels into xmm1. The prefetchnta here
- ; hides the difference between movdqa vs. movdqu for
- ; aligned input.
- prefetchnta [rsi + rdx + 64*8] ; TODO: check the prefetch dist?
- movdqu xmm1, [rsi + rdx]
-
- ; Expand the 8 bit components into 16 bit ones in
- ; two registers.
- movdqa xmm2, xmm1
- punpckhbw xmm2, xmm2
- punpcklbw xmm1, xmm1
-
- ; Load alphas into registers.
- movzx eax, byte [rsi + rdx + ASHIFT/8 + 0]
- movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4]
- movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8]
- movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12]
-
-	; Fetch multiplier vectors for each pixel based on the alphas
- ; into the xmm3/xmm4 registers.
- movq xmm3, [reciprocal_table_Q + 8*eax]
- movq xmm4, [reciprocal_table_Q + 8*ecx]
- movhpd xmm3, [reciprocal_table_Q + 8*ebx]
- movhpd xmm4, [reciprocal_table_Q + 8*ebp]
-
- ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
- ; Treating the components as 0.16 bit fixed point, the pmulhuw
- ; leaves the integer part of x*255/a in the result for the colour
- ; components x in (r,g,b) but leaves the alphas alone.
- pmulhuw xmm1, xmm3
- pmulhuw xmm2, xmm4
-
- ; Pack the four resulting pixels from 16 to 8 bit components.
-	; Here we saturate the result in case the input was supersaturated.
- packuswb xmm1, xmm2
-
- ; Write the result.
- %1 [rdi + rdx], xmm1
-
-	; Increment to the next block. When this add overflows to >= 0
- ; then the next read of a block would venture past the end of
- ; the buffer.
- add rdx, 16
-%%test_cc:
- jnc %%loop
-
- ; Offset the pointers back to the last incomplete block.
- lea rsi, [rsi + rdx]
- lea rdi, [rdi + rdx]
-
- ; Compute the # leftover pixels.
- lea rcx, [rdx - 15]
- neg rcx
- and rcx, 15 ; # bytes leftover.
- shr rcx, 2 ; # pixels leftover.
-%endmacro
-
-global unpremultiply_with_sse2_test
-
-unpremultiply_with_sse2_test:
-;;
-;; void unpremultiply_with_sse2_test(
-;; uint32_t *dst/rdi,
-;; uint32_t const *src/rsi,
-;; ulong n/rdx);
-;;
-;; This is the main entry point callable from the outside.
-;; The calling convention used here is the ELF64 one.
-;;
- ; Save callee-saved registers.
- push rbp
- push rbx
-
- ; Save start of dst for alignment tests later.
- mov rcx, rdi
-
- ; If we don't have enough pixels for at least a few iterations
- ; of blocked unpremultiplication then do the pixels one at a time.
- cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks.
- jae .do_blocked
- mov rcx, rdx ; Pixel count.
- call unpremultiply_single_pixels
- jmp .out
-
-.do_blocked:
- ; If the destination pointer isn't even aligned to uint32_t
- ; then we can't align it to 0 mod 16 using single pixels.
- test rcx, 3
- jz .can_align_dst
- unpremultiply_pixel_blocks movdqu
- jmp .do_leftovers
-
-.can_align_dst:
- ; Align the destination pointer to 0 mod 16 by
- ; doing 0..3 single pixels.
- neg rcx
- and rcx, 15 ; # bytes to align to 16.
- shr rcx, 2 ; # pixels to align to 16.
- sub rdx, rcx
- call unpremultiply_single_pixels
-
- ; If the source and dest are exactly aliased or
- ; the image is fairly small then use movdqa writes.
- cmp rdi, rsi ; Use movdqa for aliased src, dst.
- jz .1
- cmp rdx, 8192 ; ... or if the src and dest are small.
- jc .1
- unpremultiply_pixel_blocks movntdq
- jmp .do_leftovers
-.1:
- unpremultiply_pixel_blocks movdqa
-
-.do_leftovers:
- call unpremultiply_single_pixels
-.out:
- pop rbx
- pop rbp
- ret
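
The test variant deleted above is not lost: the next diff shows it
becoming the new body of unpremultiply-sse2.S nearly verbatim. Its two
lookup tables are what replace per-pixel division: reciprocal_table_Q
holds, for each alpha i, a 4 x 16 bit multiplier vector in 8.8 fixed
point (255/i in the colour slots, exactly 1.0 in the alpha slot), and
reciprocal_table_D holds ceil(255*2^16/i) in 16.16 fixed point for the
scalar tail loop. A C sketch of both tables and of the scalar
per-component step; recip_q, recip_d and the helper names are
illustrative:

    #include <stdint.h>

    static uint16_t recip_q[256][4];   /* ~ reciprocal_table_Q, ASHIFT=24 */
    static uint32_t recip_d[256];      /* ~ reciprocal_table_D */

    static void
    init_reciprocal_tables(void)
    {
        for (uint32_t i = 1; i < 256; i++) {      /* entry 0 stays zero */
            uint16_t r = (uint16_t)(255 * 256 / i);   /* 8.8, floor */
            recip_q[i][0] = r;                    /* b */
            recip_q[i][1] = r;                    /* g */
            recip_q[i][2] = r;                    /* r */
            recip_q[i][3] = 256;                  /* alpha: multiply by 1.0 */
            recip_d[i] = (255u * 65536 + i - 1) / i;  /* 16.16, ceil */
        }
    }

    /* Scalar step: one imul + shift instead of a div, as in
     * unpremultiply_single_pixels (which also masks each result
     * back to 8 bits before merging). */
    static uint32_t
    unpremultiply_component(uint32_t x, uint32_t a)
    {
        return (x * recip_d[a]) >> 16;
    }
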
diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S
index 1b699cc..e0650a3 100644
--- a/unpremultiply-sse2.S
+++ b/unpremultiply-sse2.S
@@ -1,17 +1,33 @@
+;;;
+;;; Unpremultiply routine for SSE2/AMD64.
+;;;
+;;; This file exports a function unpremultiply_with_sse2() that
+;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels.
+;;;
section .text
+; We use only rax-rbp in this file so that a port
+; to 32 bit x86 would need little more than
+; renaming the registers and adapting the
+; argument extraction to the calling convention.
+
+; Location of alpha in a 32 bit pixel. Alpha measures opaqueness.
%define ASHIFT 24
;%define ASHIFT 0
-%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
-
-; Reciprocal table with 64 bit entries of a 4x16 vector
-; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
+;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
+;; of the form
+;;
+;; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0
+;; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24
+;;
+;; in 8.8 bit fixed point format.
+ align 16
reciprocal_table_Q:
dq 0
%assign i 1
-%rep 255
-%assign recip ((255*256 + i-1) / i)
+%rep 255
+%assign recip 255*256 / i
%if ASHIFT == 0
dw 256, recip, recip, recip
%elif ASHIFT==24
@@ -20,91 +36,264 @@ reciprocal_table_Q:
%assign i i+1
%endrep
-; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
-; The pixels must not be supersaturated.
-; Output: xmm1: u8[], Four unpremultiplied pixels.
-; Invariant:
-; xmm0: 0
-; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx
-%macro unpremultiply_xmm1 0
+;; Reciprocal table with 32 bit entries of ceil(255/i) in
+;; 16.16 bit fixed point.
+reciprocal_table_D:
+ dd 0
+%assign i 1
+%rep 255
+%assign recip (255*65536 + i-1) / i
+ dd recip
+%assign i i+1
+%endrep
+
+unpremultiply_single_pixels:
+;; Slower version for the odd pixels at the ends.
+;;
+;; In:
+;; uint32_t *dst/rdi: Destination pixels.
+;; uint32_t *src/rsi: Source pixels.
+;; num_pixels/rcx: # pixels to unpremultiply.
+;;
+;; Out:
+;; rdi: dst + 4*num_pixels; advanced past dst.
+;; rsi: src + 4*num_pixels; advanced past src.
+;;
+;; Saved: rdx
+;; Scratched: rax-rcx, rbp
+	; Advance src/dst pointers to the end and set up iteration
+ ; from -num_pixels up to 0.
+ lea rsi, [rsi + rcx*4]
+ lea rdi, [rdi + rcx*4]
+ neg rcx
+ jz .out ; No pixels at all? -> .out
+
+	push rdx		; Preserve rdx for the caller.
+.loop:
+ ; Load the next source pixel.
+ mov eax, [rsi + rcx*4]
+
+%if ASHIFT == 24
+ ; Extract alpha and look up the reciprocal.
+ mov ebx, eax
+ mov ebp, eax ; Initialise result pixel register.
+ and ebp, 0xFF000000 ; Mask off non-alpha from result pix.
+ jz .next
+ shr ebx, 24 ; Load alpha.
+ mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+
+ ; Do the component from bits 0..7.
+ mov edx, eax
+ and edx, 255 ; Extract the next component.
+ shr eax, 8 ; Shift it out.
+ imul edx, ebx ; Divide for a result in 8.16 fixed pt.
+ shr edx, 16 ; Truncate and move to bits 0..7.
+ or ebp, edx ; Merge into result pixel.
+
+ ; Do the component from bits 8..15.
+ mov edx, eax
+ and edx, 255 ; Extract the next component.
+ shr eax, 8 ; Shift it out.
+ imul edx, ebx ; Divide for a result in 8.16 fixed pt.
+ and edx, 0x00FF0000 ; Truncate fraction.
+ shr edx, 8 ; Move to bits 8..15.
+ or ebp, edx ; Merge into result pixel.
+
+ ; Do the component from bits 16..23.
+ and eax, 255 ; Mask off alpha.
+ imul eax, ebx ; Divide for a result in 8.16 fixed pt.
+ and eax, 0x00FF0000 ; Truncate fraction.
+ or ebp, eax ; Merge into result pixel.
+
+%elif ASHIFT == 0
+	; Extract alpha and look up the reciprocal.
+ mov ebx, eax
+ shr eax, 8 ; Shift out alpha.
+	and ebx, 255		; Mask off non-alpha.
+	mov ebp, ebx		; Initialise result pixel.
+ jz .next
+ mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+
+ ; Do the component from bits 8..15.
+ mov edx, eax
+ shr eax, 8
+ and edx, 255
+ imul edx, ebx
+ and edx, 0x00FF0000
+ shr edx, 8
+ or ebp, edx
+
+ ; Do the component from bits 16..23
+ mov edx, eax
+ shr eax, 8
+ and edx, 255
+ imul edx, ebx
+ and edx, 0x00FF0000
+ or ebp, edx
+
+ ; Do the component from bits 24..31.
+ imul eax, ebx
+ and eax, 0x00FF0000
+ shl eax, 8
+ or ebp, eax
+%endif
+.next:
+ ; Write the result pixel.
+ mov [rdi + rcx*4], ebp
+
+ inc rcx
+ jnz .loop
+
+	pop rdx			; Restore rdx for the caller.
+.out:
+ ret
+
+%macro unpremultiply_pixel_blocks 1
+;; Faster version that does it in blocks of four pixels at a time.
+;; The macro is parameterised on the instruction used to move
+;; an XMM register to memory.
+;;
+;; In:
+;; uint32_t *dst/rdi: Destination pixels.
+;; uint32_t *src/rsi: Source pixels.
+;; num_pixels/rdx: # pixels to unpremultiply. Only
+;; floor(num_pixels/4) of them are done here.
+;;
+;; %1: Instruction used to write an xmm reg to dst.
+;;
+;; Out:
+;; rcx: num_pixels mod 4 = # leftover pixels.
+;; rdi: rdi + 16*floor(num_pixels/4); advanced past dst.
+;; rsi: rsi + 16*floor(num_pixels/4); advanced past src.
+;;
+;; Scratched: xmm1-xmm4, rax-rdx, rbp
+ ; Advance the src and dst pointers to the end. The bias
+ ; of +-15 is used to have the loop condition trigger an exit
+ ; just before we access the last incomplete block.
+ shl rdx, 2 ; Size in bytes.
+ lea rsi, [rsi + rdx - 15]
+ lea rdi, [rdi + rdx - 15]
+ neg rdx
+ add rdx, 15 ; Offset to the last byte of the
+ ; first block from the end.
+ jmp %%test_cc
+ align 16
+%%loop:
+ ; Load four pixels into xmm1. The prefetchnta here
+ ; hides the difference between movdqa vs. movdqu for
+ ; aligned input.
+ prefetchnta [rsi + rdx + 64*8] ; TODO: check the prefetch dist?
+ movdqu xmm1, [rsi + rdx]
+
; Expand the 8 bit components into 16 bit ones in
; two registers.
movdqa xmm2, xmm1
- punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
- punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
+ punpckhbw xmm2, xmm2
+ punpcklbw xmm1, xmm1
- ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
- pextrw ebx, xmm1, 4+ASHIFT/8 ; Extract pixel alphas into registers.
- pextrw edx, xmm2, 4+ASHIFT/8
- pextrw eax, xmm1, 0+ASHIFT/8
- pextrw ecx, xmm2, 0+ASHIFT/8
- movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
- movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
+ ; Load alphas into registers.
+ movzx eax, byte [rsi + rdx + ASHIFT/8 + 0]
+ movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4]
+ movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8]
+ movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12]
+
+	; Fetch multiplier vectors for each pixel based on the alphas
+ ; into the xmm3/xmm4 registers.
movq xmm3, [reciprocal_table_Q + 8*eax]
movq xmm4, [reciprocal_table_Q + 8*ecx]
- pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper.
- pshufd xmm6, xmm6, SELECT(1,0,3,2)
- por xmm3, xmm5 ; Combine 64 bit upper and lower
- por xmm4, xmm6 ; into 128 bit multipliers.
- pmullw xmm1, xmm3 ; Multiply components by coefs.
- pmullw xmm2, xmm4 ; to produce 8.8 fp components.
- psrlw xmm1, 8 ; Take floor of components.
- psrlw xmm2, 8
+ movhpd xmm3, [reciprocal_table_Q + 8*ebx]
+ movhpd xmm4, [reciprocal_table_Q + 8*ebp]
+
+ ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
+ ; Treating the components as 0.16 bit fixed point, the pmulhuw
+ ; leaves the integer part of x*255/a in the result for the colour
+ ; components x in (r,g,b) but leaves the alphas alone.
+ pmulhuw xmm1, xmm3
+ pmulhuw xmm2, xmm4
; Pack the four resulting pixels from 16 to 8 bit components.
+	; Here we saturate the result in case the input was supersaturated.
packuswb xmm1, xmm2
-%endmacro
-; Input:
-; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
-; r8: number of times N > 0 to loop.
-; rsi: uint32_t[4*N]: source pixels.
-; rdi: uint32_t[4*N]: destination pixels.
-; Invariant:
-; xmm0: 0
-; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
-%macro unpremultiply_loop_with 1
- xor r9,r9 ; index register.
- align 16
-%%loop:
- prefetchnta [rsi + r9*8 + 16*64]
- %1 xmm1, [rsi+r9*8]
- unpremultiply_xmm1
- movntdq [rdi+r9*8], xmm1
-
- add r9, 2
- dec r8
- jnz %%loop
+ ; Write the result.
+ %1 [rdi + rdx], xmm1
+
+	; Increment to the next block. When this add overflows to >= 0
+ ; then the next read of a block would venture past the end of
+ ; the buffer.
+ add rdx, 16
+%%test_cc:
+ jnc %%loop
+
+ ; Offset the pointers back to the last incomplete block.
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rdx]
+
+ ; Compute the # leftover pixels.
+ lea rcx, [rdx - 15]
+ neg rcx
+ and rcx, 15 ; # bytes leftover.
+ shr rcx, 2 ; # pixels leftover.
%endmacro
-;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
-;;
global unpremultiply_with_sse2
-unpremultiply_with_sse2:
- mov r8, rdx
- shr r8, 2 ; TODO: left over pixels.
- test r8, r8
- jnz .setup_invariants
- ret
-.setup_invariants:
+unpremultiply_with_sse2:
+;;
+;; void unpremultiply_with_sse2(
+;; uint32_t *dst/rdi,
+;; uint32_t const *src/rsi,
+;; ulong n/rdx);
+;;
+;; This is the main entry point callable from the outside.
+;; The calling convention used here is the ELF64 one.
+;;
+ ; Save callee-saved registers.
+ push rbp
push rbx
- pxor xmm0, xmm0 ; constant zero for unpacking.
- ; Decide on whether to use movdqu or movdqa based on source
-	; alignment. We always use movntdq to write the dest.
- test rsi, 15
- jz .aligned_case
- jmp .unaligned_case
+ ; Save start of dst for alignment tests later.
+ mov rcx, rdi
-.aligned_case:
- unpremultiply_loop_with movdqa
- pop rbx
- ret
+ ; If we don't have enough pixels for at least a few iterations
+ ; of blocked unpremultiplication then do the pixels one at a time.
+ cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks.
+ jae .do_blocked
+ mov rcx, rdx ; Pixel count.
+ call unpremultiply_single_pixels
+ jmp .out
+
+.do_blocked:
+ ; If the destination pointer isn't even aligned to uint32_t
+ ; then we can't align it to 0 mod 16 using single pixels.
+ test rcx, 3
+ jz .can_align_dst
+ unpremultiply_pixel_blocks movdqu
+ jmp .do_leftovers
+
+.can_align_dst:
+ ; Align the destination pointer to 0 mod 16 by
+ ; doing 0..3 single pixels.
+ neg rcx
+ and rcx, 15 ; # bytes to align to 16.
+ shr rcx, 2 ; # pixels to align to 16.
+ sub rdx, rcx
+ call unpremultiply_single_pixels
+
+ ; If the source and dest are exactly aliased or
+ ; the image is fairly small then use movdqa writes.
+ cmp rdi, rsi ; Use movdqa for aliased src, dst.
+ jz .1
+ cmp rdx, 8192 ; ... or if the src and dest are small.
+ jc .1
+ unpremultiply_pixel_blocks movntdq
+ jmp .do_leftovers
+.1:
+ unpremultiply_pixel_blocks movdqa
-.unaligned_case:
- unpremultiply_loop_with movdqu
+.do_leftovers:
+ call unpremultiply_single_pixels
+.out:
pop rbx
+ pop rbp
ret
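
Two notes on the four-pixel kernel this file now uses. First,
punpcklbw/punpckhbw of a register with itself doubles every byte, so a
component x becomes the 16 bit value x*257, which is x/255 in 0.16
fixed point (255 maps to 0xFFFF); pmulhuw by the 8.8 table entry then
leaves roughly floor(x*255/a) in each colour lane, and the alpha
lane's multiplier of 256 returns alpha unchanged. A hedged
SSE2-intrinsics rendering of one block, reusing the illustrative
recip_q table from the earlier sketch (the real routine also
prefetches and picks its store instruction at runtime):

    #include <emmintrin.h>
    #include <stdint.h>

    /* One block of four ASHIFT=24 pixels, mirroring the %%loop body. */
    static void
    unpremultiply_block4(uint32_t *dst, const uint32_t *src,
                         uint16_t recip_q[256][4])
    {
        __m128i px = _mm_loadu_si128((const __m128i *)src);
        __m128i lo = _mm_unpacklo_epi8(px, px);  /* pixels 0-1 as 16 bit */
        __m128i hi = _mm_unpackhi_epi8(px, px);  /* pixels 2-3 as 16 bit */

        const uint8_t *b = (const uint8_t *)src; /* alphas at 3,7,11,15 */
        __m128i mlo = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const __m128i *)recip_q[b[3]]),
            _mm_loadl_epi64((const __m128i *)recip_q[b[7]]));
        __m128i mhi = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const __m128i *)recip_q[b[11]]),
            _mm_loadl_epi64((const __m128i *)recip_q[b[15]]));

        lo = _mm_mulhi_epu16(lo, mlo);           /* pmulhuw */
        hi = _mm_mulhi_epu16(hi, mhi);
        _mm_storeu_si128((__m128i *)dst,
                         _mm_packus_epi16(lo, hi)); /* packuswb */
    }

Second, the hunk above also changes the 8.8 table entries from ceil to
floor (255*256/i instead of (255*256 + i-1)/i), so the fast path may
now undershoot exact division in the last bit for some inputs. A
throwaway check like the following (hypothetical, not part of the
harness) counts how often the recipe disagrees with exact
floor(x*255/a) over all non-supersaturated inputs:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int bad = 0;
        for (uint32_t a = 1; a < 256; a++) {
            uint32_t m = 255 * 256 / a;              /* 8.8 multiplier */
            for (uint32_t x = 0; x <= a; x++) {      /* premultiplied x */
                uint32_t got  = (x * 257 * m) >> 16; /* unpack + pmulhuw */
                uint32_t want = x * 255 / a;         /* exact */
                if (got != want)
                    bad++;
            }
        }
        printf("%d lanes differ from exact division\n", bad);
        return 0;
    }
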
diff --git a/unpremultiply.c b/unpremultiply.c
index 2f2b422..6d904d7 100644
--- a/unpremultiply.c
+++ b/unpremultiply.c
@@ -1,8 +1,6 @@
#define RUN_ME /*
nasm -g -f elf64 unpremultiply-sse2.S
-nasm -g -f elf64 unpremultiply-sse2-test.S
-nasm -g -f elf64 unpremultiply-sse2-float.S
-gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2*.o $0
+gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o $0
exit $?
*/
#include <assert.h>
@@ -32,8 +30,6 @@ exit $?
#define BMASK (255 << BSHIFT)
void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
-void unpremultiply_with_sse2_test(uint32_t *dst, uint32_t const *src, size_t n);
-void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n);
static void __attribute__((noinline))
unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
@@ -500,8 +496,6 @@ main(int argc, char **argv)
0 == strcmp(argv[i], "inv32-nocache") ||
0 == strcmp(argv[i], "inv64-nocache") ||
0 == strcmp(argv[i], "sse2") ||
- 0 == strcmp(argv[i], "sse2-test") ||
- 0 == strcmp(argv[i], "sse2-float") ||
0 == strcmp(argv[i], "copy") ||
0 == strcmp(argv[i], "read") ||
0 == strcmp(argv[i], "write") ||
@@ -577,16 +571,6 @@ main(int argc, char **argv)
unpremultiply_with_sse2(dst, src, n);
}
}
- else if (0 == strcmp(method, "sse2-test")) {
- while (nloops-- > 0) {
- unpremultiply_with_sse2_test(dst, src, n);
- }
- }
- else if (0 == strcmp(method, "sse2-float")) {
- while (nloops-- > 0) {
- unpremultiply_with_sse2_float(dst, src, n);
- }
- }
else if (0 == strcmp(method, "noop")) {
/* do nothing. */
} else {