author    Joonas Pihlaja <jpihlaja@cc.helsinki.fi>    2009-01-13 11:10:44 +0200
committer Joonas Pihlaja <jpihlaja@cc.helsinki.fi>    2009-01-13 11:10:44 +0200
commit    bca43fc5c350dece7cac8a12a237e5bf37a666ed (patch)
tree      6351a7505e97ccad08fabec5f0b741b285505796
parent    724e9b675b314602a3072ce708b2d1679ce05a84 (diff)
[sse2-test] sync it.
-rw-r--r--   unpremultiply-sse2-test.S   292
1 file changed, 247 insertions, 45 deletions
diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S
index a0dccf4..4d45f7a 100644
--- a/unpremultiply-sse2-test.S
+++ b/unpremultiply-sse2-test.S
@@ -1,21 +1,30 @@
section .text
+;;;
+;;; Unpremultiply routine for SSE2/AMD64.
+;;;
+; We only use rax-rbp in this file so that a
+; conversion to 32 bit SSE2 would be easier:
+; just update the register names and the
+; argument extraction to match the calling convention.
+
+; Location of alpha in a 32 bit pixel.
%define ASHIFT 24
;%define ASHIFT 0
+;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
+;; of the form
+;;
+;; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0
+;; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24
+;;
+;; in 8.8 bit fixed point format.
align 16
-; Reciprocal table with 64 bit entries of a 4x16 vectors
-; of the form
-;
-; (255/i, 255/i, 255/i, 1.0) for ASHIFT=0
-; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24
-;
-; in 8.8 fixed point format.
reciprocal_table_Q:
dq 0
%assign i 1
-%rep 255
-%assign recip (255*256 / i)
+%rep 255
+%assign recip 255*256 / i
%if ASHIFT == 0
dw 256, recip, recip, recip
%elif ASHIFT==24
@@ -24,25 +33,153 @@ reciprocal_table_Q:
%assign i i+1
%endrep
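
[Editorial note: as an aside on how these 8.8 entries are consumed, the block loop further down widens each 8 bit component x to the 16 bit value x*257 (by unpacking the pixel bytes with themselves), then keeps the high 16 bits of a multiply against this table. Each colour lane thus computes roughly (x*257 * 255*256/a) / 65536 ~= x*255/a, while the 1.0 (= 256) entry in the alpha lane leaves alpha unchanged. A minimal C sketch of the same arithmetic, purely illustrative and not part of the commit:]

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned a = 7, x = 5;                  /* premultiplied input: x <= a */
        uint32_t recip = 255u*256u / a;         /* 8.8 table entry, = 9325 */
        uint32_t wide  = x * 257u;              /* punpcklbw xmm,xmm widening */
        uint32_t res   = (wide * recip) >> 16;  /* pmulhuw keeps the high 16 bits */
        printf("%u\n", res);                    /* 182, ~ x*255/a */
        return 0;
    }
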
-;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;; Reciprocal table with 32 bit entries of ceil(255/i) in
+;; 16.16 bit fixed point.
+reciprocal_table_D:
+ dd 0
+%assign i 1
+%rep 255
+%assign recip (255*65536 + i-1) / i
+ dd recip
+%assign i i+1
+%endrep
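
[Editorial note: the ceiling in these 16.16 entries keeps the truncating multiply in the single-pixel path below from undershooting the true quotient. A tiny self-contained check, assuming alpha = 7 and a premultiplied component of 5 (illustrative only):]

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
        uint32_t a = 7, x = 5;                       /* premultiplied: x <= a */
        uint32_t recip = (255u*65536u + a - 1) / a;  /* = 2387383, ceil in 16.16 */
        assert(((x * recip) >> 16) == x * 255 / a);  /* both sides are 182 */
        return 0;
    }
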
+
+unpremultiply_single_pixels:
+;; Slower version for the odd pixels at the beginning and
+;; end.
;;
-global unpremultiply_with_sse2_test
-unpremultiply_with_sse2_test:
- mov r8, rdx
- shr r8, 2 ; TODO: left over pixels.
- test r8, r8
- jnz .else
- ret
-.else:
- push rbx
- xor r9,r9
- align 16
+;; In:
+;; uint32_t *dst/rdi: Destination pixels.
+;; uint32_t *src/rsi: Source pixels.
+;; num_pixels/rcx: # pixels to unpremultiply.
+;;
+;; Out:
+;; rdi: dst + 4*num_pixels; advanced past dst.
+;; rsi: src + 4*num_pixels; advanced past src.
+;;
+;; Saved: rdx
+;; Scratched: rax-rcx, rbp
+ ; Advance src/dst pointers to the end and set up iteration
+ ; from -num_pixels up to 0.
+ lea rsi, [rsi + rcx*4]
+ lea rdi, [rdi + rcx*4]
+ neg rcx
+ jz .out ; No pixels at all? -> .out
+
+ push rdx ; Preserve rdx for our caller.
.loop:
+ ; Load the next source pixel.
+ mov eax, [rsi + rcx*4]
+
+%if ASHIFT == 24
+ ; Extract alpha and look up the reciprocal.
+ mov ebx, eax
+ mov ebp, eax ; Initialise result pixel register.
+ shr ebx, 24 ; Load alpha.
+ mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+ and ebp, 0xFF000000 ; Mask off non-alpha from result pix.
+
+ ; Do the component from bits 0..7.
+ mov edx, eax
+ and edx, 255 ; Extract the next component.
+ shr eax, 8 ; Shift it out.
+ imul edx, ebx ; Divide for a result in 8.16 fixed pt.
+ shr edx, 16 ; Truncate and move to bits 0..7.
+ or ebp, edx ; Merge into result pixel.
+
+ ; Do the component from bits 8..15.
+ mov edx, eax
+ and edx, 255 ; Extract the next component.
+ shr eax, 8 ; Shift it out.
+ imul edx, ebx ; Divide for a result in 8.16 fixed pt.
+ and edx, 0x00FF0000 ; Truncate fraction.
+ shr edx, 8 ; Move to bits 8..15.
+ or ebp, edx ; Merge into result pixel.
+
+ ; Do the component from bits 16..23.
+ and eax, 255 ; Mask off alpha.
+ imul eax, ebx ; Divide for a result in 8.16 fixed pt.
+ and eax, 0x00FF0000 ; Truncate fraction.
+ or ebp, eax ; Merge into result pixel.
+
+%elif ASHIFT == 0
+ ; Extract alpha and look up the reciprocal.
+ mov ebx, eax
+ shr eax, 8 ; Shift out alpha.
+ and ebx, 255 ; Mask off non-alpha.
+ mov ebp, ebx ; Initialise result pixel.
+ mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+
+ ; Do the component from bits 8..15.
+ mov edx, eax
+ shr eax, 8
+ and edx, 255
+ imul edx, ebx
+ and edx, 0x00FF0000
+ shr edx, 8
+ or ebp, edx
+
+ ; Do the component from bits 16..23.
+ mov edx, eax
+ shr eax, 8
+ and edx, 255
+ imul edx, ebx
+ and edx, 0x00FF0000
+ or ebp, edx
+
+ ; Do the component from bits 24..31.
+ imul eax, ebx
+ and eax, 0x00FF0000
+ shl eax, 8
+ or ebp, eax
+%endif
+ ; Write the result pixel.
+ mov [rdi + rcx*4], ebp
+
+ inc rcx
+ jnz .loop
+
+ pop rdx ; Restore rdx for our caller.
+.out:
+ ret
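
[Editorial note: for readers cross-checking the register juggling above, here is roughly the C equivalent of the ASHIFT=24 single-pixel path. A sketch only, assuming valid premultiplied input; reciprocal_D stands for reciprocal_table_D:]

    #include <stdint.h>

    extern const uint32_t reciprocal_D[256];   /* ceil(255*65536/i), entry 0 = 0 */

    static void unpremultiply_single_pixels_c(uint32_t *dst, const uint32_t *src,
                                              unsigned long n)
    {
        for (unsigned long i = 0; i < n; i++) {
            uint32_t p = src[i];
            uint32_t r = reciprocal_D[p >> 24];           /* 16.16 fixed pt 255/a */
            uint32_t out = p & 0xFF000000u;               /* alpha passes through */
            out |= ((p         & 255u) * r) >> 16;                  /* bits 0..7   */
            out |= ((((p >> 8) & 255u) * r) & 0x00FF0000u) >> 8;    /* bits 8..15  */
            out |=  (((p >> 16) & 255u) * r) & 0x00FF0000u;         /* bits 16..23 */
            dst[i] = out;
        }
    }
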
+
+%macro unpremultiply_pixel_blocks 1
+;; Faster version that processes blocks of four pixels at a time.
+;; The macro is parameterised on the instruction used to move
+;; an XMM register to memory.
+;;
+;; In:
+;; uint32_t *dst/rdi: Destination pixels.
+;; uint32_t *src/rsi: Source pixels.
+;; num_pixels/rdx: # pixels to unpremultiply. Only the
+;; first 4*floor(num_pixels/4) of them will be processed.
+;;
+;; %1: Instruction used to write an xmm reg to dst.
+;;
+;; Out:
+;; rcx: num_pixels mod 4 = # leftover pixels.
+;; rdi: rdi + 16*floor(num_pixels/4); advanced past dst.
+;; rsi: rsi + 16*floor(num_pixels/4); advanced past src.
+;;
+;; Scratched: xmm1-xmm4, rax-rdx, rbp
+
+ ; Advance the src and dst pointers to the end. The bias
+ ; of +-15 is used to have the loop condition trigger an exit
+ ; just before we access the last incomplete block.
+ shl rdx, 2 ; Size in bytes.
+ lea rsi, [rsi + rdx - 15]
+ lea rdi, [rdi + rdx - 15]
+ neg rdx
+ add rdx, 15 ; Offset to the last byte of the
+ ; first block from the end.
+ jmp %%test_cc
+ align 16
+%%loop:
; Load four pixels into xmm1. The prefetchnta here
- ; hides the difference between movdqa vs. movdqu on
+ ; hides the difference between movdqa vs. movdqu for
; aligned input.
- prefetchnta [rsi + r9*8 + 16*64]
- movdqu xmm1, [rsi+r9*8]
+ prefetchnta [rsi + rdx + 64*8] ; TODO: check the prefetch dist?
+ movdqu xmm1, [rsi + rdx]
; Expand the 8 bit components into 16 bit ones in
; two registers.
@@ -50,44 +187,109 @@ unpremultiply_with_sse2_test:
punpckhbw xmm2, xmm2
punpcklbw xmm1, xmm1
- ; Load alphas into GPRs.
- movzx eax, byte [rsi + r9*8 + ASHIFT/8 + 0]
- movzx ebx, byte [rsi + r9*8 + ASHIFT/8 + 4]
- movzx ecx, byte [rsi + r9*8 + ASHIFT/8 + 8]
- movzx edx, byte [rsi + r9*8 + ASHIFT/8 + 12]
+ ; Load alphas into registers.
+ movzx eax, byte [rsi + rdx + ASHIFT/8 + 0]
+ movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4]
+ movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8]
+ movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12]
- ; Fetch component multipliers for each pixel based on the alphas
+ ; Fetch multiplier vectors for each pixel based on the alphas
; into the xmm3/xmm4 registers.
movq xmm3, [reciprocal_table_Q + 8*eax]
movq xmm4, [reciprocal_table_Q + 8*ecx]
movhpd xmm3, [reciprocal_table_Q + 8*ebx]
- movhpd xmm4, [reciprocal_table_Q + 8*edx]
+ movhpd xmm4, [reciprocal_table_Q + 8*ebp]
; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
; Treating the components as 0.16 bit fixed point, the pmulhuw
; leaves the integer part of x*255/a in the result for the colour
- ; components and alphas themselves for the alpha components.
+ ; components x in (r,g,b) but leaves the alphas alone.
pmulhuw xmm1, xmm3
pmulhuw xmm2, xmm4
; Pack the four resulting pixels from 16 to 8 bit components.
+ ; Here we saturate the result in case the input was superluminant.
packuswb xmm1, xmm2
; Write the result.
- ; - When the destination is expected (say due to aliasing)
- ; to be in at least the L2 cache then the write should
- ; be done using movdqa or movdqu.
- ;
- ; - Otherwise the destination won't be or fit in any cache level
- ; in which case we should use movntdq.
-
-; movdqa [rdi+r9*8], xmm1
- movntdq [rdi+r9*8], xmm1
-
- add r9, 2
- dec r8
- jnz .loop
+ %1 [rdi + rdx], xmm1
+
+ ; Advance to the next block of four pixels. When this add overflows to >= 0
+ ; then the next read of a block would venture past the end of
+ ; the buffer.
+ add rdx, 16
+%%test_cc:
+ jnc %%loop
+
+ ; Offset the pointers back to the last incomplete block.
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rdx]
+
+ ; Compute the # leftover pixels.
+ lea rcx, [rdx - 15]
+ neg rcx
+ and rcx, 15 ; # bytes leftover.
+ shr rcx, 2 ; # pixels leftover.
+%endmacro
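
[Editorial note: the block step is perhaps easier to read as SSE2 intrinsics. A hedged sketch of one four pixel block, assuming ASHIFT == 24 and a reciprocal_Q array laid out like reciprocal_table_Q above; illustrative only, and it omits the prefetch and the non-temporal store choice:]

    #include <emmintrin.h>
    #include <stdint.h>

    extern const uint64_t reciprocal_Q[256];  /* (255/i,255/i,255/i,1.0) in 8.8 */

    static void unpremultiply_block(uint32_t *dst, const uint32_t *src)
    {
        __m128i px = _mm_loadu_si128((const __m128i *)src);
        /* Widen 8 bit components to 16 bits; each byte x becomes x*257. */
        __m128i lo = _mm_unpacklo_epi8(px, px);          /* pixels 0 and 1 */
        __m128i hi = _mm_unpackhi_epi8(px, px);          /* pixels 2 and 3 */
        /* Look up one multiplier vector per pixel by its alpha. */
        __m128i mlo = _mm_set_epi64x(reciprocal_Q[src[1] >> 24],
                                     reciprocal_Q[src[0] >> 24]);
        __m128i mhi = _mm_set_epi64x(reciprocal_Q[src[3] >> 24],
                                     reciprocal_Q[src[2] >> 24]);
        /* High 16 bits of the unsigned multiply: ~ x*255/a per colour lane. */
        lo = _mm_mulhi_epu16(lo, mlo);
        hi = _mm_mulhi_epu16(hi, mhi);
        /* Pack back to 8 bit components with unsigned saturation. */
        _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
    }
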
+
+global unpremultiply_with_sse2_test
+
+unpremultiply_with_sse2_test:
+;;
+;; void unpremultiply_with_sse2(
+;; uint32_t *dst/rdi,
+;; uint32_t const *src/rsi,
+;; ulong n/rdx);
+;;
+;; This is the main entry point callable from the outside.
+;; The calling convention used here is the ELF64 one.
+;;
+ ; Save callee-saved registers.
+ push rbp
+ push rbx
+
+ ; Save start dst for alignment tests later.
+ mov rcx, rdi
+
+ ; If we don't have enough pixels for at least a few iterations
+ ; of blocked unpremultiplication then do the pixels one at a time.
+ cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks.
+ jae .do_blocked
+ mov rcx, rdx ; Pixel count.
+ call unpremultiply_single_pixels
+ jmp .out
+
+.do_blocked:
+ ; If the destination pointer isn't even aligned to uint32_t
+ ; then we can't align it to 0 mod 16 using single pixels.
+ test rcx, 3
+ jz .can_align_dst
+ unpremultiply_pixel_blocks movdqu
+ jmp .do_leftovers
+
+.can_align_dst:
+ ; Align the destination pointer to 0 mod 16 by
+ ; doing 0..3 single pixels.
+ neg rcx
+ and rcx, 15 ; # bytes to align to 16.
+ shr rcx, 2 ; # pixels to align to 16.
+ sub rdx, rcx
+ call unpremultiply_single_pixels
+
+ ; If the source and dest are exactly aliased or
+ ; the image is fairly small then use movdqa writes.
+ cmp rdi, rsi ; Use movdqa for aliased src, dst.
+ jz .1
+ cmp rdx, 8192 ; ... or if the src and dest are small.
+ jc .1
+ unpremultiply_pixel_blocks movntdq
+ jmp .do_leftovers
+.1:
+ unpremultiply_pixel_blocks movdqa
+.do_leftovers:
+ call unpremultiply_single_pixels
.out:
pop rbx
+ pop rbp
ret
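
[Editorial note: from C, the entry point above would be declared and used along these lines; a sketch, with the _test suffix matching the symbol defined in this file. Internally the routine uses the single-pixel loop outright for small inputs, movdqa stores for aliased or small buffers, and streaming movntdq stores otherwise.]

    #include <stdint.h>

    /* ELF64 calling convention: dst arrives in rdi, src in rsi, n in rdx. */
    extern void unpremultiply_with_sse2_test(uint32_t *dst,
                                             const uint32_t *src,
                                             unsigned long n);

    /* e.g. in-place unpremultiply of a width x height ARGB buffer:      */
    /* unpremultiply_with_sse2_test(pixels, pixels,                      */
    /*                              (unsigned long)width * height);      */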