diff options
author | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-13 12:34:41 +0200 |
---|---|---|
committer | M Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-13 12:34:41 +0200 |
commit | fa5f2c66156c0ba1b6cdc2df768226f149eb6d3f (patch) | |
tree | 563489b1480e14b8aa33f35b2d508df4bb5d6b35 | |
parent | 321f658793fc427d66b877f81036c40518419179 (diff) |
Removed crufty SSE2 versions.
-rw-r--r-- | unpremultiply-sse2-float.S | 105 | ||||
-rw-r--r-- | unpremultiply-sse2-test.S | 299 | ||||
-rw-r--r-- | unpremultiply-sse2.S | 337 | ||||
-rw-r--r-- | unpremultiply.c | 18 |
4 files changed, 264 insertions, 495 deletions
diff --git a/unpremultiply-sse2-float.S b/unpremultiply-sse2-float.S deleted file mode 100644 index b8c9182..0000000 --- a/unpremultiply-sse2-float.S +++ /dev/null @@ -1,105 +0,0 @@ - section .text - -%macro function 1 - global %1 -%1: -%endmacro - -%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d)) - -; Unpremultiply a pixel in-place with uint32 components in xmm register %1. -; Invariant: -; xmm0: 0 -; xmm6: 255.0f -; xmm7: (?,?,1.0f,?) -; Scratch: xmm5 -%macro unpremultiply1 1 - cvtdq2ps %1, %1 ; uint32 components -> float - rcpss xmm7, %1 ; xmm7: (?,?,1.0,1/a) - mulss xmm7, xmm6 ; xmm7: (?,?,1.0,255/a), xmm6: 255.0 - shufps xmm5, xmm7, SELECT(0,1,0,0) ; xmm5: (255/a,1.0,?,?) - shufps xmm5, xmm7, SELECT(0,0,3,2); xmm5: (255/a,255/a,255/a,1.0) - mulps %1, xmm5 ; %1: (255*r/a,.., 255*b/a, a) - cvtps2dq %1, %1 ; float components -> uint32 -%endmacro - -; Unpremultiply two pixels in-place with uint16 components in xmm register %1. -; Invariant: as above. -; Scratch: xmm4-5 -%macro unpremultiply2 1 - movdqa xmm4, %1 - punpckhwd xmm4, xmm0 - punpcklwd %1, xmm0 - unpremultiply1 xmm4 - unpremultiply1 %1 - packssdw %1, xmm4 -%endmacro - -; Unpremultiply four pixels in-place with uint8 components in xmm register %1. -; Invariant: as above. -; Scratch: xmm3-5 -%macro unpremultiply4 1 - movdqa xmm3, %1 - punpckhbw xmm3, xmm0 - punpcklbw %1, xmm0 - unpremultiply2 xmm3 - unpremultiply2 %1 - packuswb %1, xmm3 -%endmacro - -; Input: -; %1: movdqa or movdqu, depending on the alignment of rsi and rdi. -; r8: number of times N > 0 to loop. -; rsi: uint32_t[4*N]: source pixels. -; rdi: uint32_t[4*N]: destination pixels. -; Invariant: as above. -; Scratch: rsi,rdi,r8-9,xmm2-5 -%macro unpremultiply_loop_with 1 - xor r9,r9 ; index register. 
- align 16 -%%loop: - prefetchnta [rsi + r9*8 + 16*64] - %1 xmm2, [rsi+r9*8] - unpremultiply4 xmm2 - movntdq [rdi+r9*8], xmm2 - - add r9, 2 - dec r8 - jnz %%loop -%endmacro - -;; void unpremultiply_with_sse2_float(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) -;; -function unpremultiply_with_sse2_float - mov r8, rdx - shr r8, 2 ; TODO: left over pixels. - test r8, r8 - jnz .setup_invariants - ret - -.setup_invariants: - pxor xmm0, xmm0 ; constant zero for unpacking. - - mov rax, 1 - movd xmm7, eax - cvtdq2ps xmm7, xmm7 - shufps xmm7, xmm7, SELECT(1,1,0,1) ; xmm7: (0,0,1.0f,0) - - mov rax, 255 - movd xmm6, eax - cvtdq2ps xmm6, xmm6 - shufps xmm6, xmm6, SELECT(1,1,1,0) ; xmm6: 255f - - ; Decide on whether to use movdqu or movdqa based on source - ; alignment. We always use movntdq to write the dest. - test rsi, 15 - jz .aligned_case - jmp .unaligned_case - -.aligned_case: - unpremultiply_loop_with movdqa - ret - -.unaligned_case: - unpremultiply_loop_with movdqu - ret diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S deleted file mode 100644 index e8bef21..0000000 --- a/unpremultiply-sse2-test.S +++ /dev/null @@ -1,299 +0,0 @@ -;;; -;;; Unpremultiply routine for SSE2/AMD64. -;;; -;;; This file exports a function unpremultiply_with_sse2_test() that -;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels. -;;; - section .text - -; We're only using rax-rbp in this file so that -; conversion to 32 bit SSE2 would be easier by -; updating the register names and the -; argument extraction to the calling convention. - -; Location of alpha in a 32 bit pixel. Alpha measures opaqueness. -%define ASHIFT 24 -;%define ASHIFT 0 - -;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors -;; of the form -;; -;; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0 -;; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24 -;; -;; in 8.8 bit fixed point format. 
- align 16 -reciprocal_table_Q: - dq 0 -%assign i 1 -%rep 255 -%assign recip 255*256 / i -%if ASHIFT == 0 - dw 256, recip, recip, recip -%elif ASHIFT==24 - dw recip, recip, recip, 256 -%endif -%assign i i+1 -%endrep - -;; Reciprocal table with 32 bit entries of ceil(255/i) in -;; 16.16 bit fixed point. -reciprocal_table_D: - dd 0 -%assign i 1 -%rep 255 -%assign recip (255*65536 + i-1) / i - dd recip -%assign i i+1 -%endrep - -unpremultiply_single_pixels: -;; Slower version for the odd pixels at the ends. -;; -;; In: -;; uint32_t *dst/rdi: Destination pixels. -;; uint32_t *src/rsi: Source pixels. -;; num_pixels/rcx: # pixels to unpremultiply. -;; -;; Out: -;; rdi: dst + 4*num_pixels; advanced past dst. -;; rsi: src + 4*num_pixels; advanced past src. -;; -;; Saved: rdx -;; Scratched: rax-rcx, rbp - ; Advance src/dst pointers to the end and setup iteration - ; from -num_pixels up to 0. - lea rsi, [rsi + rcx*4] - lea rdi, [rdi + rcx*4] - neg rcx - jz .out ; No pixels at all? -> .out - - push rdx ; Save callee-save register. -.loop: - ; Load the next source pixel. - mov eax, [rsi + rcx*4] - -%if ASHIFT == 24 - ; Extract alpha and look up the reciprocal. - mov ebx, eax - mov ebp, eax ; Initialise result pixel register. - and ebp, 0xFF000000 ; Mask off non-alpha from result pix. - jz .next - shr ebx, 24 ; Load alpha. - mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. - - ; Do the component from bits 0..7. - mov edx, eax - and edx, 255 ; Extract the next component. - shr eax, 8 ; Shift it out. - imul edx, ebx ; Divide for a result in 8.16 fixed pt. - shr edx, 16 ; Truncate and move to bits 0..7. - or ebp, edx ; Merge into result pixel. - - ; Do the component from bits 8..15. - mov edx, eax - and edx, 255 ; Extract the next component. - shr eax, 8 ; Shift it out. - imul edx, ebx ; Divide for a result in 8.16 fixed pt. - and edx, 0x00FF0000 ; Truncate fraction. - shr edx, 8 ; Move to bits 8..15. - or ebp, edx ; Merge into result pixel. 
- - ; Do the component from bits 16..23. - and eax, 255 ; Mask off alpha. - imul eax, ebx ; Divide for a result in 8.16 fixed pt. - and eax, 0x00FF0000 ; Truncate fraction. - or ebp, eax ; Merge into result pixel. - -%elif ASHIFT == 0 - ; Extract alpha and loop up the reciprocal. - mov ebx, eax - shr eax, 8 ; Shift out alpha. - and ebp, 255 ; Mask off non-alpha. - mov ebx, ebp ; Initialise result pixel. - jz .next - mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. - - ; Do the component from bits 8..15. - mov edx, eax - shr eax, 8 - and edx, 255 - imul edx, ebx - and edx, 0x00FF0000 - shr edx, 8 - or ebp, edx - - ; Do the component from bits 16..23 - mov edx, eax - shr eax, 8 - and edx, 255 - imul edx, ebx - and edx, 0x00FF0000 - or ebp, edx - - ; Do the component from bits 24..31. - imul eax, ebx - and eax, 0x00FF0000 - shl eax, 8 - or ebp, eax -%endif -.next: - ; Write the result pixel. - mov [rdi + rcx*4], ebp - - inc rcx - jnz .loop - - pop rdx ; Restore callee-save reg. -.out: - ret - -%macro unpremultiply_pixel_blocks 1 -;; Faster version that does it in blocks of four pixels at a time. -;; The macro is parameterised on the instruction used to move -;; an XMM register to memory. -;; -;; In: -;; uint32_t *src/rdi: Destination pixels. -;; uint32_t *dst/rsi: Source pixels. -;; num_pixels/rdx: # pixels to unpremultiply. Only -;; floor(num_pixels/4) will be. -;; -;; %1: Instruction used to write an xmm reg to dst. -;; -;; Out: -;; rcx: num_pixels mod 4 = # leftover pixels. -;; rdi: rdi + 16*floor(num_pixels/4); advanced past dst. -;; rsi: rsi + 16*floor(num_pixels/4); advanced past src. -;; -;; Scratched: xmm1-xmm4, rax-rdx, rbx - ; Advance the src and dst pointers to the end. The bias - ; of +-15 is used to have the loop condition trigger an exit - ; just before we access the last incomplete block. - shl rdx, 2 ; Size in bytes. 
- lea rsi, [rsi + rdx - 15] - lea rdi, [rdi + rdx - 15] - neg rdx - add rdx, 15 ; Offset to the last byte of the - ; first block from the end. - jmp %%test_cc - align 16 -%%loop: - ; Load four pixels into xmm1. The prefetchnta here - ; hides the difference between movdqa vs. movdqu for - ; aligned input. - prefetchnta [rsi + rdx + 64*8] ; TODO: check the prefetch dist? - movdqu xmm1, [rsi + rdx] - - ; Expand the 8 bit components into 16 bit ones in - ; two registers. - movdqa xmm2, xmm1 - punpckhbw xmm2, xmm2 - punpcklbw xmm1, xmm1 - - ; Load alphas into registers. - movzx eax, byte [rsi + rdx + ASHIFT/8 + 0] - movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4] - movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8] - movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12] - - ; Fetch multplier vectors for each pixel based on the alphas - ; into the xmm3/xmm4 registers. - movq xmm3, [reciprocal_table_Q + 8*eax] - movq xmm4, [reciprocal_table_Q + 8*ecx] - movhpd xmm3, [reciprocal_table_Q + 8*ebx] - movhpd xmm4, [reciprocal_table_Q + 8*ebp] - - ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. - ; Treating the components as 0.16 bit fixed point, the pmulhuw - ; leaves the integer part of x*255/a in the result for the colour - ; components x in (r,g,b) but leaves the alphas alone. - pmulhuw xmm1, xmm3 - pmulhuw xmm2, xmm4 - - ; Pack the four resulting pixels from 16 to 8 bit components. - ; Here we saturate the result in case the input was superluminant. - packuswb xmm1, xmm2 - - ; Write the result. - %1 [rdi + rdx], xmm1 - - ; Increment to the next pixel. When this add overflows to >= 0 - ; then the next read of a block would venture past the end of - ; the buffer. - add rdx, 16 -%%test_cc: - jnc %%loop - - ; Offset the pointers back to the last incomplete block. - lea rsi, [rsi + rdx] - lea rdi, [rdi + rdx] - - ; Compute the # leftover pixels. - lea rcx, [rdx - 15] - neg rcx - and rcx, 15 ; # bytes leftover. - shr rcx, 2 ; # pixels leftover. 
-%endmacro - -global unpremultiply_with_sse2_test - -unpremultiply_with_sse2_test: -;; -;; void unpremultiply_with_sse2_test( -;; uint32_t *dst/rdi, -;; uint32_t const *src/rsi, -;; ulong n/rdx); -;; -;; This is the main entry point callable from the outside. -;; The calling convention used here is the ELF64 one. -;; - ; Save callee-saved registers. - push rbp - push rbx - - ; Save start of dst for alignment tests later. - mov rcx, rdi - - ; If we don't have enough pixels for at least a few iterations - ; of blocked unpremultiplication then do the pixels one at a time. - cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks. - jae .do_blocked - mov rcx, rdx ; Pixel count. - call unpremultiply_single_pixels - jmp .out - -.do_blocked: - ; If the destination pointer isn't even aligned to uint32_t - ; then we can't align it to 0 mod 16 using single pixels. - test rcx, 3 - jz .can_align_dst - unpremultiply_pixel_blocks movdqu - jmp .do_leftovers - -.can_align_dst: - ; Align the destination pointer to 0 mod 16 by - ; doing 0..3 single pixels. - neg rcx - and rcx, 15 ; # bytes to align to 16. - shr rcx, 2 ; # pixels to align to 16. - sub rdx, rcx - call unpremultiply_single_pixels - - ; If the source and dest are exactly aliased or - ; the image is fairly small then use movdqa writes. - cmp rdi, rsi ; Use movdqa for aliased src, dst. - jz .1 - cmp rdx, 8192 ; ... or if the src and dest are small. - jc .1 - unpremultiply_pixel_blocks movntdq - jmp .do_leftovers -.1: - unpremultiply_pixel_blocks movdqa - -.do_leftovers: - call unpremultiply_single_pixels -.out: - pop rbx - pop rbp - ret diff --git a/unpremultiply-sse2.S b/unpremultiply-sse2.S index 1b699cc..e0650a3 100644 --- a/unpremultiply-sse2.S +++ b/unpremultiply-sse2.S @@ -1,17 +1,33 @@ +;;; +;;; Unpremultiply routine for SSE2/AMD64. +;;; +;;; This file exports a function unpremultiply_with_sse2() that +;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels. 
+;;; section .text +; We're only using rax-rbp in this file so that +; conversion to 32 bit SSE2 would be easier by +; updating the register names and the +; argument extraction to the calling convention. + +; Location of alpha in a 32 bit pixel. Alpha measures opaqueness. %define ASHIFT 24 ;%define ASHIFT 0 -%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d)) - -; Reciprocal table with 64 bit entries of a 4x16 vector -; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format. +;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors +;; of the form +;; +;; (1.0, 255/i, 255/i, 255/i) for ASHIFT=0 +;; (255/i, 255/i, 255/i, 1.0) for ASHIFT=24 +;; +;; in 8.8 bit fixed point format. + align 16 reciprocal_table_Q: dq 0 %assign i 1 -%rep 255 -%assign recip ((255*256 + i-1) / i) +%rep 255 +%assign recip 255*256 / i %if ASHIFT == 0 dw 256, recip, recip, recip %elif ASHIFT==24 @@ -20,91 +36,264 @@ reciprocal_table_Q: %assign i i+1 %endrep -; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12. -; The pixels must not be supersaturated. -; Output: xmm1: u8[], Four unpremultiplied pixels. -; Invariant: -; xmm0: 0 -; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) -; Scratch: xmm2, rax, rbx, rcx, rdx -%macro unpremultiply_xmm1 0 +;; Reciprocal table with 32 bit entries of ceil(255/i) in +;; 16.16 bit fixed point. +reciprocal_table_D: + dd 0 +%assign i 1 +%rep 255 +%assign recip (255*65536 + i-1) / i + dd recip +%assign i i+1 +%endrep + +unpremultiply_single_pixels: +;; Slower version for the odd pixels at the ends. +;; +;; In: +;; uint32_t *dst/rdi: Destination pixels. +;; uint32_t *src/rsi: Source pixels. +;; num_pixels/rcx: # pixels to unpremultiply. +;; +;; Out: +;; rdi: dst + 4*num_pixels; advanced past dst. +;; rsi: src + 4*num_pixels; advanced past src. +;; +;; Saved: rdx +;; Scratched: rax-rcx, rbp + ; Advance src/dst pointers to the end and setup iteration + ; from -num_pixels up to 0. 
+ lea rsi, [rsi + rcx*4] + lea rdi, [rdi + rcx*4] + neg rcx + jz .out ; No pixels at all? -> .out + + push rdx ; Save callee-save register. +.loop: + ; Load the next source pixel. + mov eax, [rsi + rcx*4] + +%if ASHIFT == 24 + ; Extract alpha and look up the reciprocal. + mov ebx, eax + mov ebp, eax ; Initialise result pixel register. + and ebp, 0xFF000000 ; Mask off non-alpha from result pix. + jz .next + shr ebx, 24 ; Load alpha. + mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. + + ; Do the component from bits 0..7. + mov edx, eax + and edx, 255 ; Extract the next component. + shr eax, 8 ; Shift it out. + imul edx, ebx ; Divide for a result in 8.16 fixed pt. + shr edx, 16 ; Truncate and move to bits 0..7. + or ebp, edx ; Merge into result pixel. + + ; Do the component from bits 8..15. + mov edx, eax + and edx, 255 ; Extract the next component. + shr eax, 8 ; Shift it out. + imul edx, ebx ; Divide for a result in 8.16 fixed pt. + and edx, 0x00FF0000 ; Truncate fraction. + shr edx, 8 ; Move to bits 8..15. + or ebp, edx ; Merge into result pixel. + + ; Do the component from bits 16..23. + and eax, 255 ; Mask off alpha. + imul eax, ebx ; Divide for a result in 8.16 fixed pt. + and eax, 0x00FF0000 ; Truncate fraction. + or ebp, eax ; Merge into result pixel. + +%elif ASHIFT == 0 + ; Extract alpha and look up the reciprocal. + mov ebx, eax + shr eax, 8 ; Shift out alpha. + and ebp, 255 ; Mask off non-alpha. + mov ebx, ebp ; Initialise result pixel. + jz .next + mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal. + + ; Do the component from bits 8..15. + mov edx, eax + shr eax, 8 + and edx, 255 + imul edx, ebx + and edx, 0x00FF0000 + shr edx, 8 + or ebp, edx + + ; Do the component from bits 16..23 + mov edx, eax + shr eax, 8 + and edx, 255 + imul edx, ebx + and edx, 0x00FF0000 + or ebp, edx + + ; Do the component from bits 24..31. + imul eax, ebx + and eax, 0x00FF0000 + shl eax, 8 + or ebp, eax +%endif +.next: + ; Write the result pixel.
+ mov [rdi + rcx*4], ebp + + inc rcx + jnz .loop + + pop rdx ; Restore callee-save reg. +.out: + ret + +%macro unpremultiply_pixel_blocks 1 +;; Faster version that does it in blocks of four pixels at a time. +;; The macro is parameterised on the instruction used to move +;; an XMM register to memory. +;; +;; In: +;; uint32_t *dst/rdi: Destination pixels. +;; uint32_t *src/rsi: Source pixels. +;; num_pixels/rdx: # pixels to unpremultiply. Only +;; floor(num_pixels/4) will be unpremultiplied. +;; +;; %1: Instruction used to write an xmm reg to dst. +;; +;; Out: +;; rcx: num_pixels mod 4 = # leftover pixels. +;; rdi: rdi + 16*floor(num_pixels/4); advanced past dst. +;; rsi: rsi + 16*floor(num_pixels/4); advanced past src. +;; +;; Scratched: xmm1-xmm4, rax-rdx, rbx + ; Advance the src and dst pointers to the end. The bias + ; of +-15 is used to have the loop condition trigger an exit + ; just before we access the last incomplete block. + shl rdx, 2 ; Size in bytes. + lea rsi, [rsi + rdx - 15] + lea rdi, [rdi + rdx - 15] + neg rdx + add rdx, 15 ; Offset to the last byte of the + ; first block from the end. + jmp %%test_cc + align 16 +%%loop: + ; Load four pixels into xmm1. The prefetchnta here + ; hides the difference between movdqa vs. movdqu for + ; aligned input. + prefetchnta [rsi + rdx + 64*8] ; TODO: check the prefetch dist? + movdqu xmm1, [rsi + rdx] + ; Expand the 8 bit components into 16 bit ones in ; two registers. movdqa xmm2, xmm1 - punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a) - punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a) + punpckhbw xmm2, xmm2 + punpcklbw xmm1, xmm1 - ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. - pextrw ebx, xmm1, 4+ASHIFT/8 ; Extract pixel alphas into registers. - pextrw edx, xmm2, 4+ASHIFT/8 - pextrw eax, xmm1, 0+ASHIFT/8 - pextrw ecx, xmm2, 0+ASHIFT/8 - movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers - movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs. + ; Load alphas into registers.
+ movzx eax, byte [rsi + rdx + ASHIFT/8 + 0] + movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4] + movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8] + movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12] + + ; Fetch multiplier vectors for each pixel based on the alphas + ; into the xmm3/xmm4 registers. movq xmm3, [reciprocal_table_Q + 8*eax] movq xmm4, [reciprocal_table_Q + 8*ecx] - pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper. - pshufd xmm6, xmm6, SELECT(1,0,3,2) - por xmm3, xmm5 ; Combine 64 bit upper and lower - por xmm4, xmm6 ; into 128 bit multipliers. - pmullw xmm1, xmm3 ; Multiply components by coefs. - pmullw xmm2, xmm4 ; to produce 8.8 fp components. - psrlw xmm1, 8 ; Take floor of components. - psrlw xmm2, 8 + movhpd xmm3, [reciprocal_table_Q + 8*ebx] + movhpd xmm4, [reciprocal_table_Q + 8*ebp] + + ; Do the unpremultiply in-place in the pixels in xmm1, xmm2. + ; Treating the components as 0.16 bit fixed point, the pmulhuw + ; leaves the integer part of x*255/a in the result for the colour + ; components x in (r,g,b) but leaves the alphas alone. + pmulhuw xmm1, xmm3 + pmulhuw xmm2, xmm4 ; Pack the four resulting pixels from 16 to 8 bit components. + ; Here we saturate the result in case the input was superluminant. packuswb xmm1, xmm2 -%endmacro ; Input: ; %1: movdqa or movdqu, depending on the alignment of rsi and rdi. ; r8: number of times N > 0 to loop. ; rsi: uint32_t[4*N]: source pixels. ; rdi: uint32_t[4*N]: destination pixels. ; Invariant: ; xmm0: 0 ; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256) ; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9 %macro unpremultiply_loop_with 1 - xor r9,r9 ; index register. - align 16 -%%loop: - prefetchnta [rsi + r9*8 + 16*64] - %1 xmm1, [rsi+r9*8] - unpremultiply_xmm1 - movntdq [rdi+r9*8], xmm1 - - add r9, 2 - dec r8 - jnz %%loop + ; Write the result. + %1 [rdi + rdx], xmm1 + + ; Increment to the next pixel. When this add overflows to >= 0 + ; then the next read of a block would venture past the end of + ; the buffer.
+ add rdx, 16 +%%test_cc: + jnc %%loop + + ; Offset the pointers back to the last incomplete block. + lea rsi, [rsi + rdx] + lea rdi, [rdi + rdx] + + ; Compute the # leftover pixels. + lea rcx, [rdx - 15] + neg rcx + and rcx, 15 ; # bytes leftover. + shr rcx, 2 ; # pixels leftover. %endmacro -;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx) -;; global unpremultiply_with_sse2 -unpremultiply_with_sse2: - mov r8, rdx - shr r8, 2 ; TODO: left over pixels. - test r8, r8 - jnz .setup_invariants - ret -.setup_invariants: +unpremultiply_with_sse2: +;; +;; void unpremultiply_with_sse2( +;; uint32_t *dst/rdi, +;; uint32_t const *src/rsi, +;; ulong n/rdx); +;; +;; This is the main entry point callable from the outside. +;; The calling convention used here is the ELF64 one. +;; + ; Save callee-saved registers. + push rbp push rbx - pxor xmm0, xmm0 ; constant zero for unpacking. - ; Decide on whether to use movdqu or movdqa based on source - ; alignment. We always use mvntdq to write the dest. - test rsi, 15 - jz .aligned_case - jmp .unaligned_case + ; Save start of dst for alignment tests later. + mov rcx, rdi -.aligned_case: - unpremultiply_loop_with movdqa - pop rbx - ret + ; If we don't have enough pixels for at least a few iterations + ; of blocked unpremultiplication then do the pixels one at a time. + cmp rdx, 3+4*4+3 ; Max. pre/post align + 4 blocks. + jae .do_blocked + mov rcx, rdx ; Pixel count. + call unpremultiply_single_pixels + jmp .out + +.do_blocked: + ; If the destination pointer isn't even aligned to uint32_t + ; then we can't align it to 0 mod 16 using single pixels. + test rcx, 3 + jz .can_align_dst + unpremultiply_pixel_blocks movdqu + jmp .do_leftovers + +.can_align_dst: + ; Align the destination pointer to 0 mod 16 by + ; doing 0..3 single pixels. + neg rcx + and rcx, 15 ; # bytes to align to 16. + shr rcx, 2 ; # pixels to align to 16. 
+ sub rdx, rcx + call unpremultiply_single_pixels + + ; If the source and dest are exactly aliased or + ; the image is fairly small then use movdqa writes. + cmp rdi, rsi ; Use movdqa for aliased src, dst. + jz .1 + cmp rdx, 8192 ; ... or if the src and dest are small. + jc .1 + unpremultiply_pixel_blocks movntdq + jmp .do_leftovers +.1: + unpremultiply_pixel_blocks movdqa -.unaligned_case: - unpremultiply_loop_with movdqu +.do_leftovers: + call unpremultiply_single_pixels +.out: pop rbx + pop rbp ret diff --git a/unpremultiply.c b/unpremultiply.c index 2f2b422..6d904d7 100644 --- a/unpremultiply.c +++ b/unpremultiply.c @@ -1,8 +1,6 @@ #define RUN_ME /* nasm -g -f elf64 unpremultiply-sse2.S -nasm -g -f elf64 unpremultiply-sse2-test.S -nasm -g -f elf64 unpremultiply-sse2-float.S -gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2*.o $0 +gcc -W -Wall -Wextra -std=c99 -fomit-frame-pointer -funroll-all-loops -O3 -g -o `basename $0 .c` unpremultiply-sse2.o $0 exit $? */ #include <assert.h> @@ -32,8 +30,6 @@ exit $? 
#define BMASK (255 << BSHIFT) void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n); -void unpremultiply_with_sse2_test(uint32_t *dst, uint32_t const *src, size_t n); -void unpremultiply_with_sse2_float(uint32_t *dst, uint32_t const *src, size_t n); static void __attribute__((noinline)) unpremultiply_with_div(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) @@ -500,8 +496,6 @@ main(int argc, char **argv) 0 == strcmp(argv[i], "inv32-nocache") || 0 == strcmp(argv[i], "inv64-nocache") || 0 == strcmp(argv[i], "sse2") || - 0 == strcmp(argv[i], "sse2-test") || - 0 == strcmp(argv[i], "sse2-float") || 0 == strcmp(argv[i], "copy") || 0 == strcmp(argv[i], "read") || 0 == strcmp(argv[i], "write") || @@ -577,16 +571,6 @@ main(int argc, char **argv) unpremultiply_with_sse2(dst, src, n); } } - else if (0 == strcmp(method, "sse2-test")) { - while (nloops-- > 0) { - unpremultiply_with_sse2_test(dst, src, n); - } - } - else if (0 == strcmp(method, "sse2-float")) { - while (nloops-- > 0) { - unpremultiply_with_sse2_float(dst, src, n); - } - } else if (0 == strcmp(method, "noop")) { /* do nothing. */ } else { |