From bca43fc5c350dece7cac8a12a237e5bf37a666ed Mon Sep 17 00:00:00 2001
From: M Joonas Pihlaja
Date: Tue, 13 Jan 2009 11:10:44 +0200
Subject: [sse2-test] sync it.

---
 unpremultiply-sse2-test.S | 292 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 247 insertions(+), 45 deletions(-)

diff --git a/unpremultiply-sse2-test.S b/unpremultiply-sse2-test.S
index a0dccf4..4d45f7a 100644
--- a/unpremultiply-sse2-test.S
+++ b/unpremultiply-sse2-test.S
@@ -1,21 +1,30 @@
 section .text
 
+;;;
+;;; Unpremultiply routine for SSE2/AMD64.
+;;;
+; We're only using rax-rbp in this file so that
+; conversion to 32 bit SSE2 would be easier by
+; updating the register names and the
+; argument extraction to the calling convention.
+
+; Location of alpha in a 32 bit pixel.
 %define ASHIFT 24
 ;%define ASHIFT 0
 
+;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
+;; of the form
+;;
+;;   (1.0, 255/i, 255/i, 255/i)   for ASHIFT=0
+;;   (255/i, 255/i, 255/i, 1.0)   for ASHIFT=24
+;;
+;; in 8.8 bit fixed point format.
 align 16
-; Reciprocal table with 64 bit entries of a 4x16 vectors
-; of the form
-;
-; (255/i, 255/i, 255/i, 1.0)   for ASHIFT=0
-; (255/i, 255/i, 255/i, 1.0)   for ASHIFT=24
-;
-; in 8.8 fixed point format.
 reciprocal_table_Q:
         dq 0
 %assign i 1
-%rep 255
-%assign recip (255*256 / i)
+%rep 255
+%assign recip 255*256 / i
 %if ASHIFT == 0
         dw 256, recip, recip, recip
 %elif ASHIFT==24
@@ -24,25 +33,153 @@ reciprocal_table_Q:
 %assign i i+1
 %endrep
 
-;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
+;; Reciprocal table with 32 bit entries of ceil(255/i) in
+;; 16.16 bit fixed point.
+reciprocal_table_D:
+        dd 0
+%assign i 1
+%rep 255
+%assign recip (255*65536 + i-1) / i
+        dd recip
+%assign i i+1
+%endrep
+
+unpremultiply_single_pixels:
+;; Slower version for the odd pixels at the beginning and
+;; end.
 ;;
-global unpremultiply_with_sse2_test
-unpremultiply_with_sse2_test:
-        mov r8, rdx
-        shr r8, 2               ; TODO: left over pixels.
-        test r8, r8
-        jnz .else
-        ret
-.else:
-        push rbx
-        xor r9,r9
-        align 16
+;; In:
+;;   uint32_t *dst/rdi:  Destination pixels.
+;;   uint32_t *src/rsi:  Source pixels.
+;;   num_pixels/rcx:     # pixels to unpremultiply.
+;;
+;; Out:
+;;   rdi: dst + 4*num_pixels; advanced past dst.
+;;   rsi: src + 4*num_pixels; advanced past src.
+;;
+;; Saved: rdx
+;; Scratched: rax-rcx, rbp
+        ; Advance src/dst pointers to the end and setup iteration
+        ; from -num_pixels up to 0.
+        lea rsi, [rsi + rcx*4]
+        lea rdi, [rdi + rcx*4]
+        neg rcx
+        jz .out                 ; No pixels at all? -> .out
+
+        push rdx                ; Preserve rdx for our caller.
 .loop:
+        ; Load the next source pixel.
+        mov eax, [rsi + rcx*4]
+
+%if ASHIFT == 24
+        ; Extract alpha and look up the reciprocal.
+        mov ebx, eax
+        mov ebp, eax            ; Initialise result pixel register.
+        shr ebx, 24             ; Load alpha.
+        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+        and ebp, 0xFF000000     ; Mask off non-alpha from result pix.
+
+        ; Do the component from bits 0..7.
+        mov edx, eax
+        and edx, 255            ; Extract the next component.
+        shr eax, 8              ; Shift it out.
+        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
+        shr edx, 16             ; Truncate and move to bits 0..7.
+        or ebp, edx             ; Merge into result pixel.
+
+        ; Do the component from bits 8..15.
+        mov edx, eax
+        and edx, 255            ; Extract the next component.
+        shr eax, 8              ; Shift it out.
+        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
+        and edx, 0x00FF0000     ; Truncate fraction.
+        shr edx, 8              ; Move to bits 8..15.
+        or ebp, edx             ; Merge into result pixel.
+
+        ; Do the component from bits 16..23.
+        and eax, 255            ; Mask off alpha.
+        imul eax, ebx           ; Divide for a result in 8.16 fixed pt.
+        and eax, 0x00FF0000     ; Truncate fraction.
+        or ebp, eax             ; Merge into result pixel.
+
+%elif ASHIFT == 0
+        ; Extract alpha and look up the reciprocal.
+        mov ebx, eax
+        shr eax, 8              ; Shift out alpha.
+        and ebx, 255            ; Mask off non-alpha.
+        mov ebp, ebx            ; Initialise result pixel.
+        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.
+
+        ; Do the component from bits 8..15.
+        mov edx, eax
+        shr eax, 8
+        and edx, 255
+        imul edx, ebx
+        and edx, 0x00FF0000
+        shr edx, 8
+        or ebp, edx
+
+        ; Do the component from bits 16..23.
+        mov edx, eax
+        shr eax, 8
+        and edx, 255
+        imul edx, ebx
+        and edx, 0x00FF0000
+        or ebp, edx
+
+        ; Do the component from bits 24..31.
+        imul eax, ebx
+        and eax, 0x00FF0000
+        shl eax, 8
+        or ebp, eax
+%endif
+        ; Write the result pixel.
+        mov [rdi + rcx*4], ebp
+
+        inc rcx
+        jnz .loop
+
+        pop rdx                 ; Restore rdx for our caller.
+.out:
+        ret
+
+%macro unpremultiply_pixel_blocks 1
+;; Faster version that does it in blocks of four pixels at a time.
+;; The macro is parameterised on the instruction used to move
+;; an XMM register to memory.
+;;
+;; In:
+;;   uint32_t *dst/rdi:  Destination pixels.
+;;   uint32_t *src/rsi:  Source pixels.
+;;   num_pixels/rdx:     # pixels to unpremultiply.  Only
+;;                       4*floor(num_pixels/4) of them are done.
+;;
+;;   %1:  Instruction used to write an xmm reg to dst.
+;;
+;; Out:
+;;   rcx: num_pixels mod 4 = # leftover pixels.
+;;   rdi: rdi + 16*floor(num_pixels/4); advanced past dst.
+;;   rsi: rsi + 16*floor(num_pixels/4); advanced past src.
+;;
+;; Scratched: xmm1-xmm4, rax-rdx, rbp
+
+        ; Advance the src and dst pointers to the end.  The bias
+        ; of +-15 is used to have the loop condition trigger an exit
+        ; just before we access the last incomplete block.
+        shl rdx, 2              ; Size in bytes.
+        lea rsi, [rsi + rdx - 15]
+        lea rdi, [rdi + rdx - 15]
+        neg rdx
+        add rdx, 15             ; Offset to the last byte of the
+                                ; first block from the end.
+        jmp %%test_cc
+        align 16
+%%loop:
         ; Load four pixels into xmm1.  The prefetchnta here
-        ; hides the difference between movdqa vs. movdqu on
+        ; hides the difference between movdqa vs. movdqu for
         ; aligned input.
-        prefetchnta [rsi + r9*8 + 16*64]
-        movdqu xmm1, [rsi+r9*8]
+        prefetchnta [rsi + rdx + 64*8]  ; TODO: check the prefetch dist?
+        movdqu xmm1, [rsi + rdx]
 
         ; Expand the 8 bit components into 16 bit ones in
         ; two registers.
@@ -50,44 +187,109 @@ unpremultiply_with_sse2_test:
         punpckhbw xmm2, xmm2
         punpcklbw xmm1, xmm1
 
-        ; Load alphas into GPRs.
-        movzx eax, byte [rsi + r9*8 + ASHIFT/8 + 0]
-        movzx ebx, byte [rsi + r9*8 + ASHIFT/8 + 4]
-        movzx ecx, byte [rsi + r9*8 + ASHIFT/8 + 8]
-        movzx edx, byte [rsi + r9*8 + ASHIFT/8 + 12]
+        ; Load alphas into registers.
+        movzx eax, byte [rsi + rdx + ASHIFT/8 + 0]
+        movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4]
+        movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8]
+        movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12]
 
-        ; Fetch component multipliers for each pixel based on the alphas
+        ; Fetch multiplier vectors for each pixel based on the alphas
         ; into the xmm3/xmm4 registers.
         movq xmm3, [reciprocal_table_Q + 8*eax]
         movq xmm4, [reciprocal_table_Q + 8*ecx]
         movhpd xmm3, [reciprocal_table_Q + 8*ebx]
-        movhpd xmm4, [reciprocal_table_Q + 8*edx]
+        movhpd xmm4, [reciprocal_table_Q + 8*ebp]
 
         ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
         ; Treating the components as 0.16 bit fixed point, the pmulhuw
         ; leaves the integer part of x*255/a in the result for the colour
-        ; components and alphas themselves for the alpha components.
+        ; components x in (r,g,b) but leaves the alphas alone.
         pmulhuw xmm1, xmm3
         pmulhuw xmm2, xmm4
 
         ; Pack the four resulting pixels from 16 to 8 bit components.
+        ; Here we saturate the result in case the input was superluminant.
         packuswb xmm1, xmm2
 
         ; Write the result.
-        ;  - When the destination is expected (say due to aliasing)
-        ;    to be in at least the L2 cache then the write should
-        ;    be done using movdqa or movdqu.
-        ;
-        ;  - Otherwise the destination won't be or fit in any cache level
-        ;    in which case we should use movntdq.
-
-;       movdqa [rdi+r9*8], xmm1
-        movntdq [rdi+r9*8], xmm1
-
-        add r9, 2
-        dec r8
-        jnz .loop
+        %1 [rdi + rdx], xmm1
+
+        ; Increment to the next pixel.  When this add overflows to >= 0
+        ; then the next read of a block would venture past the end of
+        ; the buffer.
+        add rdx, 16
+%%test_cc:
+        jnc %%loop
+
+        ; Offset the pointers back to the last incomplete block.
+        lea rsi, [rsi + rdx]
+        lea rdi, [rdi + rdx]
+
+        ; Compute the # leftover pixels.
+        lea rcx, [rdx - 15]
+        neg rcx
+        and rcx, 15             ; # bytes leftover.
+        shr rcx, 2              ; # pixels leftover.
+%endmacro
+
+global unpremultiply_with_sse2_test
+
+unpremultiply_with_sse2_test:
+;;
+;; void unpremultiply_with_sse2(
+;;      uint32_t *dst/rdi,
+;;      uint32_t const *src/rsi,
+;;      ulong n/rdx);
+;;
+;; This is the main entry point callable from the outside.
+;; The calling convention used here is the ELF64 one.
+;;
+        ; Save callee-saved registers.
+        push rbp
+        push rbx
+
+        ; Save start dst for alignment tests later.
+        mov rcx, rdi
+
+        ; If we don't have enough pixels for at least a few iterations
+        ; of blocked unpremultiplication then do the pixels one at a time.
+        cmp rdx, 3+4*4+3        ; Max. pre/post align + 4 blocks.
+        jae .do_blocked
+        mov rcx, rdx            ; Pixel count.
+        call unpremultiply_single_pixels
+        jmp .out
+
+.do_blocked:
+        ; If the destination pointer isn't even aligned to uint32_t
+        ; then we can't align it to 0 mod 16 using single pixels.
+        test rcx, 3
+        jz .can_align_dst
+        unpremultiply_pixel_blocks movdqu
+        jmp .do_leftovers
+
+.can_align_dst:
+        ; Align the destination pointer to 0 mod 16 by
+        ; doing 0..3 single pixels.
+        neg rcx
+        and rcx, 15             ; # bytes to align to 16.
+        shr rcx, 2              ; # pixels to align to 16.
+        sub rdx, rcx
+        call unpremultiply_single_pixels
+
+        ; If the source and dest are exactly aliased or
+        ; the image is fairly small then use movdqa writes.
+        cmp rdi, rsi            ; Use movdqa for aliased src, dst.
+        jz .1
+        cmp rdx, 8192           ; ... or if the src and dest are small.
+        jc .1
+        unpremultiply_pixel_blocks movntdq
+        jmp .do_leftovers
+.1:
+        unpremultiply_pixel_blocks movdqa
+.do_leftovers:
+        call unpremultiply_single_pixels
 .out:
         pop rbx
+        pop rbp
         ret
-- 
cgit v1.2.3
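
As a reading aid for the fixed-point arithmetic above, here is a rough C model of what
the scalar path (unpremultiply_single_pixels with ASHIFT == 24, alpha in the top byte)
computes per pixel. It is only an illustrative sketch: the table and function names are
made up here, not part of the patch, and the table mirrors the ceil(255*65536/i) entries
of reciprocal_table_D.

    #include <stdint.h>

    /* 16.16 fixed-point reciprocals, ceil(255*65536/i); entry 0 is 0 so that
       alpha == 0 maps every colour component to 0, as reciprocal_table_D does. */
    static uint32_t recip16_16[256];

    static void init_recip(void)
    {
        recip16_16[0] = 0;
        for (uint32_t i = 1; i < 256; i++)
            recip16_16[i] = (255u * 65536u + i - 1) / i;
    }

    /* Unpremultiply one pixel with alpha in bits 24..31 (ASHIFT == 24).
       Each colour component x becomes the integer part of x*255/a, truncated
       to 8 bits, while the alpha byte passes through unchanged. */
    static uint32_t unpremultiply_pixel(uint32_t p)
    {
        uint32_t a = p >> 24;
        uint32_t m = recip16_16[a];                    /* ~255/a in 16.16 fixed pt. */
        uint32_t b = (((p      ) & 255) * m >> 16) & 255;
        uint32_t g = (((p >>  8) & 255) * m >> 16) & 255;
        uint32_t r = (((p >> 16) & 255) * m >> 16) & 255;
        return (p & 0xFF000000u) | (r << 16) | (g << 8) | b;
    }

The blocked SSE2 path performs the same division four pixels at a time: reciprocal_table_Q
stores the per-alpha multipliers as 8.8 fixed-point vectors with 1.0 in the alpha lane, and
pmulhuw keeps the high 16 bits of each 16x16 bit product, i.e. the integer part of x*255/a
for the colour lanes while the alpha lane is left alone, as the patch's own comments describe.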