;;;
;;; Copyright (c) 2009 M Joonas Pihlaja
;;;
;;; Permission is hereby granted, free of charge, to any person
;;; obtaining a copy of this software and associated documentation
;;; files (the "Software"), to deal in the Software without
;;; restriction, including without limitation the rights to use,
;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
;;; copies of the Software, and to permit persons to whom the
;;; Software is furnished to do so, subject to the following
;;; conditions:
;;;
;;; The above copyright notice and this permission notice shall be
;;; included in all copies or substantial portions of the Software.
;;;
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
;;; OTHER DEALINGS IN THE SOFTWARE.
;;;
;;; Unpremultiply routine for SSE2/AMD64.
;;;
;;; This file exports a function unpremultiply_with_sse2() that
;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels.
;;;
;;; void unpremultiply_with_sse2(
;;;         uint32_t *dst,
;;;         uint32_t const *src,
;;;         unsigned long num_pixels);
;;;
;;; Assembled with nasm 2.06rc2.
;;;
        section .text

; We're only using rax-rbp in this file so that
; conversion to 32 bit SSE2 would be easier by
; updating the register names and the
; argument extraction to match the calling convention.

; Location of alpha in a 32 bit pixel.  Alpha measures opaqueness.
%define ASHIFT 24
;%define ASHIFT 0

;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
;; of the form
;;
;;   (1.0, 255/i, 255/i, 255/i)    for ASHIFT=0
;;   (255/i, 255/i, 255/i, 1.0)    for ASHIFT=24
;;
;; in 8.8 bit fixed point format.
        align 16
reciprocal_table_Q:
        dq 0
%assign i 1
%rep 255
%assign recip 255*256 / i
%if ASHIFT == 0
        dw 256, recip, recip, recip
%elif ASHIFT == 24
        dw recip, recip, recip, 256
%endif
%assign i i+1
%endrep

;; Reciprocal table with 32 bit entries of ceil(255/i) in
;; 16.16 bit fixed point.
reciprocal_table_D:
        dd 0
%assign i 1
%rep 255
%assign recip (255*65536 + i-1) / i
        dd recip
%assign i i+1
%endrep

unpremultiply_single_pixels:
;; Slower version for the odd pixels at the ends.
;;
;; In:
;;   uint32_t *dst/rdi:   Destination pixels.
;;   uint32_t *src/rsi:   Source pixels.
;;   num_pixels/rcx:      # pixels to unpremultiply.
;;
;; Out:
;;   rdi:  dst + 4*num_pixels; advanced past dst.
;;   rsi:  src + 4*num_pixels; advanced past src.
;;
;; Saved: rdx
;; Scratched: rax-rcx, rbp

        ; Advance src/dst pointers to the end and set up iteration
        ; from -num_pixels up to 0.
        lea rsi, [rsi + rcx*4]
        lea rdi, [rdi + rcx*4]
        neg rcx
        jz .out                 ; No pixels at all? -> .out

        push rdx                ; rdx is caller-save in the ELF64 ABI, but
                                ; callers in this file rely on it surviving.

.loop:
        ; Load the next source pixel.
        mov eax, [rsi + rcx*4]
%if ASHIFT == 24
        ; Extract alpha and look up the reciprocal.
        mov ebx, eax
        mov ebp, eax            ; Initialise result pixel register.
        and ebp, 0xFF000000     ; Mask off non-alpha from result pix.
;       jz .next
        shr ebx, 24             ; Load alpha.
        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.

        ; Do the component from bits 0..7.
        mov edx, eax
        and edx, 255            ; Extract the next component.
        shr eax, 8              ; Shift it out.
        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
        shr edx, 16             ; Truncate and move to bits 0..7.
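        ; Worked example of the fixed point divide: for alpha a = 128
        ; the table entry is ceil(255*65536/128) = 130560, so a
        ; component x = 64 gives (64*130560) >> 16 = 8355840 >> 16
        ; = 127, the integer part of x*255/a = 127.5.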
        or ebp, edx             ; Merge into result pixel.

        ; Do the component from bits 8..15.
        mov edx, eax
        and edx, 255            ; Extract the next component.
        shr eax, 8              ; Shift it out.
        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
        and edx, 0x00FF0000     ; Truncate fraction.
        shr edx, 8              ; Move to bits 8..15.
        or ebp, edx             ; Merge into result pixel.

        ; Do the component from bits 16..23.
        and eax, 255            ; Mask off alpha.
        imul eax, ebx           ; Divide for a result in 8.16 fixed pt.
        and eax, 0x00FF0000     ; Truncate fraction.
        or ebp, eax             ; Merge into result pixel.
%elif ASHIFT == 0
        ; Extract alpha and look up the reciprocal.
        mov ebx, eax
        shr eax, 8              ; Shift out alpha.
        and ebx, 255            ; Mask off non-alpha.
        mov ebp, ebx            ; Initialise result pixel with the alpha.
        jz .next                ; Wholly transparent? -> skip the divides.
        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.

        ; Do the component from bits 8..15.
        mov edx, eax
        shr eax, 8
        and edx, 255
        imul edx, ebx
        and edx, 0x00FF0000
        shr edx, 8
        or ebp, edx

        ; Do the component from bits 16..23.
        mov edx, eax
        shr eax, 8
        and edx, 255
        imul edx, ebx
        and edx, 0x00FF0000
        or ebp, edx

        ; Do the component from bits 24..31.
        imul eax, ebx
        and eax, 0x00FF0000
        shl eax, 8
        or ebp, eax
%endif
.next:
        ; Write the result pixel.
        mov [rdi + rcx*4], ebp

        inc rcx
        jnz .loop

        pop rdx                 ; Restore rdx for the caller.
.out:
        ret

%macro unpremultiply_pixel_blocks 1
;; Faster version that does it in blocks of four pixels at a time.
;; The macro is parameterised on the instruction used to move
;; an XMM register to memory.
;;
;; In:
;;   uint32_t *dst/rdi:   Destination pixels.
;;   uint32_t *src/rsi:   Source pixels.
;;   num_pixels/rdx:      # pixels to unpremultiply.  Only
;;                        floor(num_pixels/4) of them will be done.
;;
;;   %1: Instruction used to write an xmm reg to dst.
;;
;; Out:
;;   rcx:  num_pixels mod 4 = # leftover pixels.
;;   rdi:  rdi + 16*floor(num_pixels/4); advanced past dst.
;;   rsi:  rsi + 16*floor(num_pixels/4); advanced past src.
;;
;; Scratched: xmm1-xmm4, rax-rdx, rbp

        ; Advance the src and dst pointers to the end.  The bias
        ; of +-15 is used to have the loop condition trigger an exit
        ; just before we access the last incomplete block.
        shl rdx, 2              ; Size in bytes.
        lea rsi, [rsi + rdx - 15]
        lea rdi, [rdi + rdx - 15]
        neg rdx
        add rdx, 15             ; Offset to the last byte of the
                                ; first block from the end.

        jmp %%test_cc

        align 16
%%loop:
        ; Load four pixels into xmm1.  The prefetchnta here
        ; hides the difference between movdqa vs. movdqu for
        ; aligned input.
        prefetchnta [rsi + rdx + 64*8]  ; TODO: check the prefetch dist?
        movdqu xmm1, [rsi + rdx]

        ; Expand the 8 bit components into 16 bit ones in
        ; two registers.
        movdqa xmm2, xmm1
        punpckhbw xmm2, xmm2
        punpcklbw xmm1, xmm1

        ; Load alphas into registers.
        movzx eax, byte [rsi + rdx + ASHIFT/8 + 0]
        movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4]
        movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8]
        movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12]

        ; Fetch multiplier vectors for each pixel based on the alphas
        ; into the xmm3/xmm4 registers.  (movzx has already
        ; zero-extended the alphas into the full 64 bit registers.)
        movq xmm3, [reciprocal_table_Q + 8*rax]
        movq xmm4, [reciprocal_table_Q + 8*rcx]
        movhpd xmm3, [reciprocal_table_Q + 8*rbx]
        movhpd xmm4, [reciprocal_table_Q + 8*rbp]

        ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
        ; Treating the components as 0.16 bit fixed point, the pmulhuw
        ; leaves the integer part of x*255/a in the result for the colour
        ; components x in (r,g,b) but leaves the alphas alone.
        pmulhuw xmm1, xmm3
        pmulhuw xmm2, xmm4
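        ; Worked example of the lanes: punpcklbw doubles a component
        ; byte x into the word x*0x101, and the colour lanes of the
        ; multiplier hold 255*256/a in 8.8 fixed point.  For a = 128,
        ; x = 64: (64*0x101 * 510) >> 16 = 8388480 >> 16 = 128, within
        ; one of the exact 64*255/128 = 127.5.  The alpha lane is
        ; multiplied by 256 (1.0 in 8.8), and (a*0x101 * 256) >> 16
        ; = (a*257) >> 8 = a for a < 256, so alpha passes through
        ; unchanged.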
        ; Pack the four resulting pixels from 16 to 8 bit components.
        ; Here we saturate the result in case the input was superluminant.
        packuswb xmm1, xmm2

        ; Write the result.
        %1 [rdi + rdx], xmm1

        ; Increment to the next pixel.  When this add overflows to >= 0
        ; then the next read of a block would venture past the end of
        ; the buffer.
        add rdx, 16
%%test_cc:
        jnc %%loop

        ; Offset the pointers back to the last incomplete block.
        lea rsi, [rsi + rdx]
        lea rdi, [rdi + rdx]

        ; Compute the # leftover pixels.
        lea rcx, [rdx - 15]
        neg rcx
        and rcx, 15             ; # bytes leftover.
        shr rcx, 2              ; # pixels leftover.
%endmacro

        global unpremultiply_with_sse2
unpremultiply_with_sse2:
;;
;; void unpremultiply_with_sse2(
;;         uint32_t *dst/rdi,
;;         uint32_t const *src/rsi,
;;         ulong n/rdx);
;;
;; This is the main entry point callable from the outside.
;; The calling convention used here is the ELF64 one.
;;
        ; Save callee-saved registers.
        push rbp
        push rbx

        ; Save start of dst for alignment tests later.
        mov rcx, rdi

        ; If we don't have enough pixels for at least a few iterations
        ; of blocked unpremultiplication then do the pixels one at a time.
        cmp rdx, 3+4*4+3        ; Max. pre/post align + 4 blocks.
        jae .do_blocked
        mov rcx, rdx            ; Pixel count.
        call unpremultiply_single_pixels
        jmp .out

.do_blocked:
        ; If the destination pointer isn't even aligned to uint32_t
        ; then we can't align it to 0 mod 16 using single pixels.
        test rcx, 3
        jz .can_align_dst

        unpremultiply_pixel_blocks movdqu
        jmp .do_leftovers

.can_align_dst:
        ; Align the destination pointer to 0 mod 16 by
        ; doing 0..3 single pixels.
        neg rcx
        and rcx, 15             ; # bytes to align to 16.
        shr rcx, 2              ; # pixels to align to 16.
        sub rdx, rcx
        call unpremultiply_single_pixels

        ; If the source and dest are exactly aliased then use movdqa
        ; writes, otherwise non-temporal movntdq.  (The small-image
        ; movdqa heuristic below is currently disabled.)
        cmp rdi, rsi            ; Use movdqa for aliased src, dst.
        jz .1
;       cmp rdx, 128            ; ... or if the src and dest are small.
;       jc .1

        unpremultiply_pixel_blocks movntdq
        jmp .do_leftovers
.1:
        unpremultiply_pixel_blocks movdqa

.do_leftovers:
        call unpremultiply_single_pixels
.out:
        pop rbx
        pop rbp
        ret
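
;; Example caller, as a sketch in C.  The prototype is the one given
;; at the top of this file; the wrapper name is illustrative only:
;;
;;   #include <stdint.h>
;;
;;   void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src,
;;                                unsigned long num_pixels);
;;
;;   static void unpremultiply_image(uint32_t *pixels, unsigned long n)
;;   {
;;       /* dst == src is fine: the entry point detects exact aliasing
;;        * and switches from movntdq to movdqa stores. */
;;       unpremultiply_with_sse2(pixels, pixels, n);
;;   }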