author | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-03 04:23:57 +0200
---|---|---
committer | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-03 04:23:57 +0200
commit | 3eb27ac1329e5c6335593d40bc9b77ae27884b38 (patch) |
tree | 535acff22172da8938dc8c262651e9d324790977 |
parent | 109b7cdbd6e617b9f818bbc110c1f6841487eb27 (diff) |
Remove the old unused SSE2 version.
-rw-r--r-- | unpremultiply-sse2-old.S | 165
1 files changed, 0 insertions, 165 deletions
```diff
diff --git a/unpremultiply-sse2-old.S b/unpremultiply-sse2-old.S
deleted file mode 100644
index d8c6209..0000000
--- a/unpremultiply-sse2-old.S
+++ /dev/null
@@ -1,165 +0,0 @@
-        section .text
-
-%macro function 1
-        global %1
-%1:
-%endmacro
-
-%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
-
-reciprocal_table:
-        dd 0
-%assign i 1
-%rep 255
-        dd (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-%assign i i+1
-%endrep
-
-; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
-;        The pixels must not be supersaturated.
-; Output: xmm1: u8[], Four unpremultiplied pixels.
-; Invariant:
-;        xmm0: 0
-;        xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx
-%macro unpremultiply_xmm1_old 0
-        ; Expand the 8 bit components into 16 bit ones in
-        ; two registers.
-        movdqa xmm2, xmm1
-        punpckhbw xmm2, xmm0            ; xmm2: (r,g,b,a|r,g,b,a)
-        punpcklbw xmm1, xmm0            ; xmm1: (r,g,b,a|r,g,b,a)
-
-        ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
-        pextrw eax, xmm1, 0             ; Extract low half reg alphas.
-        pextrw ecx, xmm2, 0
-        pextrw ebx, xmm1, 4             ; Extract high half reg alphas.
-        pextrw edx, xmm2, 4
-        mov eax, [reciprocal_table + 4*eax]     ; Fetch 255/alpha
-        mov ecx, [reciprocal_table + 4*ecx]     ; as 8.8 fp numbers.
-        mov ebx, [reciprocal_table + 4*ebx]
-        mov edx, [reciprocal_table + 4*edx]
-        pinsrw xmm3, eax, 1             ; Inject into coefficient regs.
-        pinsrw xmm4, ecx, 1
-        pshuflw xmm3, xmm3, SELECT(1,1,1,0)     ; Replicate non-alpha coefs.
-        pshuflw xmm4, xmm4, SELECT(1,1,1,0)
-        pinsrw xmm3, ebx, 5
-        pinsrw xmm4, edx, 5
-        pshufhw xmm3, xmm3, SELECT(1,1,1,0)
-        pshufhw xmm4, xmm4, SELECT(1,1,1,0)
-        pmullw xmm1, xmm3               ; Multiply components by coefs.
-        pmullw xmm2, xmm4               ; to produce 8.8 fp components.
-        psrlw xmm1, 8                   ; Take floor of components.
-        psrlw xmm2, 8
-
-        ; Pack the four resulting pixels from 16 to 8 bit components.
-        packuswb xmm1, xmm2
-%endmacro
-
-; Reciprocal table with 64 bit entries of a 4x16 vector
-; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
-reciprocal_table_Q:
-        dq 0
-%assign i 1
-%rep 255
-        dw 256                          ; unity
-        dw (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-        dw (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-        dw (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-%assign i i+1
-%endrep
-
-; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
-;        The pixels must not be supersaturated.
-; Output: xmm1: u8[], Four unpremultiplied pixels.
-; Invariant:
-;        xmm0: 0
-;        xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, xmm5, xmm6, rax, rbx, rcx, rdx
-%macro unpremultiply_xmm1 0
-        ; Expand the 8 bit components into 16 bit ones in
-        ; two registers.
-        movdqa xmm2, xmm1
-        punpckhbw xmm2, xmm0            ; xmm2: (r,g,b,a|r,g,b,a)
-        punpcklbw xmm1, xmm0            ; xmm1: (r,g,b,a|r,g,b,a)
-
-        ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
-        pextrw ebx, xmm1, 4             ; Extract pixel alphas into registers.
-        pextrw edx, xmm2, 4
-        pextrw eax, xmm1, 0
-        pextrw ecx, xmm2, 0
-        movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
-        movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
-        movq xmm3, [reciprocal_table_Q + 8*eax]
-        movq xmm4, [reciprocal_table_Q + 8*ecx]
-        pshufd xmm5, xmm5, SELECT(1,0,3,2)      ; Shuffle to upper.
-        pshufd xmm6, xmm6, SELECT(1,0,3,2)
-        por xmm3, xmm5                  ; Combine 64 bit upper and lower
-        por xmm4, xmm6                  ; into 128 bit multipliers.
-        pmullw xmm1, xmm3               ; Multiply components by coefs.
-        pmullw xmm2, xmm4               ; to produce 8.8 fp components.
-        psrlw xmm1, 8                   ; Take floor of components.
-        psrlw xmm2, 8
-
-        ; Pack the four resulting pixels from 16 to 8 bit components.
-        packuswb xmm1, xmm2
-%endmacro
-
-; Input:
-;        %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
-;        r8: number of times N > 0 to loop.
-;        rsi: uint32_t[4*N]: source pixels.
-;        rdi: uint32_t[4*N]: destination pixels.
-; Invariant:
-;        xmm0: 0
-;        xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
-%macro unpremultiply_loop_with 1
-        xor r9, r9                      ; index register.
-        align 16
-%%loop:
-        prefetchnta [rsi + r9*8 + 16*64]
-        %1 xmm1, [rsi+r9*8]
-        unpremultiply_xmm1
-        movntdq [rdi+r9*8], xmm1
-
-        add r9, 2
-        dec r8
-        jnz %%loop
-%endmacro
-
-;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
-;;
-function unpremultiply_with_sse2
-        mov r8, rdx
-        shr r8, 2                       ; TODO: leftover pixels.
-        test r8, r8
-        jnz .setup_invariants
-        ret
-
-.setup_invariants:
-        push rbx
-        pxor xmm0, xmm0                 ; constant zero for unpacking.
-
-        ; Set up component multiplier registers xmm3/xmm4 with
-        ; 8.8 fixed point coefficients. The alpha component always
-        ; has a coefficient of one.
-        mov rax, 256*256+256            ; unity in 8.8 fp
-        movd xmm3, eax
-        pshufd xmm3, xmm3, SELECT(0,0,0,0)
-        movdqa xmm4, xmm3
-
-        ; Decide on whether to use movdqu or movdqa based on source
-        ; alignment. We always use movntdq to write the dest.
-        test rsi, 15
-        jz .aligned_case
-        jmp .unaligned_case
-
-.aligned_case:
-        unpremultiply_loop_with movdqa
-        pop rbx
-        ret
-
-.unaligned_case:
-        unpremultiply_loop_with movdqu
-        pop rbx
-        ret
```
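
For readers who want the algorithm without the SSE2 plumbing: both deleted macros compute the same thing, and it reads more simply in scalar form. The C sketch below is purely illustrative and not part of this repository; the names `build_reciprocal_table` and `unpremultiply_scalar` are invented, and it assumes little-endian packing with alpha in the low byte of each 32-bit pixel, matching the "alpha in slots 0, 4, 8, 12" layout described in the macro comments.

```c
#include <stdint.h>

/* Hypothetical scalar reference for the deleted routine (not in the repo).
 * Same table as the assembly: entry i holds ceil(255/i) in 8.8 fixed point,
 * so (component * table[alpha]) >> 8 approximates component*255/alpha, hits
 * exactly 255 when component == alpha, and stays within 16 bits as long as
 * the pixel is not supersaturated (component <= alpha). */
static uint16_t reciprocal_table[256];

static void
build_reciprocal_table(void)
{
    reciprocal_table[0] = 0;                        /* alpha 0 maps to 0. */
    for (int i = 1; i < 256; i++)
        reciprocal_table[i] = (255*256 + i - 1) / i; /* ceil(255/i), 8.8 fp */
}

static void
unpremultiply_scalar(uint32_t *dst, uint32_t const *src, unsigned long n)
{
    for (unsigned long k = 0; k < n; k++) {
        uint32_t p = src[k];
        uint32_t a = p & 0xff;                  /* alpha in the low byte */
        uint32_t coef = reciprocal_table[a];
        uint32_t c1 = ((p >>  8 & 0xff) * coef) >> 8; /* multiply in 8.8 fp, */
        uint32_t c2 = ((p >> 16 & 0xff) * coef) >> 8; /* then take the floor, */
        uint32_t c3 = ((p >> 24 & 0xff) * coef) >> 8; /* like pmullw + psrlw. */
        dst[k] = a | c1 << 8 | c2 << 16 | c3 << 24;
    }
}
```

Rounding the reciprocal up (ceil rather than round-to-nearest) is what makes a fully opaque component (component == alpha) come out as exactly 255 after the shift; the non-supersaturation precondition is what keeps each 8.8 product below 2^16, which the vector code depends on since pmullw keeps only the low 16 bits of each product.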