author | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-03 04:23:57 +0200
---|---|---
committer | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-01-03 04:23:57 +0200
commit | 3eb27ac1329e5c6335593d40bc9b77ae27884b38 (patch) |
tree | 535acff22172da8938dc8c262651e9d324790977 |
parent | 109b7cdbd6e617b9f818bbc110c1f6841487eb27 (diff) |
Remove the old unused SSE2 version.
-rw-r--r-- | unpremultiply-sse2-old.S | 165
1 files changed, 0 insertions, 165 deletions
```diff
diff --git a/unpremultiply-sse2-old.S b/unpremultiply-sse2-old.S
deleted file mode 100644
index d8c6209..0000000
--- a/unpremultiply-sse2-old.S
+++ /dev/null
@@ -1,165 +0,0 @@
-        section .text
-
-%macro function 1
-        global %1
-%1:
-%endmacro
-
-%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
-
-reciprocal_table:
-        dd 0
-%assign i 1
-%rep 255
-        dd (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-%assign i i+1
-%endrep
-
-; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
-;        The pixels must not be supersaturated.
-; Output: xmm1: u8[], Four unpremultiplied pixels.
-; Invariant:
-;        xmm0: 0
-;        xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx
-%macro unpremultiply_xmm1_old 0
-        ; Expand the 8 bit components into 16 bit ones in
-        ; two registers.
-        movdqa xmm2, xmm1
-        punpckhbw xmm2, xmm0            ; xmm2: (r,g,b,a|r,g,b,a)
-        punpcklbw xmm1, xmm0            ; xmm1: (r,g,b,a|r,g,b,a)
-
-        ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
-        pextrw eax, xmm1, 0             ; Extract low half reg alphas.
-        pextrw ecx, xmm2, 0
-        pextrw ebx, xmm1, 4             ; Extract high half reg alphas.
-        pextrw edx, xmm2, 4
-        mov eax, [reciprocal_table + 4*eax]     ; Fetch 255/alpha
-        mov ecx, [reciprocal_table + 4*ecx]     ; as 8.8 fp numbers.
-        mov ebx, [reciprocal_table + 4*ebx]
-        mov edx, [reciprocal_table + 4*edx]
-        pinsrw xmm3, eax, 1             ; Inject into coefficient regs.
-        pinsrw xmm4, ecx, 1
-        pshuflw xmm3, xmm3, SELECT(1,1,1,0)     ; Replicate non-alpha coefs.
-        pshuflw xmm4, xmm4, SELECT(1,1,1,0)
-        pinsrw xmm3, ebx, 5
-        pinsrw xmm4, edx, 5
-        pshufhw xmm3, xmm3, SELECT(1,1,1,0)
-        pshufhw xmm4, xmm4, SELECT(1,1,1,0)
-        pmullw xmm1, xmm3               ; Multiply components by coefs.
-        pmullw xmm2, xmm4               ; to produce 8.8 fp components.
-        psrlw xmm1, 8                   ; Take floor of components.
-        psrlw xmm2, 8
-
-        ; Pack the four resulting pixels from 16 to 8 bit components.
-        packuswb xmm1, xmm2
-%endmacro
-
-; Reciprocal table with 64 bit entries of a 4x16 vector
-; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
-reciprocal_table_Q:
-        dq 0
-%assign i 1
-%rep 255
-        dw 256                          ; unity
-        dw (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-        dw (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-        dw (255*256 + i-1) / i          ; ceil(255/i) in 8.8 fixed point.
-%assign i i+1
-%endrep
-
-; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
-;        The pixels must not be supersaturated.
-; Output: xmm1: u8[], Four unpremultiplied pixels.
-; Invariant:
-;        xmm0: 0
-;        xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, xmm5, xmm6, rax, rbx, rcx, rdx
-%macro unpremultiply_xmm1 0
-        ; Expand the 8 bit components into 16 bit ones in
-        ; two registers.
-        movdqa xmm2, xmm1
-        punpckhbw xmm2, xmm0            ; xmm2: (r,g,b,a|r,g,b,a)
-        punpcklbw xmm1, xmm0            ; xmm1: (r,g,b,a|r,g,b,a)
-
-        ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
-        pextrw ebx, xmm1, 4             ; Extract pixel alphas into registers.
-        pextrw edx, xmm2, 4
-        pextrw eax, xmm1, 0
-        pextrw ecx, xmm2, 0
-        movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
-        movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
-        movq xmm3, [reciprocal_table_Q + 8*eax]
-        movq xmm4, [reciprocal_table_Q + 8*ecx]
-        pshufd xmm5, xmm5, SELECT(1,0,3,2)      ; Shuffle to upper.
-        pshufd xmm6, xmm6, SELECT(1,0,3,2)
-        por xmm3, xmm5                  ; Combine 64 bit upper and lower
-        por xmm4, xmm6                  ; into 128 bit multipliers.
-        pmullw xmm1, xmm3               ; Multiply components by coefs.
-        pmullw xmm2, xmm4               ; to produce 8.8 fp components.
-        psrlw xmm1, 8                   ; Take floor of components.
-        psrlw xmm2, 8
-
-        ; Pack the four resulting pixels from 16 to 8 bit components.
-        packuswb xmm1, xmm2
-%endmacro
-
-; Input:
-;        %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
-;        r8: number of times N > 0 to loop.
-;        rsi: uint32_t[4*N]: source pixels.
-;        rdi: uint32_t[4*N]: destination pixels.
-; Invariant:
-;        xmm0: 0
-;        xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
-%macro unpremultiply_loop_with 1
-        xor r9, r9                      ; index register.
-        align 16
-%%loop:
-        prefetchnta [rsi + r9*8 + 16*64]
-        %1 xmm1, [rsi+r9*8]
-        unpremultiply_xmm1
-        movntdq [rdi+r9*8], xmm1
-
-        add r9, 2
-        dec r8
-        jnz %%loop
-%endmacro
-
-;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
-;;
-function unpremultiply_with_sse2
-        mov r8, rdx
-        shr r8, 2                       ; TODO: leftover pixels.
-        test r8, r8
-        jnz .setup_invariants
-        ret
-
-.setup_invariants:
-        push rbx
-        pxor xmm0, xmm0                 ; constant zero for unpacking.
-
-        ; Set up component multiplier registers xmm3/xmm4 with
-        ; 8.8 fixed point coefficients. The alpha component always
-        ; has a coefficient of one.
-        mov rax, 256*256+256            ; unity in 8.8 fp
-        movd xmm3, eax
-        pshufd xmm3, xmm3, SELECT(0,0,0,0)
-        movdqa xmm4, xmm3
-
-        ; Decide on whether to use movdqu or movdqa based on source
-        ; alignment. We always use movntdq to write the dest.
-        test rsi, 15
-        jz .aligned_case
-        jmp .unaligned_case
-
-.aligned_case:
-        unpremultiply_loop_with movdqa
-        pop rbx
-        ret
-
-.unaligned_case:
-        unpremultiply_loop_with movdqu
-        pop rbx
-        ret
```
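
For readers who want the algorithm without the SSE2 plumbing: both deleted macros compute the same thing, and it reads more simply in scalar form. The C sketch below is purely illustrative and not part of this repository; the names `build_reciprocal_table` and `unpremultiply_scalar` are invented, and it assumes little-endian packing with alpha in the low byte of each 32-bit pixel, matching the "alpha in slots 0, 4, 8, 12" layout described in the macro comments.

```c
#include <stdint.h>

/* Hypothetical scalar reference for the deleted routine (not in the repo).
 * Same table as the assembly: entry i holds ceil(255/i) in 8.8 fixed point,
 * so (component * table[alpha]) >> 8 approximates component*255/alpha, hits
 * exactly 255 when component == alpha, and stays within 16 bits as long as
 * the pixel is not supersaturated (component <= alpha). */
static uint16_t reciprocal_table[256];

static void
build_reciprocal_table(void)
{
    reciprocal_table[0] = 0;                        /* alpha 0 maps to 0. */
    for (int i = 1; i < 256; i++)
        reciprocal_table[i] = (255*256 + i - 1) / i; /* ceil(255/i), 8.8 fp */
}

static void
unpremultiply_scalar(uint32_t *dst, uint32_t const *src, unsigned long n)
{
    for (unsigned long k = 0; k < n; k++) {
        uint32_t p = src[k];
        uint32_t a = p & 0xff;                  /* alpha in the low byte */
        uint32_t coef = reciprocal_table[a];
        uint32_t c1 = ((p >>  8 & 0xff) * coef) >> 8; /* multiply in 8.8 fp, */
        uint32_t c2 = ((p >> 16 & 0xff) * coef) >> 8; /* then take the floor, */
        uint32_t c3 = ((p >> 24 & 0xff) * coef) >> 8; /* like pmullw + psrlw. */
        dst[k] = a | c1 << 8 | c2 << 16 | c3 << 24;
    }
}
```

Rounding the reciprocal up (ceil rather than round-to-nearest) is what makes a fully opaque component (component == alpha) come out as exactly 255 after the shift; the non-supersaturation precondition is what keeps each 8.8 product below 2^16, which the vector code depends on since pmullw keeps only the low 16 bits of each product.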