summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author	Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-03 04:23:57 +0200
committer	Joonas Pihlaja <jpihlaja@cc.helsinki.fi>	2009-01-03 04:23:57 +0200
commit3eb27ac1329e5c6335593d40bc9b77ae27884b38 (patch)
tree535acff22172da8938dc8c262651e9d324790977
parent109b7cdbd6e617b9f818bbc110c1f6841487eb27 (diff)
Remove the old unused SSE2 version.
-rw-r--r--	unpremultiply-sse2-old.S	165
1 file changed, 0 insertions, 165 deletions
diff --git a/unpremultiply-sse2-old.S b/unpremultiply-sse2-old.S
deleted file mode 100644
index d8c6209..0000000
--- a/unpremultiply-sse2-old.S
+++ /dev/null
@@ -1,165 +0,0 @@
- section .text
-
-%macro function 1
- global %1
-%1:
-%endmacro
-
-%define SELECT(a,b,c,d) ((a)*64 + (b)*16 + (c)*4 + (d))
-
-reciprocal_table:
- dd 0
-%assign i 1
-%rep 255
- dd (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
-%assign i i+1
-%endrep
-
-; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
-; The pixels must not be supersaturated.
-; Output: xmm1: u8[], Four unpremultiplied pixels.
-; Invariant:
-; xmm0: 0
-; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx
-%macro unpremultiply_xmm1_old 0
- ; Expand the 8 bit components into 16 bit ones in
- ; two registers.
- movdqa xmm2, xmm1
- punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
- punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
-
- ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
- pextrw eax, xmm1, 0 ; Extract low half reg alphas.
- pextrw ecx, xmm2, 0
- pextrw ebx, xmm1, 4 ; Extract high half reg alphas.
- pextrw edx, xmm2, 4
- mov eax, [reciprocal_table + 4*eax] ; Fetch 255/alpha
- mov ecx, [reciprocal_table + 4*ecx] ; as 8.8 fp numbers.
- mov ebx, [reciprocal_table + 4*ebx]
- mov edx, [reciprocal_table + 4*edx]
- pinsrw xmm3, eax, 1 ; Inject into coefficient regs.
- pinsrw xmm4, ecx, 1
- pshuflw xmm3, xmm3, SELECT(1,1,1,0) ; Replicate non-alpha coefs.
- pshuflw xmm4, xmm4, SELECT(1,1,1,0)
- pinsrw xmm3, ebx, 5
- pinsrw xmm4, edx, 5
- pshufhw xmm3, xmm3, SELECT(1,1,1,0)
- pshufhw xmm4, xmm4, SELECT(1,1,1,0)
- pmullw xmm1, xmm3 ; Multiply components by coefs.
- pmullw xmm2, xmm4 ; to produce 8.8 fp components.
- psrlw xmm1, 8 ; Take floor of components.
- psrlw xmm2, 8
-
- ; Pack the four resulting pixels from 16 to 8 bit components.
- packuswb xmm1, xmm2
-%endmacro
-
-; Reciprocal table with 64 bit entries of a 4x16 vector
-; (255/i, 255/i, 255/i, 1.0) in 8.8 fixed point format.
-reciprocal_table_Q:
- dq 0
-%assign i 1
-%rep 255
- dw 256 ; unity
- dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
- dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
- dw (255*256 + i-1) / i ; ceil(255/i) in 8.8 fixed point.
-%assign i i+1
-%endrep
-
-; Input: xmm1: u8[], Four pixels with alpha in slots 0, 4, 8, 12.
-; The pixels must not be supersaturated.
-; Output: xmm1: u8[], Four unpremultiplied pixels.
-; Invariant:
-; xmm0: 0
-; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx
-%macro unpremultiply_xmm1 0
- ; Expand the 8 bit components into 16 bit ones in
- ; two registers.
- movdqa xmm2, xmm1
- punpckhbw xmm2, xmm0 ; xmm2: (r,g,b,a|r,g,b,a)
- punpcklbw xmm1, xmm0 ; xmm1: (r,g,b,a|r,g,b,a)
-
- ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
- pextrw ebx, xmm1, 4 ; Extract pixel alphas into registers.
- pextrw edx, xmm2, 4
- pextrw eax, xmm1, 0
- pextrw ecx, xmm2, 0
- movq xmm5, [reciprocal_table_Q + 8*ebx] ; Fetch multipliers
- movq xmm6, [reciprocal_table_Q + 8*edx] ; into lower regs.
- movq xmm3, [reciprocal_table_Q + 8*eax]
- movq xmm4, [reciprocal_table_Q + 8*ecx]
- pshufd xmm5, xmm5, SELECT(1,0,3,2) ; Shuffle to upper.
- pshufd xmm6, xmm6, SELECT(1,0,3,2)
- por xmm3, xmm5 ; Combine 64 bit upper and lower
- por xmm4, xmm6 ; into 128 bit multipliers.
- pmullw xmm1, xmm3 ; Multiply components by coefs.
- pmullw xmm2, xmm4 ; to produce 8.8 fp components.
- psrlw xmm1, 8 ; Take floor of components.
- psrlw xmm2, 8
-
- ; Pack the four resulting pixels from 16 to 8 bit components.
- packuswb xmm1, xmm2
-%endmacro
-
-; Input:
-; %1: movdqa or movdqu, depending on the alignment of rsi and rdi.
-; r8: number of times N > 0 to loop.
-; rsi: uint32_t[4*N]: source pixels.
-; rdi: uint32_t[4*N]: destination pixels.
-; Invariant:
-; xmm0: 0
-; xmm3, xmm4: u16[] = (?,?,?,256,?,?,?,256)
-; Scratch: xmm2, rax, rbx, rcx, rdx, rsi, rdi, r8, r9
-%macro unpremultiply_loop_with 1
- xor r9,r9 ; index register.
- align 16
-%%loop:
- prefetchnta [rsi + r9*8 + 16*64]
- %1 xmm1, [rsi+r9*8]
- unpremultiply_xmm1
- movntdq [rdi+r9*8], xmm1
-
- add r9, 2
- dec r8
- jnz %%loop
-%endmacro
-
-;; void unpremultiply_with_sse2(uint32_t *dst/rdi, uint32_t const *src/rsi, ulong n/rdx)
-;;
-function unpremultiply_with_sse2
- mov r8, rdx
- shr r8, 2 ; TODO: left over pixels.
- test r8, r8
- jnz .setup_invariants
- ret
-
-.setup_invariants:
- push rbx
- pxor xmm0, xmm0 ; constant zero for unpacking.
-
- ; Setup component multiplier registers xmm3/xmm4 with
- ; 8.8 fixed point coefficients. The alpha component always
- ; has a coefficient of one.
- mov rax, 256*256+256 ; low word = 256 = 1.0 in 8.8 fp
- movd xmm3, eax
- pshufd xmm3, xmm3, SELECT(0,0,0,0)
- movdqa xmm4, xmm3
-
- ; Decide on whether to use movdqu or movdqa based on source
- ; alignment. We always use movntdq to write the dest.
- test rsi, 15
- jz .aligned_case
- jmp .unaligned_case
-
-.aligned_case:
- unpremultiply_loop_with movdqa
- pop rbx
- ret
-
-.unaligned_case:
- unpremultiply_loop_with movdqu
- pop rbx
- ret