;;;
;;; Copyright (c) 2009 M Joonas Pihlaja
;;;
;;; Permission is hereby granted, free of charge, to any person
;;; obtaining a copy of this software and associated documentation
;;; files (the "Software"), to deal in the Software without
;;; restriction, including without limitation the rights to use,
;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
;;; copies of the Software, and to permit persons to whom the
;;; Software is furnished to do so, subject to the following
;;; conditions:
;;;
;;; The above copyright notice and this permission notice shall be
;;; included in all copies or substantial portions of the Software.
;;;
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
;;; OTHER DEALINGS IN THE SOFTWARE.
;;;
;;; Unpremultiply routine for SSE2/AMD64.
;;;
;;; This file exports a function unpremultiply_with_sse2() that
;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels.
;;;
;;; void unpremultiply_with_sse2(
;;;         uint32_t *dst,
;;;         uint32_t const *src,
;;;         unsigned long num_pixels);
;;;
;;; Assembled with nasm 2.06rc2.
;;;
        section .text

; We're only using rax-rbp in this file so that
; conversion to 32 bit SSE2 would be easier by
; updating the register names and the
; argument extraction to match the calling convention.

; Location of alpha in a 32 bit pixel.  Alpha measures opaqueness.
%define ASHIFT 24
;%define ASHIFT 0

;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
;; of the form
;;
;;   (1.0, 255/i, 255/i, 255/i)    for ASHIFT=0
;;   (255/i, 255/i, 255/i, 1.0)    for ASHIFT=24
;;
;; in 8.8 bit fixed point format.
        align 16
reciprocal_table_Q:
        dq 0
%assign i 1
%rep 255
%assign recip 255*256 / i
%if ASHIFT == 0
        dw 256, recip, recip, recip
%elif ASHIFT == 24
        dw recip, recip, recip, 256
%endif
%assign i i+1
%endrep

;; Reciprocal table with 32 bit entries of ceil(255/i) in
;; 16.16 bit fixed point.
reciprocal_table_D:
        dd 0
%assign i 1
%rep 255
%assign recip (255*65536 + i-1) / i
        dd recip
%assign i i+1
%endrep

unpremultiply_single_pixels:
;; Slower version for the odd pixels at the ends.
;;
;; In:
;;   uint32_t *dst/rdi:   Destination pixels.
;;   uint32_t *src/rsi:   Source pixels.
;;   num_pixels/rcx:      # pixels to unpremultiply.
;;
;; Out:
;;   rdi:  dst + 4*num_pixels; advanced past dst.
;;   rsi:  src + 4*num_pixels; advanced past src.
;;
;; Saved: rdx
;; Scratched: rax-rcx, rbp

        ; Advance src/dst pointers to the end and set up iteration
        ; from -num_pixels up to 0.
        lea rsi, [rsi + rcx*4]
        lea rdi, [rdi + rcx*4]
        neg rcx
        jz .out                 ; No pixels at all? -> .out

        push rdx                ; rdx is caller-save in the ELF64 ABI, but
                                ; callers in this file rely on it surviving.

.loop:
        ; Load the next source pixel.
        mov eax, [rsi + rcx*4]
%if ASHIFT == 24
        ; Extract alpha and look up the reciprocal.
        mov ebx, eax
        mov ebp, eax            ; Initialise result pixel register.
        and ebp, 0xFF000000     ; Mask off non-alpha from result pix.
;       jz .next
        shr ebx, 24             ; Load alpha.
        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.

        ; Do the component from bits 0..7.
        mov edx, eax
        and edx, 255            ; Extract the next component.
        shr eax, 8              ; Shift it out.
        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
        shr edx, 16             ; Truncate and move to bits 0..7.
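        ; Worked example of the fixed point divide: for alpha a = 128
        ; the table entry is ceil(255*65536/128) = 130560, so a
        ; component x = 64 gives (64*130560) >> 16 = 8355840 >> 16
        ; = 127, the integer part of x*255/a = 127.5.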
        or ebp, edx             ; Merge into result pixel.

        ; Do the component from bits 8..15.
        mov edx, eax
        and edx, 255            ; Extract the next component.
        shr eax, 8              ; Shift it out.
        imul edx, ebx           ; Divide for a result in 8.16 fixed pt.
        and edx, 0x00FF0000     ; Truncate fraction.
        shr edx, 8              ; Move to bits 8..15.
        or ebp, edx             ; Merge into result pixel.

        ; Do the component from bits 16..23.
        and eax, 255            ; Mask off alpha.
        imul eax, ebx           ; Divide for a result in 8.16 fixed pt.
        and eax, 0x00FF0000     ; Truncate fraction.
        or ebp, eax             ; Merge into result pixel.
%elif ASHIFT == 0
        ; Extract alpha and look up the reciprocal.
        mov ebx, eax
        shr eax, 8              ; Shift out alpha.
        and ebx, 255            ; Mask off non-alpha.
        mov ebp, ebx            ; Initialise result pixel with the alpha.
        jz .next                ; Wholly transparent? -> skip the divides.
        mov ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.

        ; Do the component from bits 8..15.
        mov edx, eax
        shr eax, 8
        and edx, 255
        imul edx, ebx
        and edx, 0x00FF0000
        shr edx, 8
        or ebp, edx

        ; Do the component from bits 16..23.
        mov edx, eax
        shr eax, 8
        and edx, 255
        imul edx, ebx
        and edx, 0x00FF0000
        or ebp, edx

        ; Do the component from bits 24..31.
        imul eax, ebx
        and eax, 0x00FF0000
        shl eax, 8
        or ebp, eax
%endif
.next:
        ; Write the result pixel.
        mov [rdi + rcx*4], ebp

        inc rcx
        jnz .loop

        pop rdx                 ; Restore rdx for the caller.
.out:
        ret

%macro unpremultiply_pixel_blocks 1
;; Faster version that does it in blocks of four pixels at a time.
;; The macro is parameterised on the instruction used to move
;; an XMM register to memory.
;;
;; In:
;;   uint32_t *dst/rdi:   Destination pixels.
;;   uint32_t *src/rsi:   Source pixels.
;;   num_pixels/rdx:      # pixels to unpremultiply.  Only
;;                        floor(num_pixels/4) of them will be done.
;;
;;   %1: Instruction used to write an xmm reg to dst.
;;
;; Out:
;;   rcx:  num_pixels mod 4 = # leftover pixels.
;;   rdi:  rdi + 16*floor(num_pixels/4); advanced past dst.
;;   rsi:  rsi + 16*floor(num_pixels/4); advanced past src.
;;
;; Scratched: xmm1-xmm4, rax-rdx, rbp

        ; Advance the src and dst pointers to the end.  The bias
        ; of +-15 is used to have the loop condition trigger an exit
        ; just before we access the last incomplete block.
        shl rdx, 2              ; Size in bytes.
        lea rsi, [rsi + rdx - 15]
        lea rdi, [rdi + rdx - 15]
        neg rdx
        add rdx, 15             ; Offset to the last byte of the
                                ; first block from the end.

        jmp %%test_cc

        align 16
%%loop:
        ; Load four pixels into xmm1.  The prefetchnta here
        ; hides the difference between movdqa vs. movdqu for
        ; aligned input.
        prefetchnta [rsi + rdx + 64*8]  ; TODO: check the prefetch dist?
        movdqu xmm1, [rsi + rdx]

        ; Expand the 8 bit components into 16 bit ones in
        ; two registers.
        movdqa xmm2, xmm1
        punpckhbw xmm2, xmm2
        punpcklbw xmm1, xmm1

        ; Load alphas into registers.
        movzx eax, byte [rsi + rdx + ASHIFT/8 + 0]
        movzx ebx, byte [rsi + rdx + ASHIFT/8 + 4]
        movzx ecx, byte [rsi + rdx + ASHIFT/8 + 8]
        movzx ebp, byte [rsi + rdx + ASHIFT/8 + 12]

        ; Fetch multiplier vectors for each pixel based on the alphas
        ; into the xmm3/xmm4 registers.  (movzx has already
        ; zero-extended the alphas into the full 64 bit registers.)
        movq xmm3, [reciprocal_table_Q + 8*rax]
        movq xmm4, [reciprocal_table_Q + 8*rcx]
        movhpd xmm3, [reciprocal_table_Q + 8*rbx]
        movhpd xmm4, [reciprocal_table_Q + 8*rbp]

        ; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
        ; Treating the components as 0.16 bit fixed point, the pmulhuw
        ; leaves the integer part of x*255/a in the result for the colour
        ; components x in (r,g,b) but leaves the alphas alone.
        pmulhuw xmm1, xmm3
        pmulhuw xmm2, xmm4
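        ; Worked example of the lanes: punpcklbw doubles a component
        ; byte x into the word x*0x101, and the colour lanes of the
        ; multiplier hold 255*256/a in 8.8 fixed point.  For a = 128,
        ; x = 64: (64*0x101 * 510) >> 16 = 8388480 >> 16 = 128, within
        ; one of the exact 64*255/128 = 127.5.  The alpha lane is
        ; multiplied by 256 (1.0 in 8.8), and (a*0x101 * 256) >> 16
        ; = (a*257) >> 8 = a for a < 256, so alpha passes through
        ; unchanged.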
        ; Pack the four resulting pixels from 16 to 8 bit components.
        ; Here we saturate the result in case the input was superluminant.
        packuswb xmm1, xmm2

        ; Write the result.
        %1 [rdi + rdx], xmm1

        ; Increment to the next pixel.  When this add overflows to >= 0
        ; then the next read of a block would venture past the end of
        ; the buffer.
        add rdx, 16
%%test_cc:
        jnc %%loop

        ; Offset the pointers back to the last incomplete block.
        lea rsi, [rsi + rdx]
        lea rdi, [rdi + rdx]

        ; Compute the # leftover pixels.
        lea rcx, [rdx - 15]
        neg rcx
        and rcx, 15             ; # bytes leftover.
        shr rcx, 2              ; # pixels leftover.
%endmacro

        global unpremultiply_with_sse2
unpremultiply_with_sse2:
;;
;; void unpremultiply_with_sse2(
;;         uint32_t *dst/rdi,
;;         uint32_t const *src/rsi,
;;         ulong n/rdx);
;;
;; This is the main entry point callable from the outside.
;; The calling convention used here is the ELF64 one.
;;
        ; Save callee-saved registers.
        push rbp
        push rbx

        ; Save start of dst for alignment tests later.
        mov rcx, rdi

        ; If we don't have enough pixels for at least a few iterations
        ; of blocked unpremultiplication then do the pixels one at a time.
        cmp rdx, 3+4*4+3        ; Max. pre/post align + 4 blocks.
        jae .do_blocked
        mov rcx, rdx            ; Pixel count.
        call unpremultiply_single_pixels
        jmp .out

.do_blocked:
        ; If the destination pointer isn't even aligned to uint32_t
        ; then we can't align it to 0 mod 16 using single pixels.
        test rcx, 3
        jz .can_align_dst

        unpremultiply_pixel_blocks movdqu
        jmp .do_leftovers

.can_align_dst:
        ; Align the destination pointer to 0 mod 16 by
        ; doing 0..3 single pixels.
        neg rcx
        and rcx, 15             ; # bytes to align to 16.
        shr rcx, 2              ; # pixels to align to 16.
        sub rdx, rcx
        call unpremultiply_single_pixels

        ; If the source and dest are exactly aliased then use movdqa
        ; writes, otherwise non-temporal movntdq.  (The small-image
        ; movdqa heuristic below is currently disabled.)
        cmp rdi, rsi            ; Use movdqa for aliased src, dst.
        jz .1
;       cmp rdx, 128            ; ... or if the src and dest are small.
;       jc .1

        unpremultiply_pixel_blocks movntdq
        jmp .do_leftovers
.1:
        unpremultiply_pixel_blocks movdqa

.do_leftovers:
        call unpremultiply_single_pixels
.out:
        pop rbx
        pop rbp
        ret
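
;; Example caller, as a sketch in C.  The prototype is the one given
;; at the top of this file; the wrapper name is illustrative only:
;;
;;   #include <stdint.h>
;;
;;   void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src,
;;                                unsigned long num_pixels);
;;
;;   static void unpremultiply_image(uint32_t *pixels, unsigned long n)
;;   {
;;       /* dst == src is fine: the entry point detects exact aliasing
;;        * and switches from movntdq to movdqa stores. */
;;       unpremultiply_with_sse2(pixels, pixels, n);
;;   }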